diff --git a/VERSION b/VERSION index 3c43790f..fd9d1a5a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.6 +1.2.14 diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index fe42d453..23d3f746 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -199,3 +199,7 @@ DEF_OP(CROP_AND_RESIZE) DEF_OP(TAN) DEF_OP(RMSNORM) DEF_OP(SHAPE) +DEF_OP(BITCAST) +DEF_OP(GROUPED_CONV3D) +DEF_OP(COL2IM) +DEF_OP(L1_LAYER_NORM) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h new file mode 100644 index 00000000..9592e6a0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_BITCAST_H +#define _VSI_NN_OP_BITCAST_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_bitcast_param +{ + struct _bitcast_local_data_t* local; +} vsi_nn_bitcast_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h b/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h new file mode 100644 index 00000000..0cbadb72 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_COL2IM_H +#define _VSI_NN_OP_COL2IM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_col2im_param +{ + const int32_t* image_shape; + const int32_t* block_shape; + int32_t strides[3]; + int32_t pads[6]; + int32_t dilations[3]; + int32_t dim_num; +} vsi_nn_col2im_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h new file mode 100644 index 00000000..87de1e79 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GROUPED_CONV3D_H +#define _VSI_NN_OP_GROUPED_CONV3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_grouped_conv3d_param +{ + void* local; + uint32_t ksize[3]; + uint32_t stride[3]; + /* Pad left, right, top, bottom, front, rear */ + uint32_t pad[6]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation[3]; + int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; +} vsi_nn_grouped_conv3d_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h new file mode 100644 index 00000000..80de07e7 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_L1_LAYER_NORM_H +#define _VSI_NN_OP_L1_LAYER_NORM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_l1_layer_norm_param +{ + struct _l1_layer_norm_local_data_t * local; + float eps; + int32_t axis; +} vsi_nn_l1_layer_norm_param; + + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 007983c6..010b52c7 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -349,7 +349,7 @@ vsi_bool vsi_nn_IsEVISFeatureAvaiable vsi_nn_context_t context ); -int32_t vsi_nn_compareVersion +OVXLIB_API int32_t vsi_nn_compareVersion ( vsi_nn_graph_t * graph, uint32_t version_major, diff --git a/src/tim/vx/internal/include/vsi_nn/vsi_nn.h b/src/tim/vx/internal/include/vsi_nn/vsi_nn.h new file mode 100644 index 00000000..115a2e81 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn/vsi_nn.h @@ -0,0 +1,2034 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** + * @file vsi_nn.h + */ +#ifndef _VSI_NN_INTERFACE_H +#define _VSI_NN_INTERFACE_H + +#if defined(_MSC_VER) +#define EXPORT __declspec(dllexport) +#elif defined(__linux__) +#define EXPORT __attribute__((visibility("default"))) +#else +#define EXPORT +#endif + +#if !defined(_IN) +#define _IN +#endif +#if !defined(_OUT) +#define _OUT +#endif +#if !defined(_INOUT) +#define _INOUT +#endif +#if !defined(_OPTIONAL) +#define _OPTIONAL +#endif + +#include +#include + +#if defined(__cplusplus) +#define __BEGIN_DECLS extern "C" { +#define __END_DECLS } +#else +#define __BEGIN_DECLS +#define __END_DECLS +#endif + +__BEGIN_DECLS + + +#ifndef TRUE +#define TRUE (1) +#endif +#ifndef FALSE +#define FALSE (0) +#endif + + +/** + * Return codes. + */ +typedef enum +{ + /** + * Operation was succesful. + */ + VSI_NN_ERROR_OK = 0, + + /** + * Failure caused by vsi_nn api fail. + */ + VSI_NN_ERROR_API_FAIL = 1, + + /** + * Failure caused by not enough available memory. + */ + VSI_NN_ERROR_OUT_OF_MEMORY = 2, + + /** + * Failure caused by unexpected null argument. 
+ */ + VSI_NN_ERROR_UNEXPECTED_NULL = 3, + + /** + * Failure caused by invalid function arguments, invalid model definition, + * invalid execution definition or invalid data at execution time. + */ + VSI_NN_ERROR_VALUED_ERROR = 4, + + /** + * Failure caused by operations that need completed graph. + */ + VSI_NN_ERROR_UNCOMPLETE_GRAPH = 5, + + /** + * Failure caused by insearting a keyword argument repeatly. + */ + VSI_NN_ERROR_KWARGS_REPEAT = 6, +} VSI_NN_error_e; + +/** + * Implicit padding algorithms. + */ +typedef enum +{ + /** + * Pad with const value which are specific by others parameters. + */ + VSI_NN_IMPLICIT_PADDING_NONE = 0, + + /** + * Implicit(VALID) padding. + * No padding. + */ + VSI_NN_IMPLICIT_PADDING_VALID = 1, + + /** + * Implicit(SAME) padding. + * Padding on both ends are the "same". + */ + VSI_NN_IMPLICIT_PADDING_SAME = 2, +} VSI_NN_implicit_padding_e; + +/** + * Padding mode. + */ +typedef enum +{ + /** + * Pad with const value which are specific by others parameters, default 0. + */ + VSI_NN_PADDING_MODE_CONSTANT = 0, + + /** + * Reflect padding mode + */ + VSI_NN_PADDING_MODE_REFLECT = 1, + + /** + * Symmetric padding mode + */ + VSI_NN_PADDING_MODE_SYMMETRIC = 2, + + /** + * Replicate padding mode + */ + VSI_NN_PADDING_MODE_REPLICATE = 3, +} VSI_NN_padding_mode_e; + +/** + * Rounding methods + */ +typedef enum +{ + /** + * Floor rounding + */ + VSI_NN_ROUNDING_FLOOR = 0, + /** + * Ceiling rounding + */ + VSI_NN_ROUNDING_CEIL = 1, +} VSI_NN_rounding_e; + +/** + * LSH Projection supported types. + */ +typedef enum +{ + /** + * Computed bit vector is considered to be sparse. + */ + VSI_NN_LSH_PROJECTION_SPARSE = 1, + /** + * Computed bit vector is considered to be dense. + */ + VSI_NN_LSH_PROJECTION_DENSE = 2, +} VSI_NN_lsh_projection_type_e; + +/** + * Supported activation function types. + */ +typedef enum +{ + /** No activation */ + VSI_NN_ACTIVATION_NONE = 0, + /** ReLU activation */ + VSI_NN_ACTIVATION_RELU = 1, + /** ReLU1 activation */ + VSI_NN_ACTIVATION_RELU1 = 2, + /** ReLU6 activation */ + VSI_NN_ACTIVATION_RELU6 = 3, + /** TanH activation */ + VSI_NN_ACTIVATION_TANH = 4, + /** Sigmoid activation */ + VSI_NN_ACTIVATION_SIGMOID = 5, +} VSI_NN_activation_e; + +/** + * Tensor types. + * + * The type of tensors that can be added to a graph. + */ +typedef enum +{ + /** A tensor of IEEE 754 16 bit floating point values */ + VSI_NN_TENSOR_FLOAT16 = 0, + /** A tensor of 32 bit floating point values */ + VSI_NN_TENSOR_FLOAT32 = 1, + /** A tensor of 64 bit floating point values */ + VSI_NN_TENSOR_FLOAT64 = 2, + /** + * A tensor of 8 bit boolean values. + * + * Values of this operand type are either true or false. A zero value + * represents false; any other value represents true. 
+ */ + VSI_NN_TENSOR_BOOL8 = 3, + /** A tensor of 8 bit integer values */ + VSI_NN_TENSOR_INT8 = 4, + /** A tensor of 16 bit integer values */ + VSI_NN_TENSOR_INT16 = 5, + /** A tensor of 32 bit integer values */ + VSI_NN_TENSOR_INT32 = 6, + /** A tensor of 64 bit integer values */ + VSI_NN_TENSOR_INT64 = 7, + /** A tensor of 8 bit unsigned integer values */ + VSI_NN_TENSOR_UINT8 = 8, + /** A tensor of 16 bit unsigned integer values */ + VSI_NN_TENSOR_UINT16 = 9, + /** A tensor of 32 bit unsigned integer values */ + VSI_NN_TENSOR_UINT32 = 10, + /** A tensor of 64 bit unsigned integer values */ + VSI_NN_TENSOR_UINT64 = 11, + /** A tensor of 16 bit truncate floating point values */ + VSI_NN_TENSOR_BFLOAT16 = 12, +} VSI_NN_tensor_type_e; + +typedef enum { + /** Not a quantized tensor */ + VSI_NN_TENSOR_QUANT_NONE = 0, + /** + * A tensor of 8 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 8 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT8_DFP = 1, + /** + * A tensor of 16 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 16 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT16_DFP = 2, + /** + * A tensor of 32 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 16 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT32_DFP = 3, + /** + * A tensor of 64 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 16 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT64_DFP = 4, + /** + * A tensor of 8 bit signed integer values that represent real numbers + * + * Attached to this tensor is a numbers that can be used to convert + * the 8 bit integer to the real value. + * + * scale: a 32 bit floating point value greater than zero. + * + * The formula is: + * real_value = integer_value * scale. + */ + VSI_NN_TENSOR_QUANT8_SYMM = 5, + /** + * A tensor of 32 bit signed integer values that represent real numbers + * + * Attached to this tensor is a numbers that can be used to convert + * the 8 bit integer to the real value. + * + * scale: a 32 bit floating point value greater than zero. + * + * The formula is: + * real_value = integer_value * scale. + */ + VSI_NN_TENSOR_QUANT32_SYMM = 6, + /** + * A tensor of 8 bit unsigned integer values that represent real numbers + * + * Attached to this tensor are two numbers that can be used to convert + * the 8 bit integer to the real value. + * + * scale: a 32 bit floating point value greater than zero. + * zero_point: a 32 bit signed integer, in range [0, 255]. + * + * The formula is: + * real_value = (integer_value - zero_point) * scale. 
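+ * For example (illustrative numbers, not part of the original comment): with scale = 0.5 and zero_point = 128, a stored 8 bit value of 130 decodes to (130 - 128) * 0.5 = 1.0.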
+ */ + VSI_NN_TENSOR_QUANT8_ASYMM = 7, + /** + * A tensor of 8 bit signed integers that represent real numbers. + * + * Attached to this tensor are two numbers that can be used to convert + * the 8 bit integer to the real value. + * + * channel_dim: a 32 bit unsigned integer indicating channel dimension. + * scales: an array of positive 32 bit floating point values. + * The size of the scales array must be equal to shape[channel_dim]. + * + * The formula is: + * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] + * where C is an index in the Channel dimension. + */ + VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM = 8, + /** + * A tensor of 32 bit signed integers that represent real numbers. + * + * Attached to this tensor are two numbers that can be used to convert + * the 8 bit integer to the real value. + * + * channel_dim: a 32 bit unsigned integer indicating channel dimension. + * scales: an array of positive 32 bit floating point values. + * The size of the scales array must be equal to shape[channel_dim]. + * + * The formula is: + * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] + * where C is an index in the Channel dimension. + */ + VSI_NN_TENSOR_QUANT32_PERCHANNEL_SYMM = 9, +} VSI_NN_tensor_quant_type_e; + +/** Parameters for VSI_NN_TENSOR_QUANT8_ASYMM */ +typedef struct +{ + float scale; + int32_t zero_point; +} VSI_NN_quant_param_asymm; + +/** Parameters for VSI_NN_TENSOR_QUANT8_SYMM */ +typedef struct +{ + float scale; +} VSI_NN_quant_param_symm; + +/** Parameters for VSI_NN_TENSOR_QUANT8_DFP */ +typedef struct +{ + int32_t fraction_length; +} VSI_NN_quant_param_dfp; + +/** Parameters for VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM */ +typedef struct +{ + /** The index of the channel dimension. */ + int32_t channel_dim; + + /** + * The array of scaling values for each channel. + * Each value must be greater than zero. + */ + const float* scales; + + /** + * The size of the scale array. + * Should be equal to shape[channel_dim] of the tensor. + * */ + int32_t scale_count; +} VSI_NN_quant_param_perchannel_symm; + +/** Parameters for quantization */ +typedef struct +{ + /** Tensor quantize type */ + VSI_NN_tensor_quant_type_e type; + union + { + /** Dynamic fixed point quantization */ + VSI_NN_quant_param_dfp dfp; + /** Asymmetric affine quantization */ + VSI_NN_quant_param_asymm asymm; + /** Symmetric affine quantization */ + VSI_NN_quant_param_symm symm; + /** Perchannel symmetric affine quantization */ + VSI_NN_quant_param_perchannel_symm perchannel_symm; + } param; +} VSI_NN_tensor_quant_param; + +/** + * NN Runtime context + */ +typedef struct _vsi_nn_context_t VSI_NN_context; + +/** + * VSI_NN_graph is an opaque type that contains a description of the network operations. + * + * Create graph by calling VSI_NN_graph_create. + * A graph is completed by calling VSI_NN_graph_verify. + * A graph is destroyed by calling VSI_NN_graph_release. + * + */ +typedef struct _vsi_nn_graph VSI_NN_graph; + +/** + * VSI_NN_tensor is an opaque type that can be used to describe a tensor. + * + * Create tensor by calling VSI_NN_tensor_create. + * + */ +typedef struct _vsi_nn_tensor VSI_NN_tensor; + +/** + * Create context + * + * @return Context handle on success or NULL otherwise. + */ +EXPORT VSI_NN_context* VSI_NN_context_create(); + +/** + * Release context + * + * @param[in] ctx_ptr The pointer to context to release, and reset point to null. + */ +EXPORT void VSI_NN_context_release + ( + _IN VSI_NN_context** ctx_ptr + ); + +/** + * Create graph + * Create a net graph. 
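+ * A graph created here is typically released with VSI_NN_graph_release when it is no longer needed.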
+ * + * @param[in] ctx The context used to create graph. + * @return The graph on success, or NULL otherwise. + */ +EXPORT VSI_NN_graph* VSI_NN_graph_create + ( + VSI_NN_context* ctx + ); + +/** + * Release graph + * Release a graph and free its resource. + * + * @param[in] graph_ptr The graph to be release. + */ +EXPORT void VSI_NN_graph_release + ( + _IN VSI_NN_graph** graph_ptr + ); + +/** + * Identify graph inputs and outputs + * Identify the input and output tensors of a graph. User should call this to + * specific the inputs and outputs, they are used to exchange data between application + * level and VSI_NN level. + * + * @param[in] graph The graph to be identify. + * @param[in] input_tensors Input tensors. + * @param[in] input_tensors_num Number of input tensors. + * @param[in] output_tensors Output tensors. + * @param[in] output_tensors_num Number of output tensors. + * @return VSI_NN_ERROR_OK on success + */ +EXPORT VSI_NN_error_e VSI_NN_graph_identify_input_output + ( + _IN VSI_NN_graph* graph, + _IN const VSI_NN_tensor** input_tensors, + _IN const int32_t input_tensors_num, + _IN const VSI_NN_tensor** output_tensors, + _IN const int32_t output_tensors_num + ); + +/** + * To freeze a graph with verifying and compiling. + * + * This function may take a long time to compile the graph, and it must only be called + * once for a given graph. + * + * A frozen graph cannot be modified. + * + * @param[in] graph The graph to be finished. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_graph_verify + ( + _IN VSI_NN_graph* graph + ); + +/** + * Compute a frozen graph. + * + * @param[in] graph The graph to be executed. + * + * @return VSI_NN_ERROR_OK on success. VSI_NN_ERROR_UNCOMPLETE_GRAPH if + * the graph is not finished. + */ +EXPORT VSI_NN_error_e VSI_NN_graph_compute + ( + _IN const VSI_NN_graph* graph + ); + +//EXPORT VSI_NN_error_e VSI_NN_GRPAH_profile(_IN const VSI_NN_graph* graph); + +/** + * Add a tensor to a graph. + * + * @param[in] graph The graph to be added. + * @param[in] dtype The data type. + * @param[in] shape The shape for the tensor. + * @param[in] ndim The rank for the tensor. + * @param[in] memory The memory address to the data, the memory address + * must be 64-byte align. If it's set to null, vsi_nn can + * optimize the memory allocation and this is default behavior. + * @param[in] memory_size The size of memory. + * @param[in] quant_param The quantization parameters for the tensor, set + * null if it's not quantized tensor. + * + * @return Tensor handle on success, or NULL if get failure. + */ +EXPORT VSI_NN_tensor* VSI_NN_tensor_create + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor_type_e dtype, + _IN const int32_t* shape, + _IN int32_t ndim, + _IN const VSI_NN_tensor_quant_param* quant_param, + _IN void* memory, + _IN size_t memory_size, + _IN int32_t is_constant + ); + +/** + * Add a virtual tensor to a graph. + * + * @param[in] graph The graph to be added. + * @param[in] dtype The data type. + * + * @return Tensor handle on success, or NULL if get failure. + */ +EXPORT VSI_NN_tensor* VSI_NN_tensor_create_virtual + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor_type_e dtype, + _IN const VSI_NN_tensor_quant_param* quant_param + ); + +/** + * Get element size of a tensor. + * + * @param[in] tensor Tensor to query element size. + * + * @return Element size of the tensor. + */ +EXPORT int32_t VSI_NN_tensor_get_size + ( + _IN const VSI_NN_tensor* tensor + ); + +/** + * Get bytes of a tensor. 
+ * + * @param[in] tensor Tensor to query element size. + * + * @return Bytes of the tensor. + */ +EXPORT int32_t VSI_NN_tensor_get_bytes + ( + _IN const VSI_NN_tensor* tensor + ); + +/** + * Read tensor data. + * + * @param[in] tensor Tensor to read. + * @param[in] memory Memory to fill the data. + * @param[in] memory_size Element size of the read data, + * must be equal to tensor size. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_read + ( + _IN VSI_NN_tensor* tensor, + _IN void* memory, + _IN size_t memory_size + ); + +/** + * Write data to tensor. + * + * @param[in] tensor Tensor to write. + * @param[in] memory Memory with the data. + * @param[in] memory_size Element size of the write data, + * must be equal to tensor size. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_write + ( + _IN VSI_NN_tensor* tensor, + _IN void* memory, + _IN size_t memory_size + ); + +/** + * Swap tensors' memories. + * + * @param[in] tensor1 Tensor to swap the memory. + * @param[in] tensor2 Tensor to swap the memory. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_swap + ( + _IN VSI_NN_tensor* tensor1, + _IN VSI_NN_tensor* tensor2 + ); + +/** + * Swap tensor memories. + * User can use this api to get tensor's original memory. + * + * @param[in] tensor Tensor to swap the memory. + * @param[in] new_memory The new memory for the tensor, + * if NULL, there is no memory swapped. + * @param[in] old_memory Pointer for the tensor's original memory. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_swap_memory + ( + _IN VSI_NN_tensor* tensor, + _IN _OPTIONAL void* new_memory, + _INOUT void** old_memory + ); + +/** + * Flush tensor memory + * Once a tensor's memory is dirty, user should call this api to sync NPU memory. + * + * @param[in] tensor Tensor to flush memory + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_flush_memory + ( + _IN const VSI_NN_tensor* tensor + ); + +/** Convolutional */ +/** + * Convolution 1D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 3D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] stride Convolution stride. + * @param[in] dilation Convolution dilation rate. + * @param[in] pad_front Padding front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_end Padding end value. + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_conv_1d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t stride, + _IN int32_t dilation, + _IN int32_t pad_front, _IN int32_t pad_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +/** + * Convolution 2D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 4D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] stride_h Convolution stride height. + * @param[in] stride_w Convolution stride width. 
+ * @param[in] dilation_h Convolution height dilation rate. + * @param[in] dilation_w Convolution width dilation rate. + * @param[in] pad_h_front Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_h_end Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_front Padding width front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_end Padding widht front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +/** + * Depthwise Convolution 2D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 4D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] multiplier Depthwise convolution multiplier. + * @param[in] stride_h Convolution stride height. + * @param[in] stride_w Convolution stride width. + * @param[in] dilation_h Convolution height dilation rate. + * @param[in] dilation_w Convolution width dilation rate. + * @param[in] pad_h_front Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_h_end Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_front Padding width front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_end Padding widht front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_depthwise_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t multiplier, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +/** + * Grouped Convolution 2D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 4D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] group_number Group number for the convolution. + * @param[in] stride_h Convolution stride height. + * @param[in] stride_w Convolution stride width. + * @param[in] dilation_h Convolution height dilation rate. + * @param[in] dilation_w Convolution width dilation rate. 
+ * @param[in] pad_h_front Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_h_end Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_front Padding width front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_end Padding widht front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_grouped_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t group_number, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_transposed_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN int32_t output_pad_h, _IN int32_t output_pad_w + ); + +/** Pooling */ +EXPORT VSI_NN_error_e VSI_NN_node_average_pool_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t ksize_h, _IN int32_t ksize_w, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding, + _IN VSI_NN_rounding_e size_rounding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_max_pool_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t ksize_h, _IN int32_t ksize_w, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding, + _IN VSI_NN_rounding_e size_rounding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_l2_pool_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t ksize_h, _IN int32_t ksize_w, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding, + _IN VSI_NN_rounding_e size_rounding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_unpool_2d(); + +/** Normalization */ +EXPORT VSI_NN_error_e VSI_NN_node_batch_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* mean, + _IN VSI_NN_tensor* variance, + _IN VSI_NN_tensor* offset, + _IN VSI_NN_tensor* scale, + _IN VSI_NN_tensor* output, + _IN float variance_epsilon + ); + +/** + * L2 Normalization node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axis Normalize axis. 
+ * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_l2_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_local_response_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t depth_radius, + _IN float bias, + _IN float alpha, + _IN float beta, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_instance_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* offset, + _IN VSI_NN_tensor* scale, + _IN VSI_NN_tensor* output, + _IN float variance_epsilon + ); + +/** Math */ +/** + * Add node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_add + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Multiply node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_mul + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Divide node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_div + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Subtract node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_sub + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Floor node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_floor + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Square node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_square + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Sqrt node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_sqrt + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Rsqrt node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. 
+ * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_rsqrt + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Matmul node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * @param[in] transpose_input1 Whether to do transpose on input1. + * @param[in] transpose_input2 Whether to do transpose on input2. + * @param[in] transpose_output Whether to do transpose on output. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_matmul + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output, + _IN int transpose_input1, + _IN int transpose_input2, + _IN int transpose_output + ); + +/** + * Abs node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_abs + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Pow node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_pow + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Maximum node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_maximum + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Minimum node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_minimum + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Exp node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_exp + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Reverse node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reverse. + * @param[in] axes_size Number of axis to reverse. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reverse + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size + ); + +/** + * Transpose node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] perm Transpose order. + * + * @return VSI_NN_ERROR_OK on success. 
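+ * For example (illustrative), perm = {1, 0, 2} on a 3-D input swaps the first two dimensions; perm is expected to hold one entry per input dimension.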
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_transpose + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* perm + ); + +EXPORT VSI_NN_error_e VSI_NN_node_gather + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* indices, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +/** + * Neg node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_neg + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Reduce max node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_max + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Reduce min node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_min + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Reduce sum node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_sum + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Reduce mean node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_mean + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Sin node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. 
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_sin + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_tile + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* multiples, + _IN int32_t multiples_size + ); + +EXPORT VSI_NN_error_e VSI_NN_node_topk + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_indices, + _IN int32_t k + ); + +/** Logical */ +/** + * Equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Greater node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_greater + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Greater equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_greater_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Less node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_less + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Less equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_less_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Logical and node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_logical_and + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Logical or node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_logical_or + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Logical not node. + * + * @param[in] graph Graph to create the node. 
+ * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_logical_not + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Not equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_not_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Select node. + * If conditon is true, then output input1 tensor, + * else output input2 tensor. + * + * @param[in] graph Graph to create the node. + * @param[in] condition Conditon tensor.. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_select + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* condition, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** Activation */ +/** + * relu node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_relu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * ReLU1 node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_relu1 + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * ReLU6 node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_relu6 + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_tanh + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN float scale_a, + _IN float scale_b + ); + +/** + * Sigmoid node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_sigmoid + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Hard sigmoid node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_hard_sigmoid + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Mish node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. 
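+ * (For reference, Mish is commonly defined as x * tanh(softplus(x)).)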
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_mish + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_leaky_relu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN float ratio + ); + +EXPORT VSI_NN_error_e VSI_NN_node_prelu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* alpha, + _IN VSI_NN_tensor* output + ); + +/** + * Soft relu node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_soft_relu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Elu node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_elu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** Misc */ +EXPORT VSI_NN_error_e VSI_NN_node_pad + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN VSI_NN_padding_mode_e mode, + _IN const int32_t* pad_front, + _IN const int32_t* pad_end, + _IN int32_t pad_value + ); + +EXPORT VSI_NN_error_e VSI_NN_node_fully_connected + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_concate + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* const inputs[], + _IN int32_t input_num, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_split + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* const outputs[], + _IN int32_t output_num, + _IN const int32_t* slices, + _IN int32_t slices_size, + _IN int32_t axis + ); + +/** + * Cast node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_cast + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Quantize node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_quantize + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Dequantize node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. 
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_dequantize + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_space_to_batch + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* pad_front, + _IN const int32_t* pad_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_batch_to_space + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* crop_front, + _IN const int32_t* crop_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_space_to_depth + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* pad_front, + _IN const int32_t* pad_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_depth_to_space + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* crop_front, + _IN const int32_t* crop_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_channel_shuffle + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t group_number, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_expand_dims + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_hashtable_lookup + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* lookups, + _IN VSI_NN_tensor* keys, + _IN VSI_NN_tensor* values, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_hits + ); + +EXPORT VSI_NN_error_e VSI_NN_node_embedding_lookup + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* lookups, + _IN VSI_NN_tensor* values, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_lsh_projection + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* hash_func, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* weight, + _IN VSI_NN_tensor* output, + _IN VSI_NN_lsh_projection_type_e type + ); + +EXPORT VSI_NN_error_e VSI_NN_node_slice + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* begin, + _IN const int32_t* size + ); + +EXPORT VSI_NN_error_e VSI_NN_node_strided_slice + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* begin, + _IN const int32_t* end, + _IN const int32_t* strides, + _IN int32_t begin_mask, + _IN int32_t end_mask, + _IN int32_t shrink_axis_mask + ); + +EXPORT VSI_NN_error_e VSI_NN_node_argmax + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_argmin + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +/** Detection */ +EXPORT VSI_NN_error_e VSI_NN_node_roi_pool + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* feature_map, + _IN VSI_NN_tensor* loc, + _IN VSI_NN_tensor* batch_index, + _IN VSI_NN_tensor* output, + _IN int32_t output_h, + _IN int32_t output_w, + _IN float ratio_h, + _IN float ratio_w + ); + +EXPORT VSI_NN_error_e VSI_NN_node_roi_align + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* feature_map, + _IN VSI_NN_tensor* loc, + _IN VSI_NN_tensor* batch_index, + _IN VSI_NN_tensor* 
output, + _IN int32_t output_h, + _IN int32_t output_w, + _IN float ratio_h, + _IN float ratio_w, + _IN int32_t sample_num_h, + _IN int32_t sample_num_w + ); + +/** Image transform */ +EXPORT VSI_NN_error_e VSI_NN_node_resize_bilinear + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t scale_h, + _IN int32_t scale_w + ); + +EXPORT VSI_NN_error_e VSI_NN_node_resize_nearest + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t scale_h, + _IN int32_t scale_w + ); + +/** RNN */ +EXPORT VSI_NN_error_e VSI_NN_node_svdf + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* weights_feature, + _IN VSI_NN_tensor* weights_time, + _IN VSI_NN_tensor* bias, + _IN VSI_NN_tensor* input_state, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_state, + _IN int32_t rank + ); + +//EXPORT VSI_NN_error_e VSI_NN_node_rnn(); + +EXPORT VSI_NN_error_e VSI_NN_node_rnn_unit + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* input_state, + _IN VSI_NN_tensor* weight, _IN VSI_NN_tensor* recrrent_weight, + _IN VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_state, + _IN VSI_NN_activation_e activation + ); + +EXPORT VSI_NN_error_e VSI_NN_node_lstm_unit + ( + _IN VSI_NN_graph* graph + ); + +__END_DECLS +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 4ac9f611..b426e4bd 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -26,6 +26,7 @@ #define _VSI_NN_CONTEXT_H #include "vsi_nn_platform.h" +#include "vsi_nn_types.h" #ifdef __cplusplus extern "C" { @@ -75,12 +76,19 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_shader; int32_t enable_opcheck; int32_t enable_concat_optimize; - int32_t enable_asymi8_to_u8; + /* 0: disable convert int8 to uint8 + * 1: enable convert asymm int8 to asymm uint8 + * 2: enable convert both asymm and sym int8 to asymm uint8 + */ + int32_t enable_i8_to_u8; int32_t enable_dataconvert_optimize; int32_t enable_stream_processor; int32_t enable_rgb88_planar_nhwc; int32_t enable_slice_optimize; int32_t enable_batch_opt; + int32_t enable_save_file_type; + int32_t enable_use_image_process; + int32_t enable_use_from_handle; } vsi_nn_runtime_option_t; /** @@ -101,6 +109,10 @@ typedef struct _vsi_nn_context_t OVXLIB_API vsi_nn_context_t vsi_nn_CreateContext ( void ); +OVXLIB_API vsi_status vsi_nn_initOptions + ( + vsi_nn_runtime_option_t *options + ); /** * Release context * Release ovxlib NN runtime resource and reset context handle to NULL. 
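The context hunk above replaces enable_asymi8_to_u8 with the three-state enable_i8_to_u8 switch and adds vsi_nn_initOptions() for filling a vsi_nn_runtime_option_t with defaults. Below is a minimal sketch of how these two additions might be used together; the function and field names come from the hunk above, while the surrounding flow (and the step that hands the populated structure back to a context or graph, which this diff does not show) are assumptions for illustration only.

#include "vsi_nn_context.h"

/* Sketch under assumptions: vsi_nn_initOptions() and enable_i8_to_u8 are taken
 * from the hunk above; everything else is illustrative scaffolding. */
static vsi_status configure_runtime_options( void )
{
    vsi_nn_runtime_option_t options;
    vsi_status status = vsi_nn_initOptions( &options );

    if ( VSI_SUCCESS != status )
    {
        return status;
    }

    /* 0: disable int8 -> uint8 conversion
     * 1: convert asymmetric int8 to asymmetric uint8
     * 2: convert both asymmetric and symmetric int8 to asymmetric uint8 */
    options.enable_i8_to_u8 = 2;

    /* enable_save_file_type, enable_use_image_process and enable_use_from_handle
     * are the other new switches; they keep their initialized values here. */

    /* How the populated structure is attached to a context/graph is not part
     * of this hunk, so that step is omitted. */
    return VSI_SUCCESS;
}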
diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index 7918ae3e..b70b1dca 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -53,5 +53,9 @@ #if defined(VX_13_NN_COMPATIBLITY) #define VSI_MAP_TENSOR_PATCH_SUPPORT #endif +#if defined (VX_QUANT_PER_GROUP_SUPPORT) +#define VSI_PER_GROUP_QUANTIZATION_SUPPORT +#endif +#define VSI_GRAPH_RUNTIME_ENV_SUPPORT #endif
diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 89786c42..c074cd5d 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -814,11 +814,77 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop vsi_nn_tensor_t *max_iteration_tensor ); -OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
+/** + * Set runtime variable + * Set a runtime variable for ovxlib and the driver. + * + * @param[in] graph Graph handle + * @param[in] key Ovxlib or driver environment variable name + * Ovxlib supported keys: + * VSI_NN_ENABLE_I8TOU8 + * VSI_NN_ENABLE_OPCHECK + * VSI_SAVE_FILE_TYPE + * VSI_USE_IMAGE_PROCESS + * VSI_NN_ENABLE_CONCAT_OPTIMIZE + * VSI_NN_ENABLE_DATACONVERT_OPTIMIZE + * VSI_VX_ENABLE_STREAM_PROCESSOR + * VSI_NN_FORCE_RGB888_OUT_NHWC + * VSI_NN_ENABLE_SLICE_OPTIMIZE + * VSI_VX_ENABLE_BATCH_OPT + * VSI_USE_FROM_HANDLE + * Driver keys: + * VIV_VX_ENABLE_GRAPH_TRANSFORM + * VIV_VX_ENABLE_SHADER + * Any key other than the ovxlib keys listed above is treated as a driver environment variable. + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_SetRunTimeVariable ( vsi_nn_graph_t* graph, - const char* ctrl_str, - size_t size + const char* key, + const char* value + ); +
+/** + * Get runtime variable + * Get a runtime variable of ovxlib. + * + * @param[in] graph Graph handle + * @param[in] key Environment variable name + * Supported keys: + * VSI_NN_ENABLE_I8TOU8 + * VSI_NN_ENABLE_OPCHECK + * VSI_SAVE_FILE_TYPE + * VSI_USE_IMAGE_PROCESS + * VSI_NN_ENABLE_CONCAT_OPTIMIZE + * VSI_NN_ENABLE_DATACONVERT_OPTIMIZE + * VSI_VX_ENABLE_STREAM_PROCESSOR + * VSI_NN_FORCE_RGB888_OUT_NHWC + * VSI_NN_ENABLE_SLICE_OPTIMIZE + * VSI_VX_ENABLE_BATCH_OPT + * VSI_USE_FROM_HANDLE + * VIV_VX_ENABLE_GRAPH_TRANSFORM + * VIV_VX_ENABLE_SHADER + * Only the keys listed above are supported. + * @return The variable's value on success, or NULL otherwise. Note: on success, + * the caller must release the returned memory after using the value. + */ +OVXLIB_API char* vsi_nn_GetRunTimeVariable + ( + const vsi_nn_graph_t* graph, + const char* key + ); + +int32_t vsi_nn_GetVariable(const char* variableKey); + +OVXLIB_API char* vsi_nn_GenerateGraphJson + ( + vsi_nn_graph_t* graph + ); + +OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson + ( + char* json );
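Read together, the runtime-variable and graph-JSON declarations above suggest the usage pattern sketched below. Only the prototypes and key names are taken from this diff; the value strings, the use of free() to release the string returned by vsi_nn_GetRunTimeVariable (the comment only states that the caller must release it), the choice of vsi_nn_graph.h as the include, and the surrounding flow are assumptions.

#include <stdio.h>
#include <stdlib.h>
#include "vsi_nn_graph.h"  /* the new prototypes are declared here per the hunk above */

/* Sketch under assumptions: the graph is created elsewhere and passed in. */
static void tune_graph( vsi_nn_graph_t* graph )
{
    char* value = NULL;
    char* json = NULL;

    /* Ovxlib key: int8 -> uint8 conversion mode, mirroring enable_i8_to_u8. */
    vsi_nn_SetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8", "1" );

    /* Keys outside the ovxlib list are passed through as driver environment
     * variables, e.g. the shader switch. */
    vsi_nn_SetRunTimeVariable( graph, "VIV_VX_ENABLE_SHADER", "1" );

    value = vsi_nn_GetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8" );
    if ( value )
    {
        printf( "VSI_NN_ENABLE_I8TOU8 = %s\n", value );
        free( value );  /* assumption: the header only says the caller must release it */
    }

    /* Dump the graph as JSON and release it with the paired API. */
    json = vsi_nn_GenerateGraphJson( graph );
    if ( json )
    {
        printf( "%s\n", json );
        vsi_nn_ReleaseGraphJson( json );
    }
}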
/** diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index a18e8949..dc82aeb5 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -212,6 +212,10 @@ #include "ops/vsi_nn_op_crop_and_resize.h" #include "ops/vsi_nn_op_rmsnorm.h" #include "ops/vsi_nn_op_shape.h" +#include "ops/vsi_nn_op_bitcast.h" +#include "ops/vsi_nn_op_grouped_conv3d.h" +#include "ops/vsi_nn_op_col2im.h" +#include "ops/vsi_nn_op_l1_layer_norm.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -412,6 +416,10 @@ typedef union _vsi_nn_nn_param vsi_nn_crop_and_resize_param crop_and_resize; vsi_nn_rmsnorm_param rmsnorm; vsi_nn_shape_param shape; + vsi_nn_bitcast_param bitcast; + vsi_nn_grouped_conv3d_param grouped_conv3d; + vsi_nn_col2im_param col2im; + vsi_nn_l1_layer_norm_param l1_layer_norm; void* client_param; /* custom node data struct define */
diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index d6ed0904..90dcb224 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -86,6 +86,8 @@ typedef enum VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6, /** perchannel float8 */ VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7, + /** GPTQ */ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; @@ -126,6 +128,16 @@ typedef struct vsi_nn_dtype const int32_t * zero_points; int32_t zero_points_dim; }; +#endif +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + /** Meaningful in GPTQ_SYMMETRIC */ + struct { + const float* group_scales; + int32_t group_channel_dim; + int32_t group_size; + const int32_t* group_zero_points; + int32_t group_count; + }; #endif }; };
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 92f83491..37368a49 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 2 -#define VSI_NN_VERSION_PATCH 5 +#define VSI_NN_VERSION_PATCH 14 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index bc7d36ef..9d3ead3d 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_ARGMAX_VX_SUPPORT) + __BEGIN_DECLS @@ -289,3 +291,5 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( argmax, _setup ) + +#endif \ No newline at end of file
diff --git a/src/tim/vx/internal/src/kernel/cl/col2im_cl.c b/src/tim/vx/internal/src/kernel/cl/col2im_cl.c new file mode 100644 index 00000000..4daf9d48 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/col2im_cl.c @@ -0,0 +1,432 @@ +/****************************************************************************
+* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +#define _COL2IM_KERNEL_SOURCE_NAME "col2im" + +// Add kernel hashtable here +#define COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 | (_image_2d))) +#define COL2IM_KERNELS( IN_DTYPE, OUT_DTYPE ) \ + { COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 0), \ + CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE), \ + _COL2IM_KERNEL_SOURCE_NAME } + +#define COL2IM_KERNELS_2D( IN_DTYPE, OUT_DTYPE ) \ + { COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 1), \ + CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + _COL2IM_KERNEL_SOURCE_NAME } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _col2im_kernel_map[] = +{ + // Register kernel here + COL2IM_KERNELS( F32, F32 ), + COL2IM_KERNELS( F32, U32 ), + COL2IM_KERNELS( F32, I32 ), + COL2IM_KERNELS( U32, U32 ), + COL2IM_KERNELS( U32, F32 ), + COL2IM_KERNELS( U32, I32 ), + COL2IM_KERNELS( I32, I32 ), + COL2IM_KERNELS( I32, U32 ), + COL2IM_KERNELS( I32, F32 ), + + COL2IM_KERNELS_2D( F32, F32 ), + COL2IM_KERNELS_2D( F32, U32 ), + COL2IM_KERNELS_2D( F32, I32 ), + COL2IM_KERNELS_2D( U32, U32 ), + COL2IM_KERNELS_2D( U32, F32 ), + COL2IM_KERNELS_2D( U32, I32 ), + COL2IM_KERNELS_2D( I32, I32 ), + COL2IM_KERNELS_2D( I32, U32 ), + COL2IM_KERNELS_2D( I32, F32 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _col2im_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + 
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _COL2IM_PARAM_NUM _cnt_of_array( _col2im_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_col2im_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * in_shape = NULL; + int32_t stride_w = 1, stride_h = 1; + int32_t dilation_w = 1, dilation_h = 1, dilation_d = 1; + int32_t pad_w_front = 0, pad_w_end = 0, pad_h_front = 0, pad_h_end = 0, pad_d_front = 0, pad_d_end = 0; + int32_t kernel_w = 1, kernel_h = 1, kernel_d = 1; + int32_t move_time_x = 0; + int32_t move_time_y = 0; + int32_t width_pad = 0; + int32_t height_pad = 0; + int32_t depth_pad = 0; + int32_t kernel_x_new = 1; + int32_t kernel_y_new = 1; + int32_t kernel_z_new = 1; + int32_t batch = 1; + int32_t width = 1; + int32_t height = 1; + int32_t depth = 1; + + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_w); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_h); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &dilation_w); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &dilation_h); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation_d); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_w_front); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_w_end); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_h_front); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &pad_h_end); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_d_front); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_d_end); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &kernel_w); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &kernel_h); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[16], &kernel_d); + CHECK_STATUS_FAIL_GOTO(status, final ); + + batch = 
(int32_t)(attr[0]->shape->data[2]); + width = (int32_t)(attr[1]->shape->data[0]); + height = (int32_t)(attr[1]->shape->data[1]); + depth = (int32_t)(attr[1]->shape->data[2]) / batch; + width_pad = width + pad_w_front + pad_w_end; + height_pad = height + pad_h_front + pad_h_end; + depth_pad = depth + pad_d_front + pad_d_end; + move_time_x = (width_pad - ((kernel_w - 1) * dilation_w + 1) + stride_w) / stride_w; + move_time_y = (height_pad - ((kernel_h - 1) * dilation_h + 1) + stride_h) / stride_h; + kernel_x_new = (kernel_w - 1) * dilation_w + 1; + kernel_y_new = (kernel_h - 1) * dilation_h + 1; + kernel_z_new = (kernel_d - 1) * dilation_d + 1; + + status = vsi_nn_kernel_gpu_add_param( node, "width_pad", &width_pad ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_pad", &height_pad ); + status |= vsi_nn_kernel_gpu_add_param( node, "depth_pad", &depth_pad ); + status |= vsi_nn_kernel_gpu_add_param( node, "move_time_x", &move_time_x ); + status |= vsi_nn_kernel_gpu_add_param( node, "move_time_y", &move_time_y ); + status |= vsi_nn_kernel_gpu_add_param( node, "kernel_x_new", &kernel_x_new ); + status |= vsi_nn_kernel_gpu_add_param( node, "kernel_y_new", &kernel_y_new ); + status |= vsi_nn_kernel_gpu_add_param( node, "kernel_z_new", &kernel_z_new ); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + in_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = in_shape->data[0]; + gpu_param.global_size[1] = in_shape->data[1]; + gpu_param.global_size[2] = in_shape->data[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + return status; +} /* _col2im_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _col2im_kernel_map; + size_t kernel_map_size = _cnt_of_array( _col2im_kernel_map ); + vx_param_description_t * param_def = _col2im_kernel_param_def; + vx_kernel_initialize_f initializer = _col2im_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + else if (U8 == in_dtype) + { + in_dtype = U32; + } + else if (I8 == in_dtype || I16 == in_dtype) + { + in_dtype = I32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (U8 == out_dtype) + { + out_dtype = U32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = COL2IM_HASH_KEY( in_dtype, out_dtype ,image_2d); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _col2im_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_COL2IM_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float inputZp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float outputZp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inOutScale = inputScale / outputScale; + float inOutTile = outputZp - inOutScale * inputZp; + int32_t stride_w = vsi_nn_kernel_param_get_int32( params, "stride_w" ); + int32_t stride_h = vsi_nn_kernel_param_get_int32( params, "stride_h" ); + int32_t stride_d = vsi_nn_kernel_param_get_int32( params, "stride_d" ); + int32_t dilation_w = vsi_nn_kernel_param_get_int32( params, "dilation_w" ); + int32_t dilation_h = vsi_nn_kernel_param_get_int32( params, "dilation_h" ); + int32_t dilation_d = vsi_nn_kernel_param_get_int32( params, "dilation_d" ); + int32_t pad_w_front = vsi_nn_kernel_param_get_int32( params, "pad_w_front" ); + int32_t pad_w_end = vsi_nn_kernel_param_get_int32( params, "pad_w_end" ); + int32_t pad_h_front = vsi_nn_kernel_param_get_int32( params, "pad_h_front" ); + int32_t pad_h_end = vsi_nn_kernel_param_get_int32( params, "pad_h_end" ); + int32_t pad_d_front = vsi_nn_kernel_param_get_int32( params, "pad_d_front" ); + int32_t pad_d_end = vsi_nn_kernel_param_get_int32( params, "pad_d_end" ); + size_t dim_num = 0; + int32_t* block_shape = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "block_shape", &dim_num); + int32_t kernel_w = block_shape[0]; + int32_t kernel_h = dim_num > 1 ? block_shape[1] : 1; + int32_t kernel_d = dim_num > 2 ? block_shape[2] : 1; + + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + image_2d = dim_num > 2 ? 
FALSE : TRUE; + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1] / outputs[0]->attr.size[dim_num]; + shapes[0][2] = inputs[0]->attr.size[2] * outputs[0]->attr.size[dim_num]; + + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + if (image_2d) + { + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3]; + } + else + { + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3] * outputs[0]->attr.size[4]; + } + + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 ); + + if (rs_input == NULL || rs_output == NULL) + { + goto final; + } + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + node_params[0] = rs_input; + node_params[1] = rs_output; + node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &stride_w ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &stride_h ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride_d ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_w ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_h ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_d ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_front ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_end ); + node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_front ); + node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_end ); + node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_front ); + node_params[13] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_end ); + node_params[14] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_w ); + node_params[15] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_h ); + node_params[16] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_d ); + node_params[17] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale ); + node_params[18] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _COL2IM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + vsi_nn_kernel_scalar_release( &node_params[16] ); + vsi_nn_kernel_scalar_release( &node_params[17] ); + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( col2im, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 3a5e0d7b..50c435ba 100644 --- 
a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -46,21 +46,36 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "cumsum" #define KERNEL_SOURCE_2 "cumsum_2d" +#define KERNEL_SOURCE_3 "cumsum_array_axis0" +#define KERNEL_SOURCE_4 "cumsum_array_axis1" +#define KERNEL_SOURCE_5 "cumsum_array_axis2" +#define KERNEL_SOURCE_6 "cumsum_array_2d_axis0" +#define KERNEL_SOURCE_7 "cumsum_array_2d_axis1" // Add kernel hashtable here -#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ - ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d, is_array) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array)) #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \ CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ KERNEL_SOURCE_1 }, #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \ CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ KERNEL_SOURCE_2 }, +#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \ + CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + +#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 1), \ + CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -82,6 +97,22 @@ static const struct { HASH_CUMSUM_KERNELS_2D(1, U8, U8) HASH_CUMSUM_KERNELS_2D(1, F32, F32) HASH_CUMSUM_KERNELS_2D(1, F32, U8) + + HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3) + HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3) + HASH_CUMSUM_ARRAY_KERNELS(0, F32, U8, KERNEL_SOURCE_3) + HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_ARRAY_KERNELS(1, F32, F32, KERNEL_SOURCE_4) + HASH_CUMSUM_ARRAY_KERNELS(1, F32, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_5) + HASH_CUMSUM_ARRAY_KERNELS(2, F32, F32, KERNEL_SOURCE_5) + HASH_CUMSUM_ARRAY_KERNELS(2, F32, U8, KERNEL_SOURCE_5) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_6) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, F32, KERNEL_SOURCE_6) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, U8, KERNEL_SOURCE_6) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_7) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, F32, KERNEL_SOURCE_7) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, U8, KERNEL_SOURCE_7) }; /* @@ -197,7 +228,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, int32_t axis, - int32_t is_2d + int32_t is_2d, + int32_t is_array /* Add extra params */ ) { @@ -230,7 +262,7 @@ static vsi_status _query_kernel output_dtype = F32; } - key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d, is_array); for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) { @@ -270,6 +302,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; 
vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; @@ -291,6 +324,7 @@ static vsi_nn_kernel_node_t _setup int32_t height = 0; int32_t channel = 1; uint32_t i = 0; + int32_t is_array = 0; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -326,13 +360,16 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[0], (vsi_size_t)rs_dim ); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + for (i = 0; i < rs_dim; i++) { - return NULL; + if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE) + { + is_array = 1; + } } +#undef VSI_NN_MAX_BLOCK_SIZE - status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d ); + status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index 66943314..a3ff29e4 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_GATHER_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_TENSOR_GATHER_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index cecb25ae..d439b4d8 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index a7bcaae3..99936150 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LOGSOFTMAX_VX_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_LOGSOFTMAX_VX_SUPPORT) __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c index 8eee2c47..05cc3034 100644 --- a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c @@ -36,6 +36,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT) + __BEGIN_DECLS /* @@ -412,3 +414,4 @@ __END_DECLS REGISTER_BACKEND_CL( nearest_grid_sample, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index 06e3652b..fbce08af 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -22,7 +22,7 
@@ * *****************************************************************************/ -#if !(VX_TENSOR_POW_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_TENSOR_POW_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c index 60fbda3e..21cd7100 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vx_lib_nnext.h" - +#if (!VX_RESIZE_BILINEAR_SH_SUPPORT) __BEGIN_DECLS #define _RESIZE_BILINEAR_KERNEL_SOURCE() "resize_bilinear" @@ -319,3 +319,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_bilinear, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index 8227a365..a672e21d 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_TILE_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_TENSOR_TILE_API_SUPPORT) __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index b8cdfd08..78b9a9bb 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -34,20 +34,24 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS #define _TOPK_KERNEL_SOURCE "topk" #define STR(a) #a // Add kernel hashtable here -#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \ - ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) ) +#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26)) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \ - { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \ CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ _TOPK_KERNEL_SOURCE } +#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + "topk2" } + #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ @@ -79,6 +83,7 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, F32, 4 ), PACK_KERNEL_MAP( F32, F32, 5 ), PACK_KERNEL_MAP( F32, F32, 6 ), + PACK_KERNEL_MAP( F32, F32, 9 ), PACK_KERNEL_MAP( U32, U32, 0 ), PACK_KERNEL_MAP( U32, U32, 1 ), @@ -87,6 +92,7 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( U32, U32, 4 ), PACK_KERNEL_MAP( U32, U32, 5 ), PACK_KERNEL_MAP( U32, U32, 6 ), + PACK_KERNEL_MAP( U32, U32, 9 ), PACK_KERNEL_MAP( I32, I32, 0 ), PACK_KERNEL_MAP( I32, I32, 1 ), @@ -95,6 +101,7 @@ static const _kernel_map_type _topk_kernel_map[] = 
PACK_KERNEL_MAP( I32, I32, 4 ), PACK_KERNEL_MAP( I32, I32, 5 ), PACK_KERNEL_MAP( I32, I32, 6 ), + PACK_KERNEL_MAP( I32, I32, 9 ), PACK_KERNEL_MAP( F32, U32, 0 ), PACK_KERNEL_MAP( F32, U32, 1 ), @@ -103,6 +110,7 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, U32, 4 ), PACK_KERNEL_MAP( F32, U32, 5 ), PACK_KERNEL_MAP( F32, U32, 6 ), + PACK_KERNEL_MAP( F32, U32, 9 ), PACK_KERNEL_MAP( F32, I32, 0 ), PACK_KERNEL_MAP( F32, I32, 1 ), @@ -111,6 +119,10 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, I32, 4 ), PACK_KERNEL_MAP( F32, I32, 5 ), PACK_KERNEL_MAP( F32, I32, 6 ), + PACK_KERNEL_MAP( F32, I32, 9 ), + + PACK_MERGE_KERNEL_MAP(U32, U32), + PACK_MERGE_KERNEL_MAP(I32, I32), }; static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = @@ -254,7 +266,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t num_stages + int32_t num_stages, + vsi_bool is_bitnoic_segment ) { vsi_status status = VSI_FAILURE; @@ -272,21 +285,23 @@ static vsi_status _query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + num_stages = is_bitnoic_segment ? 0 : num_stages; + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) { case _PACK_SELECT_KEY(F32, F32): case _PACK_SELECT_KEY(F16, F16): - key = TOPK_HASH_KEY( F32, F32, num_stages ); + key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(U32, U32): case _PACK_SELECT_KEY(U16, U16): case _PACK_SELECT_KEY(U8, U8): - key = TOPK_HASH_KEY( U32, U32, num_stages ); + key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(I32, I32): case _PACK_SELECT_KEY(I16, I16): case _PACK_SELECT_KEY(I8, I8): - key = TOPK_HASH_KEY( I32, I32, num_stages ); + key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, U32): case _PACK_SELECT_KEY(F16, U32): @@ -294,7 +309,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, U16): case _PACK_SELECT_KEY(F32, U8): case _PACK_SELECT_KEY(F16, U8): - key = TOPK_HASH_KEY( F32, U32, num_stages ); + key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, I32): case _PACK_SELECT_KEY(F16, I32): @@ -302,7 +317,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, I16): case _PACK_SELECT_KEY(F32, I8): case _PACK_SELECT_KEY(F16, I8): - key = TOPK_HASH_KEY( F32, I32, num_stages ); + key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment ); break; default: break; @@ -440,7 +455,12 @@ static vsi_nn_kernel_node_t _setup int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0); vsi_bool is_odd_even_sort = FALSE; + vsi_bool is_bitnoic_segment = FALSE; size_t param_num = _TOPK_PARAM_NUM; + int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); + vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float outputScale = vsi_nn_get_tensor_scale(outputs[0]); @@ -471,9 +491,22 @@ static vsi_nn_kernel_node_t _setup rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], 
shape[0], 2 ); - if (num_stages < 7) + is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) && + type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32); + + if (is_bitnoic_segment && num_stages == 9) + { + is_bitnoic_segment = FALSE; + } + else + { + num_stages = is_bitnoic_segment ? 9 : num_stages; + max_stages = is_bitnoic_segment ? max_stages : 7; + } + + if (num_stages < max_stages || is_bitnoic_segment) { - status = _query_kernel( kernel, inputs, outputs, num_stages ); + status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); diff --git a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c index f5010111..1ebde506 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_ARGMAX_VX_SUPPORT) + __BEGIN_DECLS #define HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ @@ -510,3 +512,4 @@ __END_DECLS REGISTER_BACKEND_EVIS( argmax, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c index 4660e894..c7589ffb 100644 --- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -51,26 +51,49 @@ __BEGIN_DECLS #define KERNEL_SOURCE_5 "cumsum_ex_rev_axis0" #define KERNEL_SOURCE_6 "cumsum_ex_rev_axis1" #define KERNEL_SOURCE_7 "cumsum_ex_rev_axis2" +#define KERNEL_SOURCE_8 "cumsum_array" +#define KERNEL_SOURCE_9 "cumsum_array_2d" +#define KERNEL_SOURCE_10 "cumsum_array_bf16" +#define KERNEL_SOURCE_11 "cumsum_array_f16_u8" +#define KERNEL_SOURCE_12 "cumsum_array_ex_rev_axis0" +#define KERNEL_SOURCE_13 "cumsum_array_ex_rev_axis1" +#define KERNEL_SOURCE_14 "cumsum_array_ex_rev_axis2" +#define KERNEL_SOURCE_15 "cumsum_array_f16_u8_2d" // Add kernel hashtable here -#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \ - ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d, is_array) \ + ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array)) #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ SOURCE }, #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ SOURCE }, #define HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ SOURCE }, +#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + 
SOURCE }, + +#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + SOURCE }, + +#define HASH_CUMSUM_ARRAY_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -135,6 +158,65 @@ static const struct { HASH_CUMSUM_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_4) HASH_CUMSUM_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_4) HASH_CUMSUM_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_4) + + HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, I8, I8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, I16, I16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, F16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, I8, I8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, I16, I16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, F16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, I8, I8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, I16, I16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, F16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I16, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, U8, U8, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I8, I8, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I16, I16, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, F16, F16, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, U8, U8, KERNEL_SOURCE_13) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I8, I8, KERNEL_SOURCE_13) + 
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I16, I16, KERNEL_SOURCE_13) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, F16, KERNEL_SOURCE_13) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, U8, U8, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I8, I8, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I16, I16, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, F16, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_11) }; /* @@ -161,6 +243,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) size_t param_size ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; gpu_param_t shaderParam = { 3, // workdim @@ -188,6 +271,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) float in_out_zp_scale = 1.0f; float in_out_scale = 1.0f; + int32_t is_array = 0; + int32_t remainder = 0; + uint32_t pack_key = 0; VSI_UNREFERENCED(param_size); @@ -219,7 +305,15 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) height = (int32_t)(input_shape->data[1]); channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1); + if (width > VSI_NN_MAX_BLOCK_SIZE || + height > VSI_NN_MAX_BLOCK_SIZE || + channel > VSI_NN_MAX_BLOCK_SIZE) + { + is_array = 1; + } + +#undef VSI_NN_MAX_BLOCK_SIZE if (axis == 0) { w = 1; @@ -245,6 +339,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) { shaderParam.global_scale[0] = 16; } + remainder = w % shaderParam.global_scale[0]; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0]; @@ -253,6 +348,12 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); + if (is_array) + { + status = vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder); + status |= vsi_nn_kernel_gpu_add_param(node, "w_size", &w); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \ (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24)) @@ -767,7 +868,8 @@ static vsi_status _query_kernel const vsi_nn_kernel_param_t * params, int32_t axis, int32_t is_2d, - int32_t is_ex_rev + int32_t is_ex_rev, + int32_t is_array ) { vsi_status status = VSI_FAILURE; @@ -781,7 +883,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d); + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d, is_array); for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) { @@ -819,6 +921,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -831,7 +934,10 @@ static vsi_nn_kernel_node_t _setup int32_t is_2d = 0; uint32_t rs_dim = 2; uint32_t i = 0; + int32_t is_array = 0; int32_t is_ex_or_rev = exclusive || reverse; + vsi_nn_kernel_dtype_e input0_dtype = U8; + int32_t width = 0; 
VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -860,7 +966,30 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[0], (vsi_size_t)rs_dim ); - status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev); + width = (int32_t)shapes[0][0]; + + for (i = 0; i < rs_dim; i++) + { + if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE) + { + is_array = 1; + } + } + +#undef VSI_NN_MAX_BLOCK_SIZE + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + + if (is_array && + ((axis_new == 0 && width < 8) || + (axis_new > 0 && (((input0_dtype == U8 || input0_dtype == I8) && width < 16) || + ((input0_dtype != U8 && input0_dtype != I8) && width < 8))) + )) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index c61565c0..b005f5f8 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_GATHER_API_SUPPORT) + #include #include #include @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" - +#if !(VX_TENSOR_GATHER_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 1d829fdd..cdab7d77 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -58,14 +58,14 @@ __BEGIN_DECLS _3D } vsi_nn_kernel_coord_type_e; -#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim) \ - ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim)) +#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim, is_array) \ + ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim << 4) | (is_array)) #define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ CVIVANTE_NAMESPACE("evis.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) #define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ - { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0), \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 0), \ HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ SOURCE }, @@ -73,10 +73,26 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) #define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ - { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1), \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 0), \ HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ SOURCE }, +#define HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("evis.gather_nd_array_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_ARRAY_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 1), \ + HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + +#define 
HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("evis.gather_nd_array_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 1), \ + HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -125,6 +141,50 @@ static const struct { TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8) TENSOR_GATHER_ND_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8) TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_3) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _1D, KERNEL_SOURCE_4) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _2D, KERNEL_SOURCE_5) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _3D, KERNEL_SOURCE_6) + + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8) + }; /* @@ -148,7 +208,8 @@ static vsi_status get_gather_nd_tensor_reshape_size vsi_size_t block_size, uint32_t coordDim, int32_t* newDim, - 
uint32_t batch_dims + uint32_t batch_dims, + int32_t* arrayFlg ) { vsi_status status = VSI_FAILURE; @@ -184,12 +245,20 @@ static vsi_status get_gather_nd_tensor_reshape_size for (i = 0; i < coordDim - 1; i++) { sizes[rank++] = input_size[i + offset]; + if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } } for (i = 0; i < batch_dims; i++) { sizes[rank] *= input_size[dims_num - i - 1]; } + if (sizes[rank] >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } newDim[0] = rank + 1; } @@ -198,6 +267,10 @@ static vsi_status get_gather_nd_tensor_reshape_size for (i = coordDim-1; i > 0; i--) { sizes[i] = input_size[i + offset - 1]; + if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } } for (i = 0; i < offset; i++) { @@ -210,6 +283,10 @@ static vsi_status get_gather_nd_tensor_reshape_size newDim[0] = 2; sizes[0] = block_size; sizes[1] = elementCnt / block_size; + if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } } else if (coordDim == 4) { @@ -242,6 +319,14 @@ static vsi_status get_gather_nd_tensor_reshape_size status = VSI_SUCCESS; newDim[0] = 3; } + else + { + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + status = VSI_SUCCESS; + newDim[0] = 2; + arrayFlg[0] = 1; + } } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -409,7 +494,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, int32_t coord_dim, - int32_t batch_dims + int32_t batch_dims, + int32_t is_array ) { vsi_status status = VSI_FAILURE; @@ -444,7 +530,7 @@ static vsi_status _query_kernel coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg ); + key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg, is_array); for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { @@ -482,6 +568,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -489,26 +576,41 @@ static vsi_nn_kernel_node_t _setup int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t input_size = 1; + int32_t no_block_batch_size = 1; int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + int32_t is_array = 0; + int32_t i = 0; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); - status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); - status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); - status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); - if (status != VSI_SUCCESS) + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) { - return NULL; + input_size = input_size * (int32_t)inputs[0]->attr.size[i]; } - - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + no_block_batch_size = input_size / block_size; + is_array = no_block_batch_size > VSI_NN_MAX_BLOCK_SIZE ? 
1 : 0; + + status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], + block_size, coord_dim, &rs_in_dim, batch_dims, &is_array); + status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], + coord_dim, 0, &rs_idx_dim, batch_dims, &is_array); + status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], + block_size, 0, &rs_out_dim, batch_dims, &is_array); +#undef VSI_NN_MAX_BLOCK_SIZE + if (status != VSI_SUCCESS) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims ); + //if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + // outputs[0]->attr.dim_num ) ) + //{ + // return NULL; + //} + + status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 5ecb4b77..ad515d9a 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) __BEGIN_DECLS #define SOURCE_AXIS0_0 "layer_normalization_0" diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c index 37fddeaf..d2d3e203 100644 --- a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LOGSOFTMAX_VX_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_LOGSOFTMAX_VX_SUPPORT) __BEGIN_DECLS #define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ diff --git a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c index 6554c74a..ab42eec8 100644 --- a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c @@ -36,6 +36,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT) + __BEGIN_DECLS /* @@ -625,3 +627,4 @@ __END_DECLS REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 8492528c..767ab83d 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_POW_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_TENSOR_POW_API_SUPPORT) __BEGIN_DECLS #define KERNEL_SOURCE "pow", diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c 
b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index 167db3e9..ea840776 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -750,6 +750,7 @@ static vsi_nn_kernel_node_t _setup shape[2] = 1; reshape_tensor = vsi_nn_reshape_tensor( graph, outputs[0], shape, outputs[0]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final); if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) @@ -819,6 +820,7 @@ static vsi_nn_kernel_node_t _setup final: vsi_nn_safe_free(node_params); + vsi_safe_release_tensor(reshape_tensor); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c index 17f3bc52..f3d89392 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -911,6 +911,7 @@ static vsi_nn_kernel_node_t _setup shape[2] = 1; reshape_tensor = vsi_nn_reshape_tensor( graph, outputs[0], shape, outputs[0]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final); if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) @@ -978,6 +979,7 @@ static vsi_nn_kernel_node_t _setup final: vsi_nn_safe_free(node_params); + vsi_safe_release_tensor(reshape_tensor); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index d3d33755..a63fc3a8 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util_prv.h" - +#if (!VX_RESIZE_BILINEAR_SH_SUPPORT) __BEGIN_DECLS /* @@ -1515,3 +1515,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( resize_bilinear, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index 4fc76f92..4d3070bf 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_TILE_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_TENSOR_TILE_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 331f2629..8ff82f54 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -29,6 +29,7 @@ #include "vsi_nn_context.h" #include "vsi_nn_prv.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" #include "vsi_nn_error.h" @@ -1673,7 +1674,7 @@ vsi_status vsi_nn_KernelGpuConfig static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) { - int32_t enableShader = graph->ctx->options.enable_shader; + int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader; #if 
VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT if ( graph->ctx->config.subGroupSize == 0 ) diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index b837e663..92e94f65 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -181,6 +181,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(cos) #if (VX_LOGSOFTMAX_VX_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax) #endif +#if (VX_BITCAST_VX_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast) +#endif __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 0e0b2141..ca5ce158 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -916,11 +916,21 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node { input = in_tensor; output = tensor; + /* Create an OpenVX tensor if it does not exist */ + if (NULL == input->t) + { + vsi_nn_TensorReinit(graph, input); + } } else { input = tensor; output = in_tensor; + /* Create an OpenVX tensor if it does not exist */ + if (NULL == output->t) + { + vsi_nn_TensorReinit(graph, output); + } } vxTensorReshapeNode(graph->g, input->t, &reshape_param, sizeof(reshape_param), output->t); diff --git a/src/tim/vx/internal/src/kernel/vx/argmax_vx.c b/src/tim/vx/internal/src/kernel/vx/argmax_vx.c new file mode 100644 index 00000000..75482d28 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/argmax_vx.c @@ -0,0 +1,79 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_ARGMAX_VX_SUPPORT) + +#define REGISTER_ARGMAXOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_ARGMAXOPENVX_KERNEL( argmax ) +{ + vx_node node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxArgmaxLayer(graph->g, + inputs[0]->t, + axis, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* argmax() */ + +#undef REGISTER_ARGMAXOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c b/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c new file mode 100644 index 00000000..85a72996 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c @@ -0,0 +1,77 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_BITCAST_VX_SUPPORT) + +#define REGISTER_BITCASTOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_BITCASTOPENVX_KERNEL( bitcast ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxBitCastLayer(graph->g, + inputs[0]->t, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* bitcast() */ + +#undef REGISTER_BITCASTOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c b/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c new file mode 100644 index 00000000..fd6217b2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c @@ -0,0 +1,91 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_NEAREST_GRID_SAMPLE_VX_SUPPORT) +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vx_node node = NULL; + int32_t mode = + vsi_nn_kernel_param_get_int32(params, "mode"); + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = + vsi_nn_kernel_param_get_int32(params, "padding_mode"); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxGridSampleLayer( + graph->g, + inputs[0]->t, + inputs[1]->t, + mode, + align_corners, + pad_mode, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* _setup() */ + +#define REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL(KERNEL_NAME) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL( nearest_grid_sample ) + +#undef REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c new file mode 100644 index 00000000..25c42629 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c @@ -0,0 +1,82 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_L1_LAYER_NORM_VX_SUPPORT) +#define REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( l1_layer_norm ) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxL1LayerNormalizationLayer( + graph->g, + eps, + axis, + inputs[0]->t, + inputs[1]->t, + inputs[2]->t, + inputs[3]->t, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* l1_layer_norm() */ + +#undef REGISTER_L1_LAYER_NORM_OPENVX_KERNEL +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl b/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl new file mode 100644 index 00000000..38ac9e3c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl @@ -0,0 +1,162 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform int width_pad; +_viv_uniform int height_pad; +_viv_uniform int depth_pad; +_viv_uniform int move_time_x; +_viv_uniform int move_time_y; +_viv_uniform int kernel_x_new; +_viv_uniform int kernel_y_new; +_viv_uniform int kernel_z_new; +_viv_uniform int depth; + +#define COL2IM(name, read_type, dst_type ,convert_type, write_type) \ +__kernel void col2im_##name \ +( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride_w, \ + int stride_h, \ + int stride_d, \ + int dilation_w, \ + int dilation_h, \ + int dilation_d, \ + int pad_w_front, \ + int pad_w_end, \ + int pad_h_front, \ + int pad_h_end, \ + int pad_d_front, \ + int pad_d_end, \ + int kernel_x, \ + int kernel_y, \ + int kernel_z, \ + float inOutScale, \ + float inOutTile \ +) \ +{ \ + int x = get_global_id(0); \ + int y = get_global_id(1); \ + int z = get_global_id(2); \ + int4 coord_out = (int4)(x,y,z,0); \ + int b = z / depth; \ + z = z % depth; \ + int4 coord_in = (int4)(0,0,b,0); \ + \ + float sum = 0.0f; \ + x = x + pad_w_front; \ + y = y + pad_h_front; \ + z = z + pad_d_front; \ + int offset_x = x % stride_w; \ + int offset_y = y % stride_h; \ + int offset_z = z % stride_d; \ + int i,j,k; \ + for (k = offset_z; k < kernel_z_new; k += stride_d) \ + { \ + if ((z - k) < 0 || (z + (kernel_z_new - k)) > depth_pad || k % dilation_d != 0) \ + { \ + continue; \ + } \ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \ + { \ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \ + { \ + continue; \ + } \ + for (i = offset_x; i < kernel_x_new; i = i + 
stride_w) \ + { \ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \ + { \ + continue; \ + } \ + coord_in.x = (x - i + stride_w - 1) / stride_w + \ + (y - j + stride_h - 1) / stride_h * move_time_x + \ + (z - k + stride_d - 1) / stride_d * move_time_y * move_time_x; \ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h + k * kernel_x * kernel_y / dilation_d; \ + sum = sum + convert_float(read_type(input, coord_in).x); \ + } \ + } \ + } \ + sum = sum * inOutScale + inOutTile; \ + dst_type dst = 0; \ + dst.x = convert_type(sum); \ + write_type(output, coord_out, dst); \ +} +COL2IM(U32toU32, read_imageui, uint4, convert_uint, write_imageui) +COL2IM(U32toI32, read_imageui, int4, convert_int, write_imagei) +COL2IM(U32toF32, read_imageui, float4, convert_float, write_imagef) +COL2IM(I32toU32, read_imagei, uint4, convert_uint, write_imageui) +COL2IM(I32toI32, read_imagei, int4, convert_int, write_imagei) +COL2IM(I32toF32, read_imagei, float4, convert_float, write_imagef) +COL2IM(F32toU32, read_imagef, uint4, convert_uint, write_imageui) +COL2IM(F32toI32, read_imagef, int4, convert_int, write_imagei) +COL2IM(F32toF32, read_imagef, float4, convert_float, write_imagef) + +#define COL2IM_2D(name, read_type, dst_type ,convert_type, write_type) \ +__kernel void col2im_##name##_2D \ +( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride_w, \ + int stride_h, \ + int stride_d, \ + int dilation_w, \ + int dilation_h, \ + int dilation_d, \ + int pad_w_front, \ + int pad_w_end, \ + int pad_h_front, \ + int pad_h_end, \ + int pad_d_front, \ + int pad_d_end, \ + int kernel_x, \ + int kernel_y, \ + int kernel_z, \ + float inOutScale, \ + float inOutTile \ +) \ +{ \ + int x = get_global_id(0); \ + int y = get_global_id(1); \ + int z = get_global_id(2); \ + int4 coord_out = (int4)(x,y,z,0); \ + int4 coord_in = (int4)(0,0,z,0); \ + \ + float sum = 0.0f; \ + x = x + pad_w_front; \ + y = y + pad_h_front; \ + int offset_x = x % stride_w; \ + int offset_y = y % stride_h; \ + int i,j; \ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \ + { \ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \ + { \ + continue; \ + } \ + for (i = offset_x; i < kernel_x_new; i = i + stride_w) \ + { \ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \ + { \ + continue; \ + } \ + coord_in.x = (x - i + stride_w - 1) / stride_w + \ + (y - j + stride_h - 1) / stride_h * move_time_x; \ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h; \ + sum = sum + convert_float(read_type(input, coord_in).x); \ + } \ + } \ + sum = sum * inOutScale + inOutTile; \ + dst_type dst = 0; \ + dst.x = convert_type(sum); \ + write_type(output, coord_out, dst); \ +} +COL2IM_2D(U32toU32, read_imageui, uint4, convert_uint, write_imageui) +COL2IM_2D(U32toI32, read_imageui, int4, convert_int, write_imagei) +COL2IM_2D(U32toF32, read_imageui, float4, convert_float, write_imagef) +COL2IM_2D(I32toU32, read_imagei, uint4, convert_uint, write_imageui) +COL2IM_2D(I32toI32, read_imagei, int4, convert_int, write_imagei) +COL2IM_2D(I32toF32, read_imagei, float4, convert_float, write_imagef) +COL2IM_2D(F32toU32, read_imagef, uint4, convert_uint, write_imageui) +COL2IM_2D(F32toI32, read_imagef, int4, convert_int, write_imagei) +COL2IM_2D(F32toF32, read_imagef, float4, convert_float, write_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl 
b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl new file mode 100644 index 00000000..98938459 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl @@ -0,0 +1,332 @@ + +__kernel void cumsum_array_F32toF32_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float sum = (float)(0); + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + + for(; coord.x > 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.z--; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord.z = 0; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.z++; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +__kernel void cumsum_array_U8toU8_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint sum = (uint)(0); + uint dst = (uint)(0); + + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); + + float cnt = 0.0f; + + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global uint* in_ptr = (__global uint*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + 
output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(; coord.x > 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + coord.z--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + coord.z = 0; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + coord.z++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } +} + +__kernel void cumsum_array_F32toU8_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + uint4 dst = (uint4)(0); + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); + + float cnt = 0.0f; + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(; coord.x > 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.z--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale 
+ tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + coord.z = 0; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + coord.z++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl new file mode 100644 index 00000000..545d05e1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl @@ -0,0 +1,321 @@ + +__kernel void cumsum_array_F32toF32_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float sum = (float)(0); + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord.w = height - 1; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.w--; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + write_imagef(output, coord.zw, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global 
float*)input_ptr; + float data = in_ptr[0]; + coord.w++; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +__kernel void cumsum_array_U8toU8_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint sum = (uint)(0); + uint dst = (uint)(0); + + int tmp_zp = convert_int_rte(output_zp); + dst = convert_uint_sat(tmp_zp); + + float cnt = 0; + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global uint* in_ptr = (__global uint*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.w = height - 1; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = dst; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + coord.w--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = dst; + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + coord.w++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.y = 0; coord.y < height; 
coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } +} + +__kernel void cumsum_array_F32toU8_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float sum = (float)(0); + uint dst = (uint)(0); + int tmp_zp = convert_int_rte(output_zp); + dst = convert_uint_sat(tmp_zp); + + float cnt = 0; + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.w = height - 1; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + coord.w--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + coord.w++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = 
dst; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl new file mode 100644 index 00000000..2b5f2296 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl @@ -0,0 +1,215 @@ + +__kernel void cumsum_array_F32toF32_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float sum = (float)(0); + Tensor img1 = create_tensor_from_image2d_array(input, 4); + Tensor img2 = create_tensor_from_image2d_array(output, 4); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord_out.x = width - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.x = width - 1; coord.x > 0; coord.x--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.x--; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord_out.x = 0; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.x++; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +#define CUMSUM_ARRAY_toU8_AXIS0_SH(name, src_type) \ +__kernel void cumsum_array_##name##toU8_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint dst = (uint)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst = convert_uint_sat(tmp_zp); \ + \ + float cnt = 0; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \ + uchar* 
input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global uint* out_ptr = (__global uint*)output_ptr; \ + if(exclusive && rev) \ + { \ + coord_out.x = width - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.x = width - 1; coord.x > 0; coord.x--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.x--; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.x = 0; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.x = 0; coord.x < width - 1; coord.x++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.x++; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev) \ + { \ + for(coord.x = width - 1; coord.x >= 0; coord.x--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else \ + { \ + for(coord.x = 0; coord.x < width; coord.x++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_toU8_AXIS0_SH(U8,uint) +CUMSUM_ARRAY_toU8_AXIS0_SH(F32,float) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl new file mode 100644 index 00000000..92b9c743 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl @@ -0,0 +1,216 @@ + +__kernel void cumsum_array_F32toF32_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float sum = (float)(0); + Tensor img1 = 
create_tensor_from_image2d_array(input, 4); + Tensor img2 = create_tensor_from_image2d_array(output, 4); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord_out.y = height - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.y--; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord_out.y = 0; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.y++; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +#define CUMSUM_ARRAY_toU8_AXIS1_SH(name, src_type) \ +__kernel void cumsum_array_##name##toU8_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint dst = (uint4)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst = convert_uint_sat(tmp_zp); \ + \ + float cnt = 0; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global uint* out_ptr = (__global uint*)output_ptr; \ + if(exclusive && rev) \ + { \ + coord_out.y = height - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + \ + for(coord.y = height - 1; coord.y > 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + coord_out.y--; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float 
tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.y = 0; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.y = 0; coord.y < height - 1; coord.y++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + coord_out.y++; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else \ + { \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_toU8_AXIS1_SH(U8,uint) +CUMSUM_ARRAY_toU8_AXIS1_SH(F32,float) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl new file mode 100644 index 00000000..44940725 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl @@ -0,0 +1,215 @@ +__kernel void cumsum_array_F32toF32_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float sum = 0; + Tensor img1 = create_tensor_from_image2d_array(input, 4); + Tensor img2 = create_tensor_from_image2d_array(output, 4); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord_out.z = channel - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + + for(coord.z = channel - 1; coord.z > 0; coord.z--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.z--; + sum += data; + output_ptr = 
get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord_out.z = 0; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.z = 0; coord.z < channel - 1; coord.z++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.z++; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.z = 0; coord.z < channel; coord.z++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +#define CUMSUM_ARRAY_toU8_AXIS2_SH(name, src_type) \ +__kernel void cumsum_array_##name##toU8_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint dst = (uint)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst = convert_uint_sat(tmp_zp); \ + \ + float cnt = 0.0f; \ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global uint* out_ptr = (__global uint*)output_ptr; \ + \ + if(exclusive && rev) \ + { \ + coord_out.z = channel - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.z--; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.z = 0; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.z++; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * 
in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else \ + { \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_toU8_AXIS2_SH(U8,uint) +CUMSUM_ARRAY_toU8_AXIS2_SH(F32,float) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl index 0e6166c4..a215f1fe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -18,8 +18,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local float local_data[128]; \ - __local uint local_indices[128]; \ + __local float local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ float left = read_imagef(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -78,13 +78,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \ write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_F32(1 << 0, 0) -TOPK_F32(1 << 1, 1) -TOPK_F32(1 << 2, 2) -TOPK_F32(1 << 3, 3) -TOPK_F32(1 << 4, 4) -TOPK_F32(1 << 5, 5) -TOPK_F32(1 << 6, 6) +TOPK_F32((1 << 0), 0) +TOPK_F32((1 << 1), 1) +TOPK_F32((1 << 2), 2) +TOPK_F32((1 << 3), 3) +TOPK_F32((1 << 4), 4) +TOPK_F32((1 << 5), 5) +TOPK_F32((1 << 6), 6) +TOPK_F32((1 << 9), 9) #define TOPK_U32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \ @@ -106,8 +107,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local uint local_data[128]; \ - __local uint local_indices[128]; \ + __local uint local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ uint left = read_imageui(input, coord.xy).x; \ coord.z += 
work_group_size; \ @@ -139,7 +140,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \ uint right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -166,13 +167,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \ write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_U32(1 << 0, 0) -TOPK_U32(1 << 1, 1) -TOPK_U32(1 << 2, 2) -TOPK_U32(1 << 3, 3) -TOPK_U32(1 << 4, 4) -TOPK_U32(1 << 5, 5) -TOPK_U32(1 << 6, 6) +TOPK_U32((1 << 0), 0) +TOPK_U32((1 << 1), 1) +TOPK_U32((1 << 2), 2) +TOPK_U32((1 << 3), 3) +TOPK_U32((1 << 4), 4) +TOPK_U32((1 << 5), 5) +TOPK_U32((1 << 6), 6) +TOPK_U32((1 << 9), 9) #define TOPK_I32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \ @@ -194,8 +196,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local int local_data[128]; \ - __local int local_indices[128]; \ + __local int local_data[LOCAL_SIZE0 * 2]; \ + __local int local_indices[LOCAL_SIZE0 * 2]; \ \ int left = read_imagei(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -227,7 +229,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \ int right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -254,13 +256,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \ write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_I32(1 << 0, 0) -TOPK_I32(1 << 1, 1) -TOPK_I32(1 << 2, 2) -TOPK_I32(1 << 3, 3) -TOPK_I32(1 << 4, 4) -TOPK_I32(1 << 5, 5) -TOPK_I32(1 << 6, 6) +TOPK_I32((1 << 0), 0) +TOPK_I32((1 << 1), 1) +TOPK_I32((1 << 2), 2) +TOPK_I32((1 << 3), 3) +TOPK_I32((1 << 4), 4) +TOPK_I32((1 << 5), 5) +TOPK_I32((1 << 6), 6) +TOPK_I32((1 << 9), 9) #define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \ @@ -282,8 +285,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local float local_data[128]; \ - __local uint local_indices[128]; \ + __local float local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ float left = read_imagef(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -315,7 +318,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -342,13 +345,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void 
topk_stag write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_F32toU32(1 << 0, 0) -TOPK_F32toU32(1 << 1, 1) -TOPK_F32toU32(1 << 2, 2) -TOPK_F32toU32(1 << 3, 3) -TOPK_F32toU32(1 << 4, 4) -TOPK_F32toU32(1 << 5, 5) -TOPK_F32toU32(1 << 6, 6) +TOPK_F32toU32((1 << 0), 0) +TOPK_F32toU32((1 << 1), 1) +TOPK_F32toU32((1 << 2), 2) +TOPK_F32toU32((1 << 3), 3) +TOPK_F32toU32((1 << 4), 4) +TOPK_F32toU32((1 << 5), 5) +TOPK_F32toU32((1 << 6), 6) +TOPK_F32toU32((1 << 9), 9) #define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toI32_I32 \ @@ -370,8 +374,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local float local_data[128]; \ - __local uint local_indices[128]; \ + __local float local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ float left = read_imagef(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -403,7 +407,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -430,10 +434,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_F32toI32(1 << 0, 0) -TOPK_F32toI32(1 << 1, 1) -TOPK_F32toI32(1 << 2, 2) -TOPK_F32toI32(1 << 3, 3) -TOPK_F32toI32(1 << 4, 4) -TOPK_F32toI32(1 << 5, 5) -TOPK_F32toI32(1 << 6, 6) \ No newline at end of file +TOPK_F32toI32((1 << 0), 0) +TOPK_F32toI32((1 << 1), 1) +TOPK_F32toI32((1 << 2), 2) +TOPK_F32toI32((1 << 3), 3) +TOPK_F32toI32((1 << 4), 4) +TOPK_F32toI32((1 << 5), 5) +TOPK_F32toI32((1 << 6), 6) +TOPK_F32toI32((1 << 9), 9) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl new file mode 100644 index 00000000..0eae5ab2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl @@ -0,0 +1,368 @@ + +#define BITONIC_STEP(dtype) \ +void bitonic_step_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP(int) +BITONIC_STEP(uint) + +#define BITONIC_STEP_ASCEND(dtype) \ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \ + __local dtype *p_share_k, 
__local int *p_share_v) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = p_share_v[left_id]; \ + int right_idx = p_share_v[right_id]; \ + \ + dtype left_elem = p_share_k[left_id]; \ + dtype right_elem = p_share_k[right_id]; \ + \ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \ + { \ + p_share_k[left_id] = right_elem; \ + p_share_k[right_id] = left_elem; \ + \ + p_share_v[left_id] = right_idx; \ + p_share_v[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP_ASCEND(int) +BITONIC_STEP_ASCEND(uint) + +#define BITONIC_MERGE(dtype) \ +void bitonic_merge_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + uint stage = num_stages; \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ +} +BITONIC_MERGE(int) +BITONIC_MERGE(uint) + +#define BLOCK_SIZE (512) + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const int init_k = -2147483647; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local int local_data[1536]; + __local int local_indices[1536]; + + int left = read_imagei(input, coord.xy).x; + coord.z += threads_per_block; + int right = read_imagei(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_int(num_stages, lx, local_data, local_indices); + + int min_data = local_data[511]; + + int *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; 
coord.x = coord.x + threads_per_block * 2) + { + int2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imagei(input, coord.xy).x; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_int(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + int2 data; + data.x = read_imagei(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_int(num_stages, lx, local_data, local_indices); + } + } + + int4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imagei(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const uint init_k = 0; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local uint local_data[1536]; + __local int local_indices[1536]; + + uint left = read_imageui(input, coord.xy).x; + coord.z += threads_per_block; + uint right = read_imageui(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_uint(num_stages, lx, local_data, local_indices); + + uint min_data = local_data[511]; + + uint *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) + { + uint2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imageui(input, coord.xy).x; + data.y = read_imageui(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + 
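+            /* The tile in p_share_k/p_share_v is now sorted ascending, so its largest
+               candidate sits at p_share_k[index_minus_1]. If even that value is below
+               min_data (the current 512th-largest element, held in local_data[511]),
+               the tile cannot alter the resident top-512 and is skipped; otherwise its
+               upper 512 entries are copied down and bitonic-merged with the top-512. */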
+ if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + uint2 data; + data.x = read_imageui(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imageui(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + } + } + + uint4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imageui(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx new file mode 100644 index 00000000..6bce2234 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx @@ -0,0 +1,344 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_array_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.z = 0; coord.z < channel; coord.z++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global 
vxc_short8*)output_ptr; + src = in_ptr[0]; + + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_8BITS_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_8BITS_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_array_I16toI16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.z = 0; coord.z < channel; coord.z++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } +} + +__kernel void cumsum_array_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.y = 0; coord.y < height; coord.y++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_8BITS_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, 2); \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); 
\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_8BITS_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_array_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.y = 0; coord.y < height; coord.y++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } +} + +__kernel void cumsum_array_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + + for(; coord.x < width; coord.x += 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + + } +} + +#define CUMSUM_ARRAY_QINT_AXIS0(in_name, out_name, src_type, dst_type) \ +__kernel void 
cumsum_array_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ +} + +CUMSUM_ARRAY_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_ARRAY_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16) +CUMSUM_ARRAY_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx new file mode 100644 index 00000000..83c11645 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx @@ -0,0 +1,259 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_array_F16toF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, 
sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + + Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(; coord.y < height; coord.y++) + { + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_8BITS_ARRAY_AXIS1_2D(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0); \ + int4 sum1 = (int4)(0); \ + int4 sum2 = (int4)(0); \ + int4 sum3 = (int4)(0); \ + \ + Image img1 = create_image_from_image2d(input, 1); \ + Image img2 = create_image_from_image2d(output, 1); \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32D_4x4); \ + \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_8BITS_ARRAY_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_ARRAY_AXIS1_2D(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_array_I16toI16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + 
Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(coord.y = 0; coord.y < height; coord.y++) + { + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } +} + +__kernel void cumsum_array_F16toF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + for(; coord.x < width; coord.x += 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_ARRAY_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type, stride_data) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0, sum1; \ + sum0 ^= sum0; \ + sum1 ^= sum1; \ + short zp = (short)input_zp; \ + Image img1 = create_image_from_image2d(input, stride_data); \ + Image img2 = create_image_from_image2d(output, stride_data); \ + \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ 
+ __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ +} + +CUMSUM_ARRAY_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, 1) +CUMSUM_ARRAY_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, 1) +CUMSUM_ARRAY_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, 2) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx new file mode 100644 index 00000000..adc80187 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx @@ -0,0 +1,244 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_array_BF16toBF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(coord.z = 0; coord.z < channel; coord.z++) + { + float4 data0, data1; + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data0, data1; + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + + for(; coord.x < width; coord.x += 8) + { + float4 data0, data1; + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + Image img1 = create_image_from_image2d(input, 2); + Image img2 = 
create_image_from_image2d(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(; coord.y < height; coord.y++) + { + float4 data0, data1; + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + + Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + for(; coord.x < width; coord.x += 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + float4 data0, data1; + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx new file mode 100644 index 00000000..78e33fbc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx @@ -0,0 +1,259 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; 
+_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4; + + +_viv_uniform int width; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float output_zp; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_ex_rev_array_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(exclusive == 0 && rev) + { + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev == 0) + { + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + for(; coord.x < width - 8;) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord_out.x = coord.x + 1; + coord.x += 8; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global vxc_short8*)output_ptr; + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + coord.x = width 
- 8; + coord_out.x = width - 1; + _viv_asm(COPY, dst, sum, 16); + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst; + for(; coord.x > 0;) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + coord_out.x = coord.x - 1; + coord.x -= 8; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } +} + +#define CUMSUM_QINT_EX_REV_ARRAY_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + for(coord.x = 
-1; coord.x < width - 8;) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + coord_out.x = coord.x + 1; \ + coord.x += 8; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global dst_type*)output_ptr; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev) \ + { \ + for(coord.x = width - 7; coord.x > 0;) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + coord_out.x = coord.x - 1; \ + coord.x -= 8; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1) +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I8, I8, vxc_char16, vxc_char16, 1) +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I16, I16, vxc_short8, vxc_short8, 2) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx new file mode 100644 index 00000000..5b548ec7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx @@ -0,0 +1,330 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform 
VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int height; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_ex_rev_array_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev == 0) + { + dst ^= dst; + out_ptr[0] = dst; + for(; coord.y < height - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.y++; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + dst ^= dst; + coord.y = height - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst; + + for(; coord.y > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.y--; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } +} + +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \ + uchar* input_ptr = 
get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.y < height - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + coord.y++; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ + } \ + 
else if(exclusive && rev) \ + { \ + coord.y = height - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global vxc_short8*)output_ptr; \ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.y > 0;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + coord.y--; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_array_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = 
convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + out_ptr[0] = dst; + } + } + else if(exclusive && rev == 0) + { + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + out_ptr[0] = dst.xxxxxxxx; + for(; coord.y < height - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.y++; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + coord.y = height - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst.xxxxxxxx; + for(; coord.y > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + coord.y--; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx new file mode 100644 index 00000000..5d94783d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx @@ -0,0 +1,322 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int channel; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + 
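+// Cumulative-sum kernels (exclusive / reverse variants) along axis 2, the
+// channel dimension. Each work-item handles one vector-wide column at its
+// (x, y) position, walking coord.z across `channel` while keeping a running
+// sum in registers; loads and stores go through the Tensor pointer helpers.
+// The (w_size, remainder) guard shifts the last partial vector back so the
+// full vector-width access stays in bounds, overlapping the previous vector.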
+__kernel void cumsum_ex_rev_array_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(rev && exclusive == 0) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(rev == 0 && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + for(; coord.z < channel - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.z++; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(rev && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + coord.z = channel - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst; + for(; coord.z > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.z--; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } +} + +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(rev && exclusive == 0) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + input_ptr = 
get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.z < channel - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + coord.z++; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev && exclusive) \ + { \ + coord.z = channel - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global vxc_short8*)output_ptr; \ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.z > 0;) \ + { \ + input_ptr = 
get_tensor_ptr_from_coord(img1, coord); \ +            in_ptr = (__global src_type*)input_ptr; \ +            src = in_ptr[0]; \ +            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ +            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ +            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ +            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ +            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ +            coord.z--; \ +            output_ptr = get_tensor_ptr_from_coord(img2, coord); \ +            out_ptr = (__global dst_type*)output_ptr; \ +            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ +            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ +            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ +            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ +            int4 tmpDst0 = convert_int4_rte(tmpSum0); \ +            int4 tmpDst1 = convert_int4_rte(tmpSum1); \ +            int4 tmpDst2 = convert_int4_rte(tmpSum2); \ +            int4 tmpDst3 = convert_int4_rte(tmpSum3); \ +            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ +            uniConvertInt32toUint8_2x8); \ +            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ +            uniConvertInt32toUint8_2x8); \ +            out_ptr[0] = dst; \ +        } \ +    } \ +} +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_array_I16toI16_axis2( +    __read_only image2d_array_t input, +    __write_only image2d_array_t output, +    int axis, int exclusive, int rev) +{ +    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + +    vxc_short8 src, dst; +    int4 sum0 = (int4)(0), sum1 = (int4)(0); +    if (coord.x == ((w_size >> 3) * 8) && remainder != 0) +    { +        coord.x = coord.x - (8 - remainder); +    } +    Tensor img1 = create_tensor_from_image2d_array(input, 2); +    Tensor img2 = create_tensor_from_image2d_array(output, 2); +    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); +    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); +    __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; +    __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; +    if(exclusive == 0 && rev) +    { +        for(coord.z = channel - 1; coord.z >= 0; coord.z--) +        { +            input_ptr = get_tensor_ptr_from_coord(img1, coord); +            output_ptr = get_tensor_ptr_from_coord(img2, coord); +            in_ptr = (__global vxc_short8*)input_ptr; +            out_ptr = (__global vxc_short8*)output_ptr; +            src = in_ptr[0]; +            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); +            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); +            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; +            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; +            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; +            int4 tmpDst0 = convert_int4_rte(tmpSum0); +            int4 tmpDst1 = convert_int4_rte(tmpSum1); +            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), +            uniConvertInt32toUint8_2x8); + +            VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +        } +    } +    else if(exclusive && rev == 0) +    { +        int tmpAlpha0 = convert_int_rte(output_zp); +        int4 tmpVal; +        tmpVal.x = tmpAlpha0; +        VXC_DP2x8(dst, tmpVal, tmpVal, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + out_ptr[0] = dst.xxxxxxxx; + for(; coord.z < channel - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.z++; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + coord.z = channel - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst.xxxxxxxx; + for(; coord.z > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; + coord.z--; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx new file mode 100644 index 00000000..41e9981f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx @@ -0,0 +1,324 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +#define CUMSUM_ARRAY_F16TOQINT_AXIS2(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, 
multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16, 1) + + +#define CUMSUM_ARRAY_F16TOQINT_AXIS1(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_AXIS0(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - 
remainder); \ + } \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + for(; coord.z < channel - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.z++; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev) \ + 
{ \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.z = channel - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + out_ptr[0] = dst; \ + for(; coord.z > 0;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.z--; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + for(; coord.y < height - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.y++; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.y = height - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + out_ptr[0] = dst; \ + for(; coord.y > 0;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.y--; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16, 1) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx new file mode 100644 index 00000000..21d37e09 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx @@ -0,0 +1,108 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +#define CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + for(; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, 
dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16, 1) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx index f6aa7c7c..77abb3b2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx @@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_1D( VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_I8toI8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = 
(int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_I16toI16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; + +} + +__kernel void gather_nd_array_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx index 74c1a229..eb127a58 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx @@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_2D( VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_I8toI8_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* 
input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_U8toU8_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; + +} + +__kernel void gather_nd_array_I16toI16_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_F16toF16_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx index e45482c7..175b4785 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx @@ -80,3 +80,85 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \ GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16) GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16) GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8) + +#define 
GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride) \ +__kernel void gather_nd_array_##src0_type_name##toF16_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + \ + Image img1 = create_image_from_image2d(input0, stride); \ + Image img2 = create_image_from_image2d(output, 2); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \ + read_type src = data_ptr[0]; \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + dst_ptr[0] = dst0; \ +} +GATHER_ND_ARRAY_QINT_TO_F16_2D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_2D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_2D(I16, vxc_short8, vxc_short8*, 2) + +#define GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride) \ +__kernel void gather_nd_array_F16to##src1_type_name##_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + \ + Image img1 = create_image_from_image2d(input0, 2); \ + Image img2 = create_image_from_image2d(output, stride); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \ + vxc_short8 src = data_ptr[0]; \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \ + dst_ptr[0] = dst; \ +} +GATHER_ND_ARRAY_F16_TO_QINT_2D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_2D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_2D(I16, vxc_short8, vxc_short8*, 2) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx index 566aaa55..7cf0cb89 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx @@ -98,3 +98,120 @@ __kernel void gather_nd_F16toF16_3D( VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } +__kernel void gather_nd_array_I8toI8_3D( + __read_only image2d_array_t input0, + 
__read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_U8toU8_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; + +} + +__kernel void gather_nd_array_I16toI16_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_F16toF16_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* 
data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx index e9ca9ecd..28397fe4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx @@ -80,3 +80,86 @@ GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16) GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16) GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8) +#define GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride) \ +__kernel void gather_nd_array_##src0_type_name##toF16_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + indice.w = 0; \ + Tensor img1 = create_tensor_from_image2d_array(input0, stride); \ + Image img2 = create_image_from_image2d(output, 2); \ + \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \ + read_type src = data_ptr[0]; \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + dst_ptr[0] = dst0; \ +} +GATHER_ND_ARRAY_QINT_TO_F16_3D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_3D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_3D(I16, vxc_short8, vxc_short8*, 2) + +#define GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride) \ +__kernel void gather_nd_array_F16to##src1_type_name##_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + indice.w = 0; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input0, 2); \ + Image img2 = create_image_from_image2d(output, stride); \ + \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \ + vxc_short8 src = data_ptr[0]; \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \ + dst_ptr[0] = dst; \ +} +GATHER_ND_ARRAY_F16_TO_QINT_3D(U8, vxc_uchar16, vxc_uchar16*, 1) 
+GATHER_ND_ARRAY_F16_TO_QINT_3D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_3D(I16, vxc_short8, vxc_short8*, 2) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx index e467f252..b3632383 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx @@ -95,3 +95,118 @@ __kernel void gather_nd_batch_F16toF16_1D( VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_batch_I8toI8_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_I16toI16_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr 
= (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx index 58c2af34..8e52eeac 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx @@ -26,7 +26,7 @@ __kernel void gather_nd_batch_I8toI8_2D( VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } -__kernel void gather_nd_U8toU8_2D( +__kernel void gather_nd_batch_U8toU8_2D( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -51,7 +51,7 @@ __kernel void gather_nd_U8toU8_2D( VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } -__kernel void gather_nd_I16toI16_2D( +__kernel void gather_nd_batch_I16toI16_2D( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -76,7 +76,7 @@ __kernel void gather_nd_I16toI16_2D( VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } -__kernel void gather_nd_F16toF16_2D( +__kernel void gather_nd_batch_F16toF16_2D( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -100,3 +100,123 @@ __kernel void gather_nd_F16toF16_2D( VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_batch_I8toI8_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* 
output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_U8toU8_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_I16toI16_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_F16toF16_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx index 8288ab05..b4660c29 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx @@ -81,3 +81,85 @@ GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16) GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16) GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8) +#define GATHER_ND_ARRAY_QINT_TO_F16_1D(src0_type_name, read_type, ptr_type, stride) \ +__kernel void gather_nd_array_##src0_type_name##toF16_1D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + coord.w = indice.x; \ + \ + Image img1 = create_image_from_image2d(input0, stride); \ + Image img2 = create_image_from_image2d(output, 2); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \ + read_type src = data_ptr[0]; \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + dst_ptr[0] = dst0; \ +} +GATHER_ND_ARRAY_QINT_TO_F16_1D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_1D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_1D(I16, vxc_short8, vxc_short8*, 2) + +#define GATHER_ND_ARRAY_F16_TO_QINT_1D(src1_type_name, write_type, ptr_type, stride) \ +__kernel void gather_nd_array_F16to##src1_type_name##_1D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + coord.w = indice.x; \ + \ + Image img1 = create_image_from_image2d(input0, 2); \ + Image img2 = create_image_from_image2d(output, stride); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \ + vxc_short8 src = data_ptr[0]; \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + dst_ptr[0] = dst; \ +} +GATHER_ND_ARRAY_F16_TO_QINT_1D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_1D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_1D(I16, vxc_short8, vxc_short8*, 2) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx index 3396163a..92cd9fba 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx @@ -65,5 +65,5 @@ __kernel void pre_process_gray_half_U8toU8 
coord_in.xy = coord_in.xy >> 1; - VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index debd6873..5d4159ac 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -6431,7 +6431,613 @@ CUMSUM_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16)\n\ CUMSUM_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8)\n\ "; /* end of cumsum_2d_vx*/ -static const char cumsum_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ +\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ 
+{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_array_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + 
tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\n\ +\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2); \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = 
convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_array_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ +\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_ARRAY_QINT_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_ARRAY_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_ARRAY_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_ARRAY_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_array_vx*/ + +static const char cumsum_array_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float 
in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ +\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_ARRAY_AXIS1_2D(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0); \\\n\ + int4 sum1 = (int4)(0); \\\n\ + int4 sum2 = (int4)(0); \\\n\ + int4 sum3 = (int4)(0); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input, 1); \\\n\ + Image img2 = create_image_from_image2d(output, 1); \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32D_4x4); \\\n\ + \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); 
\\\n\ + \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_ARRAY_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_ARRAY_AXIS1_2D(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_array_I16toI16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ 
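Editor's note (illustrative sketch, not part of the patch): the axis-0 cumsum kernel above and the quantized CUMSUM_ARRAY_QINT_AXIS0_2D variants that follow walk each row eight (or sixteen) elements at a time, keep a running horizontal prefix sum in registers, and handle a row width that is not a multiple of the vector size by stepping the final x coordinate back by `8 - remainder` so the last load/store stays inside the row. The quantized path subtracts the input zero point, accumulates in int32, then rescales with `in_out_scale` and `output_zp` and converts with round-to-nearest-even. The C sketch below mirrors only that scalar arithmetic and the tail index fix-up; the names `clamp_tail_x` and `cumsum_axis0_ref` are hypothetical helpers introduced here for illustration, and the real kernels additionally carry an 8-wide partial sum across loop iterations.

/* Scalar reference sketch for the quantized axis-0 cumulative sum arithmetic
 * used by the CUMSUM_ARRAY_QINT_AXIS0* kernels.  Hypothetical helper names;
 * not part of the TIM-VX patch. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the in-kernel tail fix-up: if x points at the last, partial
 * vector of `vec` elements, pull it back so the vector ends exactly at
 * `width` instead of reading past the end of the row. */
static int clamp_tail_x(int x, int width, int vec)
{
    int remainder = width % vec;
    if (remainder != 0 && x == (width / vec) * vec)
        x -= vec - remainder;
    return x;
}

/* out[x] = saturate_u8(round((sum_{i<=x} (in[i] - input_zp)) * in_out_scale + output_zp)) */
static void cumsum_axis0_ref(const uint8_t *in, uint8_t *out, int width,
                             int input_zp, float in_out_scale, float output_zp)
{
    int32_t acc = 0;
    for (int x = 0; x < width; ++x)
    {
        acc += (int32_t)in[x] - input_zp;      /* remove zero point, accumulate in int32 */
        float v = (float)acc * in_out_scale + output_zp;
        long q = lrintf(v);                    /* round to nearest (ties to even by default) */
        if (q < 0)   q = 0;                    /* saturate to the U8 output range */
        if (q > 255) q = 255;
        out[x] = (uint8_t)q;
    }
}

int main(void)
{
    uint8_t in[11] = {3, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10};
    uint8_t out[11];
    cumsum_axis0_ref(in, out, 11, 3, 1.0f, 0.0f);
    /* width = 11 with an 8-wide vector: the second iteration would start at
     * x = 8 and is pulled back to x = 3 so the window ends at element 10. */
    printf("clamped tail x = %d, out[10] = %d\n",
           clamp_tail_x(8, 11, 8), (int)out[10]);
    return 0;
}

For the float16 kernel above, the same tail clamp makes the last vector overlap elements that were already written; the overlapped positions are simply rewritten, so only the store pattern changes, not the per-element formula sketched here.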
+\n\ +#define CUMSUM_ARRAY_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type, stride_data) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0, sum1; \\\n\ + sum0 ^= sum0; \\\n\ + sum1 ^= sum1; \\\n\ + short zp = (short)input_zp; \\\n\ + Image img1 = create_image_from_image2d(input, stride_data); \\\n\ + Image img2 = create_image_from_image2d(output, stride_data); \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_ARRAY_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, 1)\n\ +CUMSUM_ARRAY_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, 1)\n\ +CUMSUM_ARRAY_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, 2)"; /* end of cumsum_array_2d_vx*/ + +static const char cumsum_array_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -6440,8 +7046,11 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ _viv_uniform int channel;\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ \n\ -__kernel void cumsum_BF16toBF16_axis2(\n\ +\n\ +__kernel void cumsum_array_BF16toBF16_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev\n\ @@ -6453,11 +7062,22 @@ __kernel void cumsum_BF16toBF16_axis2(\n\ vxc_ushort8 dst0, dst1, dst;\n\ vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ \n\ for(coord.z = 0; coord.z < channel; 
coord.z++)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ _viv_asm(COPY, data0, val0, 16);\n\ @@ -6468,7 +7088,7 @@ __kernel void cumsum_BF16toBF16_axis2(\n\ _viv_asm(COPY, dst0, sum0, 16);\n\ _viv_asm(COPY, dst1, sum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6484,11 +7104,22 @@ __kernel void cumsum_BF16toBF16_axis1(\n\ vxc_ushort8 dst0, dst1, dst;\n\ vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ _viv_asm(COPY, data0, val0, 16);\n\ @@ -6498,7 +7129,7 @@ __kernel void cumsum_BF16toBF16_axis1(\n\ _viv_asm(COPY, dst0, sum0, 16);\n\ _viv_asm(COPY, dst1, sum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6516,11 +7147,21 @@ __kernel void cumsum_BF16toBF16_axis0(\n\ float preSum = 0;\n\ float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ \n\ for(; coord.x < width; coord.x += 8)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ 
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ _viv_asm(COPY, data0, val0, 16);\n\ @@ -6538,7 +7179,7 @@ __kernel void cumsum_BF16toBF16_axis0(\n\ _viv_asm(COPY, dst0, tmpSum0, 16);\n\ _viv_asm(COPY, dst1, tmpSum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6554,11 +7195,22 @@ __kernel void cumsum_BF16toBF16_axis1_2D(\n\ vxc_ushort8 dst0, dst1, dst;\n\ vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ \n\ for(; coord.y < height; coord.y++)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ @@ -6573,7 +7225,7 @@ __kernel void cumsum_BF16toBF16_axis1_2D(\n\ _viv_asm(COPY, dst1, sum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniExtractOddData_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6592,10 +7244,20 @@ __kernel void cumsum_BF16toBF16_axis0_2D(\n\ float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ \n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ for(; coord.x < width; coord.x += 8)\n\ {\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ float4 data0, data1;\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ @@ -6616,12 +7278,12 @@ __kernel void cumsum_BF16toBF16_axis0_2D(\n\ _viv_asm(COPY, dst1, tmpSum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniExtractOddData_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ -"; /* end of cumsum_bf16_vx*/ +"; /* end of cumsum_array_bf16_vx*/ -static const char cumsum_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ @@ 
-6654,7 +7316,11 @@ _viv_uniform int input_zp;\n\ _viv_uniform float in_out_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -__kernel void cumsum_ex_rev_F16toF16_axis0(\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_ex_rev_array_F16toF16_axis0(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev\n\ @@ -6666,11 +7332,26 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ vxc_short8 src, dst;\n\ vxc_half8 data, tmpsum, sum;\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ @@ -6679,26 +7360,34 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ uniSumHorzRevF16toF16C_2x8);\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev == 0)\n\ {\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ for(; coord.x < width - 8;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord_out.x = coord.x + 1;\n\ coord.x += 8;\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ @@ -6706,10 +7395,20 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ coord.x = width - 8;\n\ coord_out.x 
= width - 1;\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(; coord.x > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ coord_out.x = coord.x - 1;\n\ coord.x -= 8;\n\ _viv_asm(COPY, data, src, 16);\n\ @@ -6720,13 +7419,13 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ uniSumHorzRevF16toF16C_2x8);\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ +#define CUMSUM_QINT_EX_REV_ARRAY_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \\\n\ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -6741,10 +7440,25 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ short zp = (short)input_zp; \\\n\ \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \\\n\ { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ @@ -6759,16 +7473,24 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ { \\\n\ for(coord.x = -1; coord.x < width - 8;) \\\n\ { \\\n\ - 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord_out.x = coord.x + 1; \\\n\ coord.x += 8; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ @@ -6782,14 +7504,22 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ { \\\n\ for(coord.x = width - 7; coord.x > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord_out.x = coord.x - 1; \\\n\ coord.x -= 8; \\\n\ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ @@ -6805,16 +7535,16 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ -CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ -CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ -"; /* end of cumsum_ex_rev_axis0_vx*/ +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1)\n\ +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I8, I8, vxc_char16, vxc_char16, 1)\n\ +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I16, I16, vxc_short8, vxc_short8, 2)\n\ +"; /* end of cumsum_array_ex_rev_axis0_vx*/ -static const char cumsum_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ @@ -6830,7 +7560,11 @@ _viv_uniform float in_out_scale;\n\ _viv_uniform float in_out_zp_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -__kernel void cumsum_ex_rev_F16toF16_axis1(\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_ex_rev_array_F16toF16_axis1(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ 
-6840,54 +7574,80 @@ __kernel void cumsum_ex_rev_F16toF16_axis1(\n\ vxc_short8 src, dst;\n\ vxc_half8 data, sum;\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev == 0)\n\ {\n\ dst ^= dst;\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ for(; coord.y < height - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.y++;\n\ _viv_asm(COPY, data, src, 16);\n\ +\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ {\n\ dst ^= dst;\n\ coord.y = height - 1;\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst;\n\ \n\ for(; coord.y > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.y--;\n\ _viv_asm(COPY, data, src, 16);\n\ +\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void 
cumsum_ex_rev_array_##in_name##to##out_name##_axis1( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev) \\\n\ @@ -6898,11 +7658,25 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ dst_type dst; \\\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ @@ -6920,7 +7694,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ uniConvertInt32toUint8_2x8); \\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ @@ -6929,11 +7703,15 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.y < height - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.y++; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ @@ -6951,7 +7729,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ uniConvertInt32toUint8_2x8);\\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 
0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ @@ -6961,16 +7739,22 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global vxc_short8*)output_ptr; \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.y > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ coord.y--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ @@ -6983,14 +7767,14 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ uniConvertInt32toUint8_2x8);\\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ -CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ \n\ -__kernel void cumsum_ex_rev_I16toI16_axis1(\n\ +__kernel void cumsum_ex_rev_array_I16toI16_axis1(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ -6999,11 +7783,25 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ \n\ vxc_short8 src, dst;\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr 
= (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ @@ -7013,8 +7811,7 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ uniConvertInt32toUint8_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev == 0)\n\ @@ -7023,12 +7820,15 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.y < height - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.y++;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;\n\ @@ -7039,7 +7839,7 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ @@ -7049,15 +7849,20 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.y > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ coord.y--;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ @@ -7065,13 +7870,13 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ VXC_DP2x8(dst, tmpDst0, 
tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ -"; /* end of cumsum_ex_rev_axis1_vx*/ +"; /* end of cumsum_array_ex_rev_axis1_vx*/ -static const char cumsum_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ @@ -7087,7 +7892,11 @@ _viv_uniform float in_out_scale;\n\ _viv_uniform float in_out_zp_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -__kernel void cumsum_ex_rev_F16toF16_axis2(\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_ex_rev_array_F16toF16_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ -7097,53 +7906,76 @@ __kernel void cumsum_ex_rev_F16toF16_axis2(\n\ vxc_short8 src, dst;\n\ vxc_half8 data, sum;\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(rev && exclusive == 0)\n\ {\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev == 0 && exclusive)\n\ {\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ for(; coord.z < channel - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.z++;\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev && exclusive)\n\ {\n\ _viv_asm(COPY, dst, sum, 16);\n\ coord.z = channel - 1;\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(; coord.z > 0;)\n\ {\n\ - 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.z--;\n\ _viv_asm(COPY, data, src, 16);\n\ +\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev) \\\n\ @@ -7154,11 +7986,25 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ dst_type dst; \\\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(rev && exclusive == 0) \\\n\ { \\\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ @@ -7176,7 +8022,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ uniConvertInt32toUint8_2x8);\\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ @@ -7185,10 +8031,12 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.z < channel - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.z++; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ @@ -7207,7 +8055,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ uniConvertInt32toUint8_2x8); \\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(rev && exclusive) \\\n\ @@ -7217,16 +8065,22 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global vxc_short8*)output_ptr; \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.z > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ coord.z--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ @@ -7239,14 +8093,14 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ uniConvertInt32toUint8_2x8); \\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1),\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ -CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ \n\ -__kernel void cumsum_ex_rev_I16toI16_axis2(\n\ +__kernel void cumsum_ex_rev_array_I16toI16_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ -7255,11 +8109,25 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ \n\ vxc_short8 src, dst;\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = 
create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ @@ -7279,10 +8147,12 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.z < channel - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.z++;\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ @@ -7294,7 +8164,7 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ @@ -7304,10 +8174,14 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.z > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ @@ -7319,13 +8193,14 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = 
dst;\n\ }\n\ }\n\ }\n\ -"; /* end of cumsum_ex_rev_axis2_vx*/ +\n\ +"; /* end of cumsum_array_ex_rev_axis2_vx*/ -static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ _viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ @@ -7342,8 +8217,12 @@ _viv_uniform int channel;\n\ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ \n\ -#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis2( \\\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS2(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7357,24 +8236,34 @@ __kernel void cumsum_F16to##out_name##_axis2( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ \n\ -#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis1( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS1(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis1( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7388,23 +8277,33 @@ __kernel void cumsum_F16to##out_name##_axis1( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + if (coord.x == ((w_size 
>> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ for(coord.y = 0; coord.y < height; coord.y++) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ -#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis0( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS0(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7418,9 +8317,19 @@ __kernel void cumsum_F16to##out_name##_axis0( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ for(; coord.x < width; coord.x += 8) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \\\n\ @@ -7429,83 +8338,15 @@ __kernel void cumsum_F16to##out_name##_axis0( \\\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - } \\\n\ -}\n\ -CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16)\n\ -\n\ -#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis1_2D( \\\n\ - __read_only image2d_t input, 
\\\n\ - __write_only image2d_t output, \\\n\ - int axis, int exclusive, int rev \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), 0); \\\n\ - \\\n\ - vxc_short8 src; \\\n\ - dst_type dst; \\\n\ - vxc_half8 data, sum; \\\n\ - VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ - vxc_ushort8 ms0; \\\n\ - _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ - for(; coord.y < height; coord.y++) \\\n\ - { \\\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, data, src, 16); \\\n\ - \\\n\ - VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniAccSumVertF16toF16_2x8); \\\n\ - VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - } \\\n\ -}\n\ -CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16)\n\ -\n\ -#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis0_2D( \\\n\ - __read_only image2d_t input, \\\n\ - __write_only image2d_t output, \\\n\ - int axis, int exclusive, int rev \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - \\\n\ - vxc_short8 src; \\\n\ - dst_type dst; \\\n\ - vxc_half8 data, tmpsum, sum; \\\n\ - VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ - vxc_ushort8 ms0; \\\n\ - _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ - for(; coord.x < width; coord.x += 8) \\\n\ - { \\\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, data, src, 16); \\\n\ - \\\n\ - VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniSumHorzF16toF16A_4x4); \\\n\ - VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniSumHorzF16toF16B_4x4); \\\n\ - VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniSumHorzF16toF16C_2x8); \\\n\ - VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniAccSumHorzF16toF16_2x8); \\\n\ - VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ -#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7519,33 +8360,51 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ { \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.z < channel - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.z++; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ @@ -7553,26 +8412,32 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ coord.z = channel - 1; \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.z > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.z--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ -#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis1( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7586,32 +8451,50 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ { \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.y < height - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.y++; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ @@ -7619,191 +8502,1512 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ coord.y = height - 1; \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.y > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.y--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ -"; /* end of cumsum_f16_u8_vx*/ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16, 1)"; /* end of cumsum_array_f16_u8_vx*/ -static const char custom_softmax_vx[] = "/*\n\ - ============================================================================\n\ - Name : Softmax2.vx\n\ - Author : VSI\n\ - Version :\n\ - Copyright : Your copyright notice\n\ - Description :\n\ - ============================================================================\n\ - */\n\ -#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_f16_u8_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ -_viv_uniform int sf_size;\n\ - #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ -__kernel void Softmax2VXC\n\ - (\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int axis\n\ - )\n\ -{\n\ - int4 coord_in = (int4)(0,0,0,0);\n\ - float fMax = 0.0;\n\ - for (int i = 0; i < sf_size; i++)\n\ - {\n\ - vxc_char8 val;\n\ - coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ - float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ \n\ - fMax = F_MAX(fMax, fval);\n\ - }\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ \n\ - float fProbSum = 0.0f;\n\ - vxc_short8 dst;\n\ - for (int i = 0; 
i < sf_size; i++)\n\ - {\n\ - vxc_char8 val;\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ \n\ - coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ - float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ \n\ - float fOut = (float)exp(fval - fMax);\n\ - fProbSum += fOut;\n\ - half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ - _viv_asm(COPY,dst,hVal, 4);\n\ - VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ \n\ - for (int i = 0; i < sf_size; i++)\n\ - {\n\ - vxc_short8 val;\n\ - vxc_half8 val_h;\n\ - coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ - float fval;\n\ - _viv_asm(COPY, val_h,val, 16);\n\ - VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ \n\ - float fOut =fval/fProbSum;\n\ - half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ - _viv_asm(COPY,dst,hVal, 4);\n\ - VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + for(; coord.y < height; coord.y++) \\\n\ + { \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ }\n\ -"; /* end of custom_softmax_vx*/ +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16, 1)\n\ +\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; 
\\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16, 1)\n\ +"; /* end of cumsum_array_f16_u8_2d_vx*/ -static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +static const char cumsum_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ -_viv_uniform float4 matrix0;\n\ -_viv_uniform float2 matrix1;\n\ -_viv_uniform float4 matrix4;\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ \n\ -__kernel void custom_warp_affine_nearest_neighbor_U8toU8\n\ -(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - float _m0,\n\ - float _m1,\n\ - float _m2,\n\ - float _m3,\n\ - float _m4,\n\ - float _m5\n\ -)\n\ +__kernel void cumsum_BF16toBF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ -\n\ - float4 coord_f = convert_float4(coord_in);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, 
coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ \n\ - int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_input.w, baseAddr);\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ \n\ - vxc_uchar16 dst;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +__kernel void cumsum_BF16toBF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, 
coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ }\n\ \n\ -__kernel void custom_warp_affine_bilinear_U8toU8\n\ -(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - float _m0,\n\ - float _m1,\n\ - float _m2,\n\ - float _m3,\n\ - float _m4,\n\ - float _m5\n\ -)\n\ +__kernel void cumsum_BF16toBF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ - float4 coord_f = convert_float4(coord_in);\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ \n\ - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ \n\ - int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_input.w, baseAddr);\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ \n\ - vxc_uchar16 src0, src1, dst;\n\ - VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + 
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ +\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ +\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ +\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of cumsum_bf16_vx*/ + +static const char cumsum_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;\n\ +\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only 
image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x < width - 8;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x + 1;\n\ + coord.x += 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.x = width - 8;\n\ + coord_out.x = width - 1;\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x - 1;\n\ + coord.x -= 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \\\n\ + int4 
coord_out = coord; \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + for(coord.x = -1; coord.x < width - 8;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x + 1; \\\n\ + coord.x += 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + for(coord.x = width - 7; coord.x > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x - 1; \\\n\ + coord.x -= 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_ex_rev_axis0_vx*/ + +static const char cumsum_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + dst ^= dst;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + dst ^= dst;\n\ + coord.y = height - 1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, 
out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 
= convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + coord.y = height - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ + coord.y--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.y = height - 1;\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + coord.y--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis1_vx*/ + +static const char cumsum_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int channel;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, 
int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(rev && exclusive == 0)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev == 0 && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + coord.z = channel - 1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(rev && exclusive == 0) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 
tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(rev && exclusive) \\\n\ + { \\\n\ + coord.z = channel - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + coord.z--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + 
tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.z = channel - 1;\n\ + int tmpAlpha0 = 
convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + coord.z--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis2_vx*/ + +static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 
0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 
7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.z = channel - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + 
else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.y = height - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +"; /* end of cumsum_f16_u8_vx*/ + +static const char custom_softmax_vx[] = "/*\n\ + ============================================================================\n\ + Name : Softmax2.vx\n\ + Author : VSI\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ +_viv_uniform int sf_size;\n\ + #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ +__kernel void Softmax2VXC\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(0,0,0,0);\n\ + float fMax = 0.0;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + fMax = F_MAX(fMax, fval);\n\ + }\n\ +\n\ + float fProbSum = 0.0f;\n\ + vxc_short8 dst;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ +\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut = (float)exp(fval - fMax);\n\ + fProbSum += fOut;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + _viv_asm(COPY, val_h,val, 16);\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut =fval/fProbSum;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of custom_softmax_vx*/ + +static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float2 matrix1;\n\ +_viv_uniform float4 matrix4;\n\ +\n\ 
+__kernel void custom_warp_affine_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, 
VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ @@ -13458,35 +15662,244 @@ __kernel void gather_batch_I16toF16_axis0(\n\ int is_array\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 indices = read_imagei(input1, coord.xz);\n\ - indices = indices >= 0 ? indices : indices + axis_num;\n\ - int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_mix_batch_vx*/ + +static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void gather_nd_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_I16toI16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t 
input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_array_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_I16toI16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int 
coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +\n\ +}\n\ +\n\ +__kernel void gather_nd_array_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ \n\ - vxc_short8 src, dst;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = indices.y;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = indices.z;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = indices.w;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - vxc_half8 src0;\n\ - vxc_short8 dst0;\n\ - vxc_ushort8 ms0;\n\ - _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ - VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniU8MulAndPostShift_0_Lo_2x8);\n\ - _viv_asm(COPY, dst0, src0, 16);\n\ + coord.w = indice.x;\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ -"; /* end of gather_mix_batch_vx*/ +"; /* end of gather_nd_vx*/ -static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char gather_nd_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -__kernel void gather_nd_I8toI8_1D(\n\ +__kernel void gather_nd_I8toI8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13502,15 +15915,15 @@ __kernel void gather_nd_I8toI8_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_char16 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, 
VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_U8toU8_1D(\n\ +__kernel void gather_nd_U8toU8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13526,14 +15939,14 @@ __kernel void gather_nd_U8toU8_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_uchar16 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_I16toI16_1D(\n\ +__kernel void gather_nd_I16toI16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13549,14 +15962,14 @@ __kernel void gather_nd_I16toI16_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_F16toF16_1D(\n\ +__kernel void gather_nd_F16toF16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13572,17 +15985,14 @@ __kernel void gather_nd_F16toF16_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of gather_nd_vx*/ - -static const char gather_nd_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -__kernel void gather_nd_I8toI8_2D(\n\ +__kernel void gather_nd_array_I8toI8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13600,13 +16010,17 @@ __kernel void gather_nd_I8toI8_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ \n\ - vxc_char16 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ \n\ -__kernel void gather_nd_U8toU8_2D(\n\ +__kernel void gather_nd_array_U8toU8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13624,12 +16038,18 @@ __kernel void gather_nd_U8toU8_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ 
\n\ - vxc_uchar16 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +\n\ }\n\ \n\ -__kernel void gather_nd_I16toI16_2D(\n\ +__kernel void gather_nd_array_I16toI16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13647,12 +16067,17 @@ __kernel void gather_nd_I16toI16_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ \n\ - vxc_short8 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ \n\ -__kernel void gather_nd_F16toF16_2D(\n\ +__kernel void gather_nd_array_F16toF16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13670,9 +16095,14 @@ __kernel void gather_nd_F16toF16_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ \n\ - vxc_short8 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ "; /* end of gather_nd_2d_vx*/ @@ -13758,7 +16188,88 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \\\n\ GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16)\n\ GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16)\n\ GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8)\n\ -"; /* end of gather_nd_2d_mix_vx*/ +\n\ +#define GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_##src0_type_name##toF16_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, 
stride); \\\n\ + Image img2 = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \\\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \\\n\ + read_type src = data_ptr[0]; \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + dst_ptr[0] = dst0; \\\n\ +}\n\ +GATHER_ND_ARRAY_QINT_TO_F16_2D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_2D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_2D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +#define GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_F16to##src1_type_name##_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, 2); \\\n\ + Image img2 = create_image_from_image2d(output, stride); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \\\n\ + vxc_short8 src = data_ptr[0]; \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \\\n\ + dst_ptr[0] = dst; \\\n\ +}\n\ +GATHER_ND_ARRAY_F16_TO_QINT_2D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_2D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_2D(I16, vxc_short8, vxc_short8*, 2)"; /* end of gather_nd_2d_mix_vx*/ static const char gather_nd_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -13860,6 +16371,123 @@ __kernel void gather_nd_F16toF16_3D(\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void gather_nd_array_I8toI8_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + Image img2 = 
create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_U8toU8_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ +\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +\n\ +}\n\ +\n\ +__kernel void gather_nd_array_I16toI16_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_F16toF16_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ "; /* end of gather_nd_3d_vx*/ static const char gather_nd_3d_mix_vx[] = "#include 
\"cl_viv_vx_ext.h\"\n\ @@ -13944,6 +16572,89 @@ GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16)\n\ GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16)\n\ GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8)\n\ \n\ +#define GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_##src0_type_name##toF16_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + indice.w = 0; \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, stride); \\\n\ + Image img2 = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \\\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \\\n\ + read_type src = data_ptr[0]; \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + dst_ptr[0] = dst0; \\\n\ +}\n\ +GATHER_ND_ARRAY_QINT_TO_F16_3D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_3D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_3D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +#define GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_F16to##src1_type_name##_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + indice.w = 0; \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2); \\\n\ + Image img2 = create_image_from_image2d(output, stride); \\\n\ + \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \\\n\ + vxc_short8 src = data_ptr[0]; \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \\\n\ + dst_ptr[0] = dst; \\\n\ +}\n\ +GATHER_ND_ARRAY_F16_TO_QINT_3D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_3D(I8, vxc_char16, vxc_char16*, 1)\n\ 
+GATHER_ND_ARRAY_F16_TO_QINT_3D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ "; /* end of gather_nd_3d_mix_vx*/ static const char gather_nd_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -14043,7 +16754,121 @@ __kernel void gather_nd_batch_F16toF16_1D(\n\ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of gather_nd_batch_vx*/ +\n\ +__kernel void gather_nd_array_batch_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_I16toI16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = 
get_tensor_ptr_from_coord(img2, coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}"; /* end of gather_nd_batch_vx*/ static const char gather_nd_batch_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -14073,7 +16898,7 @@ __kernel void gather_nd_batch_I8toI8_2D(\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_U8toU8_2D(\n\ +__kernel void gather_nd_batch_U8toU8_2D(\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -14098,7 +16923,7 @@ __kernel void gather_nd_U8toU8_2D(\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_I16toI16_2D(\n\ +__kernel void gather_nd_batch_I16toI16_2D(\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -14123,7 +16948,7 @@ __kernel void gather_nd_I16toI16_2D(\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_F16toF16_2D(\n\ +__kernel void gather_nd_batch_F16toF16_2D(\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -14147,6 +16972,126 @@ __kernel void gather_nd_F16toF16_2D(\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel void gather_nd_array_batch_I8toI8_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + 
Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_U8toU8_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_I16toI16_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_F16toF16_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, 
coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ "; /* end of gather_nd_batch_2d_vx*/ static const char gather_nd_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -14232,6 +17177,88 @@ GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16)\n\ GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16)\n\ GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)\n\ \n\ +#define GATHER_ND_ARRAY_QINT_TO_F16_1D(src0_type_name, read_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_##src0_type_name##toF16_1D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + coord.w = indice.x; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, stride); \\\n\ + Image img2 = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \\\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \\\n\ + read_type src = data_ptr[0]; \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + dst_ptr[0] = dst0; \\\n\ +}\n\ +GATHER_ND_ARRAY_QINT_TO_F16_1D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_1D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_1D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +#define GATHER_ND_ARRAY_F16_TO_QINT_1D(src1_type_name, write_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_F16to##src1_type_name##_1D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + coord.w = indice.x; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, 2); \\\n\ + Image img2 = create_image_from_image2d(output, stride); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \\\n\ + vxc_short8 src = data_ptr[0]; \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + dst_ptr[0] = dst; \\\n\ +}\n\ 
+GATHER_ND_ARRAY_F16_TO_QINT_1D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_1D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_1D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +\n\ "; /* end of gather_nd_mix_vx*/ static const char get_matrix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -39749,7 +42776,7 @@ __kernel void pre_process_gray_half_U8toU8\n\ \n\ coord_in.xy = coord_in.xy >> 1;\n\ \n\ - VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of pre_process_gray_2_vx*/ @@ -60368,6 +63395,169 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char col2im_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width_pad;\n\ +_viv_uniform int height_pad;\n\ +_viv_uniform int depth_pad;\n\ +_viv_uniform int move_time_x;\n\ +_viv_uniform int move_time_y;\n\ +_viv_uniform int kernel_x_new;\n\ +_viv_uniform int kernel_y_new;\n\ +_viv_uniform int kernel_z_new;\n\ +_viv_uniform int depth;\n\ +\n\ +#define COL2IM(name, read_type, dst_type ,convert_type, write_type) \\\n\ +__kernel void col2im_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride_w, \\\n\ + int stride_h, \\\n\ + int stride_d, \\\n\ + int dilation_w, \\\n\ + int dilation_h, \\\n\ + int dilation_d, \\\n\ + int pad_w_front, \\\n\ + int pad_w_end, \\\n\ + int pad_h_front, \\\n\ + int pad_h_end, \\\n\ + int pad_d_front, \\\n\ + int pad_d_end, \\\n\ + int kernel_x, \\\n\ + int kernel_y, \\\n\ + int kernel_z, \\\n\ + float inOutScale, \\\n\ + float inOutTile \\\n\ +) \\\n\ +{ \\\n\ + int x = get_global_id(0); \\\n\ + int y = get_global_id(1); \\\n\ + int z = get_global_id(2); \\\n\ + int4 coord_out = (int4)(x,y,z,0); \\\n\ + int b = z / depth; \\\n\ + z = z % depth; \\\n\ + int4 coord_in = (int4)(0,0,b,0); \\\n\ + \\\n\ + float sum = 0.0f; \\\n\ + x = x + pad_w_front; \\\n\ + y = y + pad_h_front; \\\n\ + z = z + pad_d_front; \\\n\ + int offset_x = x % stride_w; \\\n\ + int offset_y = y % stride_h; \\\n\ + int offset_z = z % stride_d; \\\n\ + int i,j,k; \\\n\ + for (k = offset_z; k < kernel_z_new; k += stride_d) \\\n\ + { \\\n\ + if ((z - k) < 0 || (z + (kernel_z_new - k)) > depth_pad || k % dilation_d != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \\\n\ + { \\\n\ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + for (i = offset_x; i < kernel_x_new; i = i + stride_w) \\\n\ + { \\\n\ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + coord_in.x = (x - i + stride_w - 1) / stride_w + \\\n\ + (y - j + stride_h - 1) / stride_h * move_time_x + \\\n\ + (z - k + stride_d - 1) / stride_d * move_time_y * move_time_x; \\\n\ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h + k * kernel_x * kernel_y / dilation_d; \\\n\ + sum = sum + convert_float(read_type(input, coord_in).x); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + sum = sum * inOutScale + inOutTile; \\\n\ + dst_type dst = 0; \\\n\ + dst.x = convert_type(sum); \\\n\ + write_type(output, coord_out, dst); \\\n\ +}\n\ +COL2IM(U32toU32, read_imageui, uint4, convert_uint, write_imageui)\n\ +COL2IM(U32toI32, read_imageui, int4, 
convert_int, write_imagei)\n\ +COL2IM(U32toF32, read_imageui, float4, convert_float, write_imagef)\n\ +COL2IM(I32toU32, read_imagei, uint4, convert_uint, write_imageui)\n\ +COL2IM(I32toI32, read_imagei, int4, convert_int, write_imagei)\n\ +COL2IM(I32toF32, read_imagei, float4, convert_float, write_imagef)\n\ +COL2IM(F32toU32, read_imagef, uint4, convert_uint, write_imageui)\n\ +COL2IM(F32toI32, read_imagef, int4, convert_int, write_imagei)\n\ +COL2IM(F32toF32, read_imagef, float4, convert_float, write_imagef)\n\ +\n\ +#define COL2IM_2D(name, read_type, dst_type ,convert_type, write_type) \\\n\ +__kernel void col2im_##name##_2D \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride_w, \\\n\ + int stride_h, \\\n\ + int stride_d, \\\n\ + int dilation_w, \\\n\ + int dilation_h, \\\n\ + int dilation_d, \\\n\ + int pad_w_front, \\\n\ + int pad_w_end, \\\n\ + int pad_h_front, \\\n\ + int pad_h_end, \\\n\ + int pad_d_front, \\\n\ + int pad_d_end, \\\n\ + int kernel_x, \\\n\ + int kernel_y, \\\n\ + int kernel_z, \\\n\ + float inOutScale, \\\n\ + float inOutTile \\\n\ +) \\\n\ +{ \\\n\ + int x = get_global_id(0); \\\n\ + int y = get_global_id(1); \\\n\ + int z = get_global_id(2); \\\n\ + int4 coord_out = (int4)(x,y,z,0); \\\n\ + int4 coord_in = (int4)(0,0,z,0); \\\n\ + \\\n\ + float sum = 0.0f; \\\n\ + x = x + pad_w_front; \\\n\ + y = y + pad_h_front; \\\n\ + int offset_x = x % stride_w; \\\n\ + int offset_y = y % stride_h; \\\n\ + int i,j; \\\n\ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \\\n\ + { \\\n\ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + for (i = offset_x; i < kernel_x_new; i = i + stride_w) \\\n\ + { \\\n\ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + coord_in.x = (x - i + stride_w - 1) / stride_w + \\\n\ + (y - j + stride_h - 1) / stride_h * move_time_x; \\\n\ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h; \\\n\ + sum = sum + convert_float(read_type(input, coord_in).x); \\\n\ + } \\\n\ + } \\\n\ + sum = sum * inOutScale + inOutTile; \\\n\ + dst_type dst = 0; \\\n\ + dst.x = convert_type(sum); \\\n\ + write_type(output, coord_out, dst); \\\n\ +}\n\ +COL2IM_2D(U32toU32, read_imageui, uint4, convert_uint, write_imageui)\n\ +COL2IM_2D(U32toI32, read_imageui, int4, convert_int, write_imagei)\n\ +COL2IM_2D(U32toF32, read_imageui, float4, convert_float, write_imagef)\n\ +COL2IM_2D(I32toU32, read_imagei, uint4, convert_uint, write_imageui)\n\ +COL2IM_2D(I32toI32, read_imagei, int4, convert_int, write_imagei)\n\ +COL2IM_2D(I32toF32, read_imagei, float4, convert_float, write_imagef)\n\ +COL2IM_2D(F32toU32, read_imagef, uint4, convert_uint, write_imageui)\n\ +COL2IM_2D(F32toI32, read_imagef, int4, convert_int, write_imagei)\n\ +COL2IM_2D(F32toF32, read_imagef, float4, convert_float, write_imagef)"; /* end of col2im_cl*/ + static const char crop_and_resize_bilinear_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -60582,7 +63772,339 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ - coord_out.z--;\n\ + coord_out.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.z = 0;\n\ + 
write_imagef(output, coord_out, sum);\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint4 dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0.0f; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.z = channel - 1; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.z--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.z = 0; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.z++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)\n\ +\n\ +\n\ +\n\ +__kernel void 
cumsum_F32toF32_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.y = height - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.y = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint4 dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.y = height - 1; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + \\\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y--; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.y = 0; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; 
\\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)\n\ +\n\ +\n\ +__kernel void cumsum_F32toF32_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.x = width - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.x--;\n\ sum += data;\n\ \n\ write_imagef(output, coord_out, sum);\n\ @@ -60590,12 +64112,12 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ else if(exclusive)\n\ {\n\ - coord_out.z = 0;\n\ + coord_out.x = 0;\n\ write_imagef(output, coord_out, sum);\n\ - for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ - coord_out.z++;\n\ + coord_out.x++;\n\ sum += data;\n\ \n\ write_imagef(output, coord_out, sum);\n\ @@ -60603,7 +64125,7 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ sum += data;\n\ @@ -60613,7 +64135,7 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ else\n\ {\n\ - for(coord.z = 0; coord.z < channel; coord.z++)\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ sum += data;\n\ @@ -60623,8 +64145,8 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis2( \\\n\ +#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, \\\n\ @@ -60647,16 +64169,16 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ dst.x = convert_uint_sat(tmp_zp); \\\n\ \\\n\ - float cnt = 0.0f; \\\n\ + float cnt = 0; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ - coord_out.z = channel - 1; \\\n\ + coord_out.x = width - 1; \\\n\ write_imageui(output, coord_out, dst); \\\n\ - for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ + for(coord.x = width - 1; coord.x > 0; 
coord.x--) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ - coord_out.z--; \\\n\ + coord_out.x--; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ @@ -60669,12 +64191,12 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ else if(exclusive) \\\n\ { \\\n\ - coord_out.z = 0; \\\n\ + coord_out.x = 0; \\\n\ write_imageui(output, coord_out, dst); \\\n\ - for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ - coord_out.z++; \\\n\ + coord_out.x++; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ @@ -60687,7 +64209,7 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ - for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ cnt += 1.0f; \\\n\ @@ -60702,7 +64224,7 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ else \\\n\ { \\\n\ - for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + for(coord.x = 0; coord.x < width; coord.x++) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ cnt += 1.0f; \\\n\ @@ -60716,344 +64238,851 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)\n\ +CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)\n\ +"; /* end of cumsum_cl*/ + +static const char cumsum_2d_cl[] = "\n\ +__kernel void cumsum_F32toF32_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + int 
tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toU8_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ 
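+ /* Requantize the running sum for the U8 output: tmpSum = sum.x * in_out_scale */\n\
+ /* + cnt * in_out_zp_scale + output_zp, rounded to nearest even; the cnt term */\n\
+ /* presumably carries a host-folded per-element zero-point correction. */\n\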
+\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + 
tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0.0f;\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ \n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ \n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ \n\ -__kernel void cumsum_F32toF32_axis1(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_2d_cl*/ + +static const char cumsum_array_2d_axis0_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t 
output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int channel,\n\ + int chn,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ -\n\ - float4 sum = (float4)(0);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ + float sum = (float)(0);\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord_out.y = height - 1;\n\ - write_imagef(output, coord_out, sum);\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ +\n\ + for(; coord.x > 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.y--;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord.z--;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord_out.y = 0;\n\ - write_imagef(output, coord_out, sum);\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + coord.z = 0;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.y++;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord.z++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis1( \\\n\ - __read_only 
image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ - int axis, \\\n\ - int exclusive, \\\n\ - int rev, \\\n\ - int width, \\\n\ - int height, \\\n\ - int channel, \\\n\ - int input_zp, \\\n\ - float in_out_scale, \\\n\ - float in_out_zp_scale, \\\n\ - float output_zp \\\n\ - ) \\\n\ -{ \\\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ - int4 coord_out = coord; \\\n\ - \\\n\ - src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ - int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ - \\\n\ - float cnt = 0; \\\n\ - \\\n\ - if(exclusive && rev) \\\n\ - { \\\n\ - coord_out.y = height - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - \\\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - coord_out.y--; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(exclusive) \\\n\ - { \\\n\ - coord_out.y = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - coord_out.y++; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(rev) \\\n\ - { \\\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ - else \\\n\ - { \\\n\ - for(coord.y = 0; coord.y < height; coord.y++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ -}\n\ -CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)\n\ +__kernel void cumsum_array_U8toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ + uint sum = (uint)(0);\n\ + uint dst = (uint)(0);\n\ \n\ -__kernel void cumsum_F32toF32_axis0(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + 
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global uint* in_ptr = (__global uint*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F32toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int channel,\n\ + int chn,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ \n\ + float cnt = 0.0f;\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + 
Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord_out.x = width - 1;\n\ - write_imagef(output, coord_out, sum);\n\ - for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + for(; coord.x > 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.x--;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord.z--;\n\ + cnt += 1.0;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord_out.x = 0;\n\ - write_imagef(output, coord_out, sum);\n\ + coord.z = 0;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.x++;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + cnt += 1.0f;\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + cnt += 1.0f;\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ -\n\ -#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis0( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only 
image2d_array_t output, \\\n\ - int axis, \\\n\ - int exclusive, \\\n\ - int rev, \\\n\ - int width, \\\n\ - int height, \\\n\ - int channel, \\\n\ - int input_zp, \\\n\ - float in_out_scale, \\\n\ - float in_out_zp_scale, \\\n\ - float output_zp \\\n\ - ) \\\n\ -{ \\\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ - int4 coord_out = coord; \\\n\ - \\\n\ - src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ - int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ - \\\n\ - float cnt = 0; \\\n\ - \\\n\ - if(exclusive && rev) \\\n\ - { \\\n\ - coord_out.x = width - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - for(coord.x = width - 1; coord.x > 0; coord.x--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - coord_out.x--; \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(exclusive) \\\n\ - { \\\n\ - coord_out.x = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - coord_out.x++; \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(rev) \\\n\ - { \\\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ - else \\\n\ - { \\\n\ - for(coord.x = 0; coord.x < width; coord.x++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ -}\n\ -CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)\n\ -"; /* end of cumsum_cl*/ +"; /* end of cumsum_array_2d_axis0_cl*/ -static const char cumsum_2d_cl[] = "\n\ -__kernel void cumsum_F32toF32_axis1_2D(\n\ +static const char cumsum_array_2d_axis1_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis1_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ int axis,\n\ @@ -61070,19 +65099,30 @@ __kernel void cumsum_F32toF32_axis1_2D(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ - float4 sum = (float4)(0);\n\ -\n\ + float sum = (float)(0);\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = 
(__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ coord.w = height - 1;\n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ coord.w--;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ @@ -61090,36 +65130,47 @@ __kernel void cumsum_F32toF32_axis1_2D(\n\ write_imagef(output, coord.zw, sum);\n\ for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ coord.w++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ -\n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis1_2D(\n\ +__kernel void cumsum_array_U8toU8_axis1_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ int axis,\n\ @@ -61136,82 +65187,107 @@ __kernel void cumsum_U8toU8_axis1_2D(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ + uint sum = (uint)(0);\n\ + uint dst = (uint)(0);\n\ \n\ int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ + dst = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0;\n\ -\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global uint* in_ptr = (__global uint*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr 
= (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w--;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w++;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -__kernel void cumsum_F32toU8_axis1_2D(\n\ +__kernel void cumsum_array_F32toU8_axis1_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ int axis,\n\ @@ -61228,334 +65304,757 @@ __kernel void cumsum_F32toU8_axis1_2D(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ + float sum = (float)(0);\n\ + uint dst = (uint)(0);\n\ int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ + dst = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0;\n\ -\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + 
Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w--;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w++;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + 
out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ -\n\ -__kernel void cumsum_F32toF32_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ +"; /* end of cumsum_array_2d_axis1_cl*/ + +static const char cumsum_array_axis0_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int chn,\n\ + int channel,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ \n\ + float sum = (float)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imagef(output, coord.zw, sum);\n\ - for(; coord.x > 0; coord.x--)\n\ + coord_out.x = width - 1;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z--;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.x--;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord.z = 0;\n\ - write_imagef(output, coord.zw, sum);\n\ + coord_out.x = 0;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z++;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.x++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = 
(__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ -\n\ -__kernel void cumsum_U8toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ +\n\ +#define CUMSUM_ARRAY_toU8_AXIS0_SH(name, src_type) \\\n\ +__kernel void cumsum_array_##name##toU8_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint dst = (uint)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global uint* out_ptr = (__global uint*)output_ptr; \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.x = width - 1; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.x--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.x = 0; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.x++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = 
dst; \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.x = 0; coord.x < width; coord.x++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_toU8_AXIS0_SH(U8,uint)\n\ +CUMSUM_ARRAY_toU8_AXIS0_SH(F32,float)\n\ +"; /* end of cumsum_array_axis0_cl*/ + +static const char cumsum_array_axis1_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int chn,\n\ + int channel,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0.0f;\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ \n\ + float sum = (float)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ + coord_out.y = height - 1;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.y--;\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + coord_out.y = 0;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.y++;\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + 
output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ \n\ -__kernel void cumsum_F32toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ +#define CUMSUM_ARRAY_toU8_AXIS1_SH(name, src_type) \\\n\ +__kernel void cumsum_array_##name##toU8_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global uint* out_ptr = (__global uint*)output_ptr; \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.y = height - 1; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + \\\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y--; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; 
\\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.y = 0; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_toU8_AXIS1_SH(U8,uint)\n\ +CUMSUM_ARRAY_toU8_AXIS1_SH(F32,float)\n\ +"; /* end of cumsum_array_axis1_cl*/ + +static const char cumsum_array_axis2_cl[] = "__kernel void cumsum_array_F32toF32_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int chn,\n\ + int channel,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ \n\ - float cnt = 0.0f;\n\ + float sum = 0;\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - 
coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ + coord_out.z = channel - 1;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ +\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.z--;\n\ sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + coord_out.z = 0;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.z++;\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ -"; /* end of cumsum_2d_cl*/ +\n\ +#define CUMSUM_ARRAY_toU8_AXIS2_SH(name, src_type) \\\n\ +__kernel void cumsum_array_##name##toU8_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + 
int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint dst = (uint)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0.0f; \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global uint* out_ptr = (__global uint*)output_ptr; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.z = channel - 1; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.z--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.z = 0; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.z++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst 
= (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_toU8_AXIS2_SH(U8,uint)\n\ +CUMSUM_ARRAY_toU8_AXIS2_SH(F32,float)\n\ +\n\ +"; /* end of cumsum_array_axis2_cl*/ static const char depth2space_crd_cl[] = "\n\ __kernel void depth2space_crd_F32toF32(\n\ @@ -80476,8 +84975,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local float local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local float local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ float left = read_imagef(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80509,7 +85008,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80536,13 +85035,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \\\n\ write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ -TOPK_F32(1 << 0, 0)\n\ -TOPK_F32(1 << 1, 1)\n\ -TOPK_F32(1 << 2, 2)\n\ -TOPK_F32(1 << 3, 3)\n\ -TOPK_F32(1 << 4, 4)\n\ -TOPK_F32(1 << 5, 5)\n\ -TOPK_F32(1 << 6, 6)\n\ +TOPK_F32((1 << 0), 0)\n\ +TOPK_F32((1 << 1), 1)\n\ +TOPK_F32((1 << 2), 2)\n\ +TOPK_F32((1 << 3), 3)\n\ +TOPK_F32((1 << 4), 4)\n\ +TOPK_F32((1 << 5), 5)\n\ +TOPK_F32((1 << 6), 6)\n\ +TOPK_F32((1 << 9), 9)\n\ \n\ #define TOPK_U32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \\\n\ @@ -80564,8 +85064,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local uint local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local uint local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ uint left = read_imageui(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80597,7 +85097,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \\\n\ uint right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80624,13 +85124,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \\\n\ write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ -TOPK_U32(1 << 0, 0)\n\ -TOPK_U32(1 << 1, 1)\n\ -TOPK_U32(1 << 2, 2)\n\ -TOPK_U32(1 << 3, 3)\n\ -TOPK_U32(1 << 4, 4)\n\ -TOPK_U32(1 << 5, 5)\n\ -TOPK_U32(1 << 6, 6)\n\ +TOPK_U32((1 << 0), 0)\n\ +TOPK_U32((1 << 1), 1)\n\ +TOPK_U32((1 << 2), 2)\n\ +TOPK_U32((1 << 3), 3)\n\ +TOPK_U32((1 << 4), 4)\n\ +TOPK_U32((1 << 5), 5)\n\ 
+TOPK_U32((1 << 6), 6)\n\ +TOPK_U32((1 << 9), 9)\n\ \n\ #define TOPK_I32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \\\n\ @@ -80652,8 +85153,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local int local_data[128]; \\\n\ - __local int local_indices[128]; \\\n\ + __local int local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local int local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ int left = read_imagei(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80685,7 +85186,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \\\n\ int right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80712,13 +85213,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \\\n\ write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ -TOPK_I32(1 << 0, 0)\n\ -TOPK_I32(1 << 1, 1)\n\ -TOPK_I32(1 << 2, 2)\n\ -TOPK_I32(1 << 3, 3)\n\ -TOPK_I32(1 << 4, 4)\n\ -TOPK_I32(1 << 5, 5)\n\ -TOPK_I32(1 << 6, 6)\n\ +TOPK_I32((1 << 0), 0)\n\ +TOPK_I32((1 << 1), 1)\n\ +TOPK_I32((1 << 2), 2)\n\ +TOPK_I32((1 << 3), 3)\n\ +TOPK_I32((1 << 4), 4)\n\ +TOPK_I32((1 << 5), 5)\n\ +TOPK_I32((1 << 6), 6)\n\ +TOPK_I32((1 << 9), 9)\n\ \n\ #define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \\\n\ @@ -80740,8 +85242,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local float local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local float local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ float left = read_imagef(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80773,7 +85275,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80800,13 +85302,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ \n\ -TOPK_F32toU32(1 << 0, 0)\n\ -TOPK_F32toU32(1 << 1, 1)\n\ -TOPK_F32toU32(1 << 2, 2)\n\ -TOPK_F32toU32(1 << 3, 3)\n\ -TOPK_F32toU32(1 << 4, 4)\n\ -TOPK_F32toU32(1 << 5, 5)\n\ -TOPK_F32toU32(1 << 6, 6)\n\ +TOPK_F32toU32((1 << 0), 0)\n\ +TOPK_F32toU32((1 << 1), 1)\n\ +TOPK_F32toU32((1 << 2), 2)\n\ +TOPK_F32toU32((1 << 3), 3)\n\ +TOPK_F32toU32((1 << 4), 4)\n\ +TOPK_F32toU32((1 << 5), 5)\n\ +TOPK_F32toU32((1 << 6), 6)\n\ +TOPK_F32toU32((1 << 9), 9)\n\ \n\ #define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void 
topk_stage##STAGES##_F32toI32_I32 \\\n\ @@ -80828,8 +85331,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local float local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local float local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ float left = read_imagef(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80861,7 +85364,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80888,13 +85391,384 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ \n\ -TOPK_F32toI32(1 << 0, 0)\n\ -TOPK_F32toI32(1 << 1, 1)\n\ -TOPK_F32toI32(1 << 2, 2)\n\ -TOPK_F32toI32(1 << 3, 3)\n\ -TOPK_F32toI32(1 << 4, 4)\n\ -TOPK_F32toI32(1 << 5, 5)\n\ -TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/ +TOPK_F32toI32((1 << 0), 0)\n\ +TOPK_F32toI32((1 << 1), 1)\n\ +TOPK_F32toI32((1 << 2), 2)\n\ +TOPK_F32toI32((1 << 3), 3)\n\ +TOPK_F32toI32((1 << 4), 4)\n\ +TOPK_F32toI32((1 << 5), 5)\n\ +TOPK_F32toI32((1 << 6), 6)\n\ +TOPK_F32toI32((1 << 9), 9)"; /* end of topk_cl*/ + +static const char topk2_cl[] = "\n\ +#define BITONIC_STEP(dtype) \\\n\ +void bitonic_step_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +BITONIC_STEP(int)\n\ +BITONIC_STEP(uint)\n\ +\n\ +#define BITONIC_STEP_ASCEND(dtype) \\\n\ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *p_share_k, __local int *p_share_v) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = 
p_share_v[left_id]; \\\n\ + int right_idx = p_share_v[right_id]; \\\n\ + \\\n\ + dtype left_elem = p_share_k[left_id]; \\\n\ + dtype right_elem = p_share_k[right_id]; \\\n\ + \\\n\ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\ + { \\\n\ + p_share_k[left_id] = right_elem; \\\n\ + p_share_k[right_id] = left_elem; \\\n\ + \\\n\ + p_share_v[left_id] = right_idx; \\\n\ + p_share_v[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +BITONIC_STEP_ASCEND(int)\n\ +BITONIC_STEP_ASCEND(uint)\n\ +\n\ +#define BITONIC_MERGE(dtype) \\\n\ +void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + uint stage = num_stages; \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ +}\n\ +BITONIC_MERGE(int)\n\ +BITONIC_MERGE(uint)\n\ +\n\ +#define BLOCK_SIZE (512)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const int init_k = -2147483647;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local int local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + int left = read_imagei(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + int right = read_imagei(input, coord.zy).x;\n\ +\n\ + local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + int min_data = local_data[511];\n\ +\n\ + int *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + int2 
data;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + int2 data;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + int4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imagei(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const uint init_k = 0;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local uint local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + uint left = read_imageui(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + uint right = read_imageui(input, coord.zy).x;\n\ +\n\ + local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + uint min_data = local_data[511];\n\ +\n\ + uint *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + uint2 data;\n\ + coord.z = 
coord.x + threads_per_block;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + uint2 data;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + uint4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imageui(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +"; /* end of topk2_cl*/ static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ @@ -81702,6 +86576,14 @@ static const source_map_t evis_resource[] = {"crop_and_resize_nearest_neighbor_vx", crop_and_resize_nearest_neighbor_vx}, {"cumsum_vx", cumsum_vx}, {"cumsum_2d_vx", cumsum_2d_vx}, + {"cumsum_array_vx", cumsum_array_vx}, + {"cumsum_array_2d_vx", cumsum_array_2d_vx}, + {"cumsum_array_bf16_vx", cumsum_array_bf16_vx}, + {"cumsum_array_ex_rev_axis0_vx", cumsum_array_ex_rev_axis0_vx}, + {"cumsum_array_ex_rev_axis1_vx", cumsum_array_ex_rev_axis1_vx}, + {"cumsum_array_ex_rev_axis2_vx", cumsum_array_ex_rev_axis2_vx}, + {"cumsum_array_f16_u8_vx", cumsum_array_f16_u8_vx}, + {"cumsum_array_f16_u8_2d_vx", cumsum_array_f16_u8_2d_vx}, {"cumsum_bf16_vx", cumsum_bf16_vx}, {"cumsum_ex_rev_axis0_vx", cumsum_ex_rev_axis0_vx}, {"cumsum_ex_rev_axis1_vx", cumsum_ex_rev_axis1_vx}, @@ -81986,10 +86868,16 @@ static const source_map_t cl_resource[] = {"clip_F32_cl", clip_F32_cl}, {"clip_I32_cl", clip_I32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"col2im_cl", col2im_cl}, {"crop_and_resize_bilinear_cl", crop_and_resize_bilinear_cl}, {"crop_and_resize_nearest_neighbor_cl", crop_and_resize_nearest_neighbor_cl}, {"cumsum_cl", cumsum_cl}, {"cumsum_2d_cl", cumsum_2d_cl}, + {"cumsum_array_2d_axis0_cl", cumsum_array_2d_axis0_cl}, + {"cumsum_array_2d_axis1_cl", cumsum_array_2d_axis1_cl}, + {"cumsum_array_axis0_cl", cumsum_array_axis0_cl}, + {"cumsum_array_axis1_cl", cumsum_array_axis1_cl}, + {"cumsum_array_axis2_cl", cumsum_array_axis2_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, 
{"eltwise_unary_0_cl", eltwise_unary_0_cl}, @@ -82114,6 +87002,7 @@ static const source_map_t cl_resource[] = {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, + {"topk2_cl", topk2_cl}, {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl}, {"upsample_cl", upsample_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index dec079cb..6415ac0c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -62,11 +62,20 @@ static vsi_status _argmaxmin_op_compute } status = VSI_FAILURE; - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); if (strcmp(kernel_name, "argmax") == 0) { vsi_nn_argmax_param * p = &(self->nn_param.argmax); axis = p->axis; +#if (VX_ARGMAX_VX_SUPPORT) + vsi_nn_kernel_param_add_int32(param, "axis", axis); + self->n = (vx_node)vsi_nn_kernel_selector(self->graph, + kernel_name, + inputs, 1, + outputs, 1, param); + goto final; +#endif + } else { @@ -101,6 +110,10 @@ static vsi_status _argmaxmin_op_compute vsi_nn_ReleaseTensor( &reshape_tensors[0] ); vsi_nn_ReleaseTensor( &reshape_tensors[1] ); } + +#if (VX_ARGMAX_VX_SUPPORT) +final: +#endif if( self->n ) { status = VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c new file mode 100644 index 00000000..c47bd279 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c @@ -0,0 +1,153 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" + +typedef struct _bitcast_local_data_t { + int32_t placeholder; +} bitcast_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t n = NULL; + + n = vsi_nn_kernel_selector( self->graph, "bitcast", inputs, 1, outputs, 1, NULL ); + if (n != NULL) + { + status = VSI_SUCCESS; + } + self->n = (vx_node)n; + + return status; +} /* op_compute() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t i = 0; + + VSI_UNREFERENCED(self); + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + uint32_t input_byte = 0; + uint32_t output_byte = 0; + uint32_t in_dim = inputs[0]->attr.dim_num; + input_byte = vsi_nn_TypeGetBytesExt(inputs[0]->attr.dtype.vx_type); + output_byte = vsi_nn_TypeGetBytesExt(outputs[0]->attr.dtype.vx_type); + + if (input_byte == output_byte) + { + outputs[0]->attr.dim_num = in_dim; + for (i = 0; i < (int32_t)(in_dim); i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + else if (input_byte > output_byte) + { + outputs[0]->attr.dim_num = in_dim + 1; + outputs[0]->attr.size[0] = input_byte / output_byte; + for (i = 1;i < (int32_t)(outputs[0]->attr.dim_num); i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i - 1]; + } + } + else + { + if ((uint32_t)(inputs[0]->attr.size[in_dim - 1]) != output_byte / input_byte) + { + VSILOGE("If input datatype is smaller than output datatype, bitcast op requires that \ + the rightmost dimension be equal to sizeof(output datatype) / sizeof(input datatype)"); + return FALSE; + } + outputs[0]->attr.dim_num = in_dim - 1; + if (outputs[0]->attr.dim_num == 0) + { + outputs[0]->attr.size[0] = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } + else + { + for (i = 0; i < (int32_t)(outputs[0]->attr.dim_num); i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + } + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BITCAST, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c b/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c new file mode 100644 index 00000000..d82f349c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c @@ -0,0 +1,258 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _col2im_local_data_t { + int32_t placeholder; +} col2im_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "stride_w", self->nn_param.col2im.strides[0] ); + vsi_nn_kernel_param_add_int32( param, "stride_h", self->nn_param.col2im.strides[1] ); + vsi_nn_kernel_param_add_int32( param, "stride_d", self->nn_param.col2im.strides[2] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_front", self->nn_param.col2im.pads[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_end", self->nn_param.col2im.pads[1] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_front", self->nn_param.col2im.pads[2] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_end", self->nn_param.col2im.pads[3] ); + vsi_nn_kernel_param_add_int32( param, "pad_d_front", self->nn_param.col2im.pads[4] ); + vsi_nn_kernel_param_add_int32( param, "pad_d_end", self->nn_param.col2im.pads[5] ); + vsi_nn_kernel_param_add_int32( param, "dilation_w", self->nn_param.col2im.dilations[0] ); + vsi_nn_kernel_param_add_int32( param, "dilation_h", self->nn_param.col2im.dilations[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation_d", self->nn_param.col2im.dilations[2] ); + vsi_nn_kernel_param_add_buffer( param, "block_shape", (void*)self->nn_param.col2im.block_shape, \ + self->nn_param.col2im.dim_num ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "col2im", + inputs, 1, outputs, 1, param ); + + if (self->n) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(COL2IM, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_U32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_I32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_F16) + IO_TYPE(D_U32, D_F32) + IO_TYPE(D_U32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_I16, D_I8|Q_DFP) + IO_TYPE(D_I16, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_I32) + 
IO_TYPE(D_I16, D_U32) + IO_TYPE(D_I16, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP) + IO_TYPE(D_I8, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I32) + IO_TYPE(D_I8, D_U32) + IO_TYPE(D_I8, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I32) + IO_TYPE(D_U8, D_U32) + IO_TYPE(D_U8, D_F32) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_SYM) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_SYM) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(COL2IM) + if (!VALIDATE_OP_IO_TYPES(COL2IM, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_col2im_param *p = NULL; + p = (vsi_nn_col2im_param* )&(self->nn_param.col2im); + int32_t i = 0; + vsi_size_t block_size = 1; + vsi_size_t channel = 1; + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = p->dim_num + 2; + for (i = 0; i < p->dim_num; i++) + { + outputs[0]->attr.size[i] = (vsi_size_t)p->image_shape[i]; + block_size = block_size * (vsi_size_t)p->block_shape[i]; + } + channel = inputs[0]->attr.size[1] / block_size; + outputs[0]->attr.size[i + 1] = channel; + outputs[0]->attr.size[i + 2] = inputs[0]->attr.size[0]; + + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + self->nn_param.col2im.pads[0] = 0; + self->nn_param.col2im.pads[1] = 0; + self->nn_param.col2im.pads[2] = 0; + self->nn_param.col2im.pads[3] = 0; + self->nn_param.col2im.pads[4] = 0; + self->nn_param.col2im.pads[5] = 0; + self->nn_param.col2im.strides[0] = 1; + self->nn_param.col2im.strides[1] = 1; + self->nn_param.col2im.strides[2] = 1; + self->nn_param.col2im.dilations[0] = 1; + self->nn_param.col2im.dilations[1] = 1; + self->nn_param.col2im.dilations[2] = 1; + + return VSI_SUCCESS; +} + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ COL2IM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* 
input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index ecb16406..26d25664 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -28,6 +28,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_node.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" @@ -278,7 +279,7 @@ static vsi_status op_compute if(_is_tensorview_support(self, outputs) && _is_same_quant(self, inputs, outputs) && (_has_norm_input(self, inputs) == FALSE) - && self->graph->ctx->options.enable_concat_optimize) + && ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize) { iter = self->nn_param.concat.lcl_data; while( NULL != iter ) @@ -443,7 +444,7 @@ static vsi_status op_optimize if (_is_tensorview_support(self, outputs) == FALSE || _is_same_quant(self, inputs, outputs) == FALSE || _has_norm_input(self, inputs) == TRUE || - self->graph->ctx->options.enable_concat_optimize == 0) + ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize == 0) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index bfeeab29..11f0268a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -23,6 +23,7 @@ *****************************************************************************/ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_graph.h" @@ -95,7 +96,7 @@ static vsi_status op_optimize status = VSI_SUCCESS; - if( !self->graph->ctx->options.enable_dataconvert_optimize ) + if( !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_dataconvert_optimize ) { return status; } @@ -266,14 +267,14 @@ static vsi_bool op_check IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_F16) IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_I32, D_F32) - IO_TYPE(D_I32, D_F16) - IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I8|Q_DFP) - IO_TYPE(D_I32, D_U32) - IO_TYPE(D_I32, D_U16) - IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I32|Q_ASYM, D_U32|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U32) IO_TYPE(D_U32, D_I16|Q_DFP) IO_TYPE(D_U32, D_I8|Q_DFP) @@ -281,7 +282,7 @@ static vsi_bool op_check IO_TYPE(D_U32, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U8) IO_TYPE(D_BF16, D_I32) - IO_TYPE(D_I32, D_BF16) + IO_TYPE(D_I32|Q_ASYM, D_BF16) IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U4|Q_SYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index 44e051e9..a768b467 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -183,10 +183,16 @@ vsi_bool vsi_nn_op_eltwise_setup shape[i] = sz0; } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); + if (out_rank == 1 && + vsi_nn_GetTensorIsScalar(inputs[0]) && 
+ vsi_nn_GetTensorIsScalar(inputs[1])) + { + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } } else { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c index d035ddae..404588b9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c @@ -54,10 +54,12 @@ static vsi_status op_compute vsi_nn_kernel_param_t* param = NULL; int32_t align_corners = self->nn_param.gridsample.align_corners; int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode; + int32_t mode = (int32_t)self->nn_param.gridsample.mode; vsi_nn_kernel_node_t n; char kernel_name[128]; param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "mode", mode); vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c new file mode 100644 index 00000000..8ac872c4 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c @@ -0,0 +1,412 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + + +/* + Declare number of input and output. 
+ */ +#define _ARG_NUM (1) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define LOCAL() ((vsi_nn_grouped_conv3d_param_local_data *)nn_param->local) + +typedef struct _vsi_nn_grouped_conv3d_param_local_data { + vsi_nn_tensor_t ** input_tensor_group; + vsi_nn_tensor_t ** weight_tensor_group; + vsi_nn_tensor_t ** bias_tensor_group; + vsi_nn_tensor_t ** output_tensor_group; +} vsi_nn_grouped_conv3d_param_local_data; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ +#if VX_CONV_3D_API_SUPPORT +#define _TENSOR_LEN 64 + vsi_bool res; + uint32_t i; + char tensor_name[_TENSOR_LEN]; + vsi_nn_grouped_conv3d_param *nn_param = &self->nn_param.grouped_conv3d; + nn_param->local = (vsi_nn_grouped_conv3d_param_local_data*)malloc( + sizeof(vsi_nn_grouped_conv3d_param_local_data)); + if (NULL == nn_param->local) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(nn_param->local, 0, sizeof(vsi_nn_grouped_conv3d_param_local_data)); + LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->input_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->input_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 3, + LOCAL()->input_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->weight_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->weight_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 4, + LOCAL()->weight_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->bias_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->bias_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (inputs[2] != NULL) + { + res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0, + LOCAL()->bias_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + } + + LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->output_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->output_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 3, + LOCAL()->output_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, 
__LINE__); + return VSI_FAILURE; + } + + for (i = 0; i < nn_param->group; i++) + { + vx_tensor bias; + vx_nn_convolution_3d_params_t *param = NULL; + vx_nn_convolution_3d_params_t param_; + memset( ¶m_, 0, sizeof( vx_nn_convolution_3d_params_t ) ); + param = ¶m_; + param->padding_w_left = self->nn_param.grouped_conv3d.pad[0]; + param->padding_w_right = self->nn_param.grouped_conv3d.pad[1]; + param->padding_h_top = self->nn_param.grouped_conv3d.pad[2]; + param->padding_h_bottom = self->nn_param.grouped_conv3d.pad[3]; + param->padding_d_front = self->nn_param.grouped_conv3d.pad[4]; + param->padding_d_rear = self->nn_param.grouped_conv3d.pad[5]; + + param->stride_w = self->nn_param.grouped_conv3d.stride[0]; + param->stride_h = self->nn_param.grouped_conv3d.stride[1]; + param->stride_d = self->nn_param.grouped_conv3d.stride[2]; + + if (self->nn_param.grouped_conv3d.dilation[0] * + self->nn_param.grouped_conv3d.dilation[1] * + self->nn_param.grouped_conv3d.dilation[2] > 1) + { + VSILOGE("conv3d could not support dilation > 1\n"); + return VSI_FAILURE; + } + if ( self->nn_param.grouped_conv3d.dilation[0] > 0 ) + { + param->dilation_w = self->nn_param.grouped_conv3d.dilation[0] - 1; + } + if ( self->nn_param.grouped_conv3d.dilation[1] > 0 ) + { + param->dilation_h = self->nn_param.grouped_conv3d.dilation[1] - 1; + } + if ( self->nn_param.grouped_conv3d.dilation[2] > 0 ) + { + param->dilation_d = self->nn_param.grouped_conv3d.dilation[2] - 1; + } + param->pad_mode = vsi_nn_get_vx_pad_mode(nn_param->pad_mode); + param->depth_multiplier = self->nn_param.grouped_conv3d.multiplier; + param->overflow_policy = self->vx_param.overflow_policy; + param->rounding_policy = self->vx_param.rounding_policy; + param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + + if ( inputs[2] == NULL ) + { + bias = NULL; + } + else + { + bias = LOCAL()->bias_tensor_group[i]->t; + } + + self->n = vxConv3dLayer( + self->graph->g, + LOCAL()->input_tensor_group[i]->t, + LOCAL()->weight_tensor_group[i]->t, + bias, + (vx_nn_convolution_3d_params_t* )param, + sizeof( vx_nn_convolution_3d_params_t), + LOCAL()->output_tensor_group[i]->t + ); + + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, i); + if (vxSetReferenceName((vx_reference)LOCAL()->output_tensor_group[i]->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return VSI_FAILURE; + } + if ( NULL == self->n ) + { + VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + else + { + // no need to maintain self->n + vxReleaseNode( &self->n ); + self->n = NULL; + } + } +#else + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); +#endif + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV3D, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + vsi_nn_grouped_conv3d_param *nn_param; + vsi_size_t perm[] = { 3, 2, 0, 1 }; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + if ( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt && + VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type ) + { + vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL ); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } + + nn_param = &self->nn_param.grouped_conv3d; + { + vsi_size_t i, pad[_cnt_of_array(nn_param->pad)] = {0}; + for (i = 0; i < _cnt_of_array(nn_param->pad); i++) + { + pad[i] = self->nn_param.grouped_conv3d.pad[i]; + } + vsi_nn_compute_padding_3d( + inputs[0]->attr.size, + inputs[1]->attr.size, + nn_param->stride, + nn_param->dilation, + nn_param->pad_type, + pad + ); + for (i = 0; i < _cnt_of_array(nn_param->pad); i++) + { + self->nn_param.grouped_conv3d.pad[i] = (uint32_t)pad[i]; + } + } + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + &nn_param->pad[0], + nn_param->stride[0], + nn_param->dilation[0], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + inputs[1]->attr.size[1], + &nn_param->pad[2], + nn_param->stride[1], + nn_param->dilation[1], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[2], + inputs[1]->attr.size[2], + &nn_param->pad[4], + nn_param->stride[2], + nn_param->dilation[2], + VSI_NN_ROUND_FLOOR + ); + if (self->nn_param.grouped_conv3d.weights > 0) + { + outputs[0]->attr.size[3] = self->nn_param.grouped_conv3d.weights; + } + else if (self->nn_param.grouped_conv3d.multiplier > 0) + { + outputs[0]->attr.size[3] = inputs[0]->attr.size[3] * self->nn_param.grouped_conv3d.multiplier; + } + else + { + outputs[0]->attr.size[3] = inputs[1]->attr.size[4]; + } + outputs[0]->attr.size[4] = inputs[0]->attr.size[4]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_nn_grouped_conv3d_param *nn_param = &(self->nn_param.grouped_conv3d); + uint32_t i; + if (LOCAL()) + { + if (LOCAL()->input_tensor_group) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->input_tensor_group[i])); + } + free(LOCAL()->input_tensor_group); + } + if (LOCAL()->weight_tensor_group) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->weight_tensor_group[i])); + } + free(LOCAL()->weight_tensor_group); + } + if (LOCAL()->bias_tensor_group != NULL) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->bias_tensor_group[i])); + } + free(LOCAL()->bias_tensor_group); + } + if (LOCAL()->output_tensor_group != NULL) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->output_tensor_group[i])); + } + free(LOCAL()->output_tensor_group); + } + + free(LOCAL()); + } + vsi_nn_op_common_deinit(self); + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GROUPED_CONV3D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c new file mode 100644 index 00000000..5dbe4a40 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c @@ -0,0 +1,206 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" + +typedef struct _l1_layer_norm_local_data_t { + int32_t placeholder; +} l1_layer_norm_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.l1_layer_norm.eps; + int32_t axis = self->nn_param.l1_layer_norm.axis; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); + n = vsi_nn_kernel_selector( self->graph, "l1_layer_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) + { + BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F16) + END_IO_TYPE_DECL(L1_LAYER_NORM) + 
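
Note on the table just closed: each IO_TYPE row is one permitted dtype signature for L1_LAYER_NORM's four inputs and single output, and the VALIDATE_OP_IO_TYPES call that follows checks the node's actual tensors against that allow-list (the macros appear to come from utils/vsi_nn_constraint_check.h, which the file includes). The sketch below only illustrates the allow-list idea; the ex_* names and dtype codes are invented for the example and are not the ovxlib macro expansion.

/* Illustrative sketch only -- not the real BEGIN_IO_TYPE_DECL/IO_TYPE/
 * END_IO_TYPE_DECL expansion. It shows the concept: a static table of
 * allowed dtype signatures and a linear-scan validation over it. */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

enum ex_dtype { EX_F32, EX_F16, EX_BF16, EX_U8_ASYM, EX_I8_SYM };

typedef struct { enum ex_dtype io[5]; } ex_io_row; /* in0, in1, in2, in3, out0 */

static const ex_io_row ex_l1_layer_norm_table[] = {
    { { EX_F32,     EX_F32, EX_F32, EX_F32, EX_F32     } },
    { { EX_F16,     EX_F32, EX_F16, EX_F32, EX_F16     } },
    { { EX_U8_ASYM, EX_F32, EX_F16, EX_F32, EX_U8_ASYM } },
};

static bool ex_validate_io(const enum ex_dtype actual[5])
{
    size_t i;
    for (i = 0; i < sizeof(ex_l1_layer_norm_table) / sizeof(ex_l1_layer_norm_table[0]); i++)
    {
        /* A node passes op_check only if its exact dtype signature is listed. */
        if (memcmp(ex_l1_layer_norm_table[i].io, actual, sizeof(ex_l1_layer_norm_table[i].io)) == 0)
        {
            return true;
        }
    }
    return false; /* op_check would log the unsupported combination and fail */
}
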
if (!VALIDATE_OP_IO_TYPES(L1_LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t i = 0; + VSI_UNREFERENCED(self); + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.l1_layer_norm.axis = 0; + + return status; +} /* op_init() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ L1_LAYER_NORM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ NULL, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index 682628c2..a9739381 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -161,7 +162,7 @@ static vsi_bool op_setup if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP) { - enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc; + enable_rgb88_planar_nhwc = ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_rgb88_planar_nhwc; } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 2051c453..4c314b85 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -183,7 +183,8 @@ static vsi_bool _check_is_sp_supported_type return FALSE; } - if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) || + if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2 || + (axes[0] == 1 && (input->attr.size[0] == 1 || input->attr.size[2] == 1)))) || (axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) ) { return TRUE; @@ -1167,6 +1168,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = 1; outputs[0]->attr.size[0] = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); } else { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c index 84387d7f..8d1610f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c @@ -93,52 +93,32 @@ static vsi_bool op_check if (!ret) { BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) - 
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_BF16, D_F32, D_BF16) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_BF16, D_F32, D_BF16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) END_IO_TYPE_DECL(RMS_NORM) if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index 95dc76ab..5d5768fc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" @@ -776,7 +777,7 @@ static vsi_status op_optimize /* Only forward run stride_slice's optimize */ if ( direction == VSI_NN_OPTIMIZE_BACKWARD || - !self->graph->ctx->options.enable_slice_optimize ) + !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_slice_optimize ) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c index 6291e5c0..e93fe454 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -78,9 +78,10 @@ static vsi_status _tile_op_compute vsi_size_t new_rank = 0; vsi_bool ret = FALSE; uint32_t i = 0; - vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples; + int32_t* multiples_ = (int32_t*)self->nn_param.tile.multiples; vsi_nn_tensor_t* temp_tensors[3] = { NULL }; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = {1}; int32_t multiples_value[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_attr_t attr; @@ -101,6 +102,11 @@ static vsi_status _tile_op_compute temp_tensors[2] = outputs[0]; } + for (i = 0; i < inputs[0]->attr.dim_num; i ++) + { + multiples[i] = (vsi_size_t)multiples_[i]; + } + ret = vsi_nn_kernel_optimize_tile_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, multiples, inputs[0]->attr.dim_num, @@ -111,6 +117,7 @@ static vsi_status _tile_op_compute { if (_is_supported_axis(shapes[1], new_rank) == FALSE) { + uint32_t _multiples = (uint32_t)(new_rank > 4 && shapes[1][4] > 1 ? 3 : 2); reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ shapes[0], (vsi_size_t)new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, temp_tensors[2],\ @@ -125,8 +132,11 @@ static vsi_status _tile_op_compute memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr)); attr.is_const = FALSE; attr.vtl = TRUE; - attr.size[0] = reshape_tensors[2]->attr.size[0]; - attr.size[1] = reshape_tensors[2]->attr.size[1]; + + for (i = 0; i < _multiples; i++) + { + attr.size[i] = reshape_tensors[2]->attr.size[i]; + } temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr ); memset( &attr, 0 , sizeof(vsi_nn_tensor_attr_t) ); @@ -136,9 +146,11 @@ static vsi_status _tile_op_compute attr.size[0] = new_rank; attr.dim_num = 1; - multiples_value[0] = (int32_t)shapes[1][0]; - multiples_value[1] = (int32_t)shapes[1][1]; - for (i = 0; i < new_rank; i++) + for (i = 0; i < _multiples; i++) + { + multiples_value[i] = (int32_t)shapes[1][i]; + } + for (i = _multiples; i < new_rank; i++) { multiples_value[i] = 1; } @@ -150,9 +162,11 @@ static vsi_status _tile_op_compute goto final; } - multiples_value[0] = 1; - multiples_value[1] = 1; - for (i = 0; i < new_rank; i++) + for (i = 0; i < _multiples; i++) + { + multiples_value[i] = 1; + } + for (i = _multiples; i < new_rank; i++) { multiples_value[i] = (int32_t)shapes[1][i]; } @@ -257,6 +271,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_U8|Q_ASYM) IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_BOOL8) END_IO_TYPE_DECL(TILE) if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 4b2aa7ae..feaa0fcf 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -471,6 +471,10 @@ static _op_param_gen_t s_op_gen[] = /* TAN */ NULL, /* RMSNORM */ NULL, /* SHAPE */ NULL, + /* BITCAST */ NULL, + /* GROUPED_CONV3D */ NULL, + /* COL2IM */ NULL, + /* L1_LAYER_NORM */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index e1d9b819..3a40e106 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -772,6 
+772,7 @@ vsi_bool vsi_nn_CreateTensorGroup end[1] = in_tensor->attr.size[1]; end[2] = in_tensor->attr.size[2]; end[3] = in_tensor->attr.size[3]; + end[4] = in_tensor->attr.size[4]; end[axis] = 0; for( i = 0; i < group_number; i ++ ) { @@ -1259,6 +1260,32 @@ vsi_bool vsi_nn_is_same_quant_type( } break; } +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: { + const float diff = (float)1e-5; + int32_t i = 0; + int32_t scale_cnt0 = src_dtype->group_count; + int32_t scale_cnt1 = dst_dtype->group_count; + int32_t group_size0 = src_dtype->group_size; + int32_t group_size1 = dst_dtype->group_size; + if (scale_cnt0 == scale_cnt1 && group_size0 == group_size1) + { + const float* src_scale_ptr = src_dtype->group_scales; + const float* dst_scale_ptr = dst_dtype->group_scales; + for (i = 0; i < scale_cnt0; i++) + { + if (vsi_nn_float_compare( + src_scale_ptr[i], dst_scale_ptr[i], diff) == FALSE) + { + return FALSE; + } + } + } else { + return FALSE; + } + break; + } +#endif default: break; } diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 7c7ed61d..4fd9be74 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -22,10 +22,10 @@ * *****************************************************************************/ #include -#include "vsi_nn_types.h" #include "vsi_nn_test.h" #include "vsi_nn_context.h" #include "vsi_nn_platform.h" +#include "vsi_nn_types.h" static vsi_status query_hardware_caps ( @@ -103,6 +103,9 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "vendor.VSI_VX_ENABLE_STREAM_PR static const char* ENV_FORCE_RGB888_OUT_NHWC = "vendor.VSI_NN_FORCE_RGB888_OUT_NHWC"; static const char* ENV_ENABLE_SLICE_OPTIMIZE = "vendor.VSI_NN_ENABLE_SLICE_OPTIMIZE"; static const char* ENV_ENABLE_BATCH_OPT = "vendor.VSI_VX_ENABLE_BATCH_OPT"; +static const char* ENV_SAVE_FILE_TYPE = "vendor.VSI_SAVE_FILE_TYPE"; +static const char* VSI_USE_IMAGE_PROCESS = "vendor.VSI_USE_IMAGE_PROCESS"; +static const char* VSI_USE_FROM_HANDLE = "vendor.VSI_USE_FROM_HANDLE"; #else static const char* ENV_ENABLE_SHADER = "VIV_VX_ENABLE_SHADER"; static const char* ENV_ENABLE_OPCHECK = "VSI_NN_ENABLE_OPCHECK"; @@ -113,8 +116,11 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "VSI_VX_ENABLE_STREAM_PROCESSOR static const char* ENV_FORCE_RGB888_OUT_NHWC = "VSI_NN_FORCE_RGB888_OUT_NHWC"; static const char* ENV_ENABLE_SLICE_OPTIMIZE = "VSI_NN_ENABLE_SLICE_OPTIMIZE"; static const char* ENV_ENABLE_BATCH_OPT = "VSI_VX_ENABLE_BATCH_OPT"; +static const char* ENV_SAVE_FILE_TYPE = "VSI_SAVE_FILE_TYPE"; +static const char* VSI_USE_IMAGE_PROCESS = "VSI_USE_IMAGE_PROCESS"; +static const char* VSI_USE_FROM_HANDLE = "VSI_USE_FROM_HANDLE"; #endif -static vsi_status vsi_nn_initOptions +vsi_status vsi_nn_initOptions ( vsi_nn_runtime_option_t *options ) @@ -129,7 +135,7 @@ static vsi_status vsi_nn_initOptions default_value = 1; #endif options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value); - options->enable_asymi8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); + options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1); options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1); options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0); @@ -140,6 +146,9 @@ static vsi_status vsi_nn_initOptions 
#endif options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value); options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0); + options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0); + options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1); + options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1); return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 3242621b..85cad885 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -1354,20 +1354,26 @@ vsi_nn_graph_t * vsi_nn_CreateGraph graph->node_num = 0; graph->ctx = ctx; graph->rnn_wksp = NULL; + ((vsi_nn_graph_prv_t*) graph)->options = + (vsi_nn_runtime_option_t *)malloc( sizeof( vsi_nn_runtime_option_t )); + CHECK_PTR_FAIL_GOTO(((vsi_nn_graph_prv_t*) graph)->options, "Create graph options fail.", error); graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); graph->isAllowFastMode = TRUE; vsi_nn_MapInit( graph->node_table ); vsi_nn_MapInit( graph->tensor_table ); + vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options ); } else { VSILOGE( "Create vx graph fail." ); - free( graph ); + free(graph); graph = NULL; } } + return graph; +error: return graph; } /* vsi_nn_CreateGraph() */ @@ -1429,6 +1435,10 @@ void vsi_nn_ReleaseGraph free( tmp ); } } + if (NULL != ((vsi_nn_graph_prv_t*)ptr)->options) + { + free(((vsi_nn_graph_prv_t*)ptr)->options); + } free( ptr ); *graph = NULL; } @@ -1500,7 +1510,7 @@ vsi_status vsi_nn_SetupGraph } #if VX_GRAPH_BATCH_OPT_SUPPORT - if (graph->ctx->options.enable_batch_opt) + if (((vsi_nn_graph_prv_t*)graph)->options->enable_batch_opt) { /*processing batch splitting*/ status = batchInference_graph(graph, nodes_list); @@ -2064,7 +2074,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode const char * kernel_name ) { - vsi_nn_node_t * node; + vsi_nn_node_prv_t* node; vsi_nn_node_id_t id; vsi_nn_op_proc_t * node_proc; @@ -2076,16 +2086,17 @@ vsi_nn_node_t * vsi_nn_AddExternalNode { return NULL; } - node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); + node = (vsi_nn_node_prv_t*)malloc(sizeof(vsi_nn_node_prv_t)); if( NULL != node ) { - memset( node, 0, sizeof( vsi_nn_node_t ) ); - node->graph = graph; - node->op = op; - node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; - node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + memset(node, 0, sizeof(vsi_nn_node_prv_t)); + node->pon.graph = graph; + node->pon.op = op; + node->pon.vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + node->pon.vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; + node->pon.vx_param.down_scale_size_rounding = + VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; /* init op */ if(node_proc->init != NULL){ @@ -2093,31 +2104,31 @@ vsi_nn_node_t * vsi_nn_AddExternalNode } /* init output struct */ - node->output.num = node_proc->output_num; - node->output.tensors = (vsi_nn_tensor_id_t *) malloc( + node->pon.output.num = node_proc->output_num; + node->pon.output.tensors = (vsi_nn_tensor_id_t*)malloc( node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) ); - if ( NULL == node->output.tensors ) + if (NULL == node->pon.output.tensors) { VSILOGE("Create output tensor id %s. 
fail", vsi_nn_OpGetName(op)); vsi_nn_safe_free(node); return NULL; } - vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num ); + vsi_nn_InitTensorsId(node->pon.output.tensors, node_proc->output_num); /* init input struct */ - node->input.num = node_proc->input_num; - node->input.tensors = (vsi_nn_tensor_id_t *) malloc( + node->pon.input.num = node_proc->input_num; + node->pon.input.tensors = (vsi_nn_tensor_id_t*)malloc( node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) ); - if ( NULL == node->input.tensors ) + if (NULL == node->pon.input.tensors) { VSILOGE("Create input tensor id %s. fail", vsi_nn_OpGetName(op)); - vsi_nn_safe_free(node->output.tensors); + vsi_nn_safe_free(node->pon.output.tensors); vsi_nn_safe_free(node); return NULL; } - vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num ); - node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; - node->attr.enable_op_constraint_check = TRUE; + vsi_nn_InitTensorsId(node->pon.input.tensors, node_proc->input_num); + node->pon.attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; + node->pon.attr.enable_op_constraint_check = TRUE; } id = graph->cur_nid; if(NULL != node){ @@ -2126,7 +2137,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode graph->cur_nid ++; } vsi_nn_OpRegisterExternalOvxInit(op, kernel_name, node_proc); - return node; + return (vsi_nn_node_t*)node; } /* vsi_nn_AddExternalNode() */ void vsi_nn_RemoveNode @@ -3354,24 +3365,245 @@ vsi_status vsi_nn_ExecuteGraphLoop return status; } /* vsi_nn_ExecuteGraphLoop() */ +typedef enum { + VSI_NN_ENABLE_I8TOU8 = 0, + VSI_NN_ENABLE_OPCHECK, + VSI_SAVE_FILE_TYPE, + VSI_USE_IMAGE_PROCESS, + VSI_NN_LOG_LEVEL, + VSI_NN_ENABLE_CONCAT_OPTIMIZE, + VSI_NN_ENABLE_DATACONVERT_OPTIMIZE, + VSI_VX_ENABLE_STREAM_PROCESSOR, + VSI_NN_FORCE_RGB888_OUT_NHWC, + VSI_NN_ENABLE_SLICE_OPTIMIZE, + VSI_VX_ENABLE_BATCH_OPT, + VIV_VX_ENABLE_SHADER, + VSI_USE_FROM_HANDLE, + VIV_VX_ENABLE_GRAPH_TRANSFORM +} VSI_PUBLIC_TYPE vsi_nn_runtime_variable; + +typedef struct { + const char* key; + int32_t value; +} VSI_PUBLIC_TYPE keyValuePair; + +char* vsi_nn_GetRunTimeVariable + ( + const vsi_nn_graph_t* graph, + const char* key + ) +{ + int32_t isVaid = 1; + int32_t value = -1; +#define varSize 256 + char* value_str = (char*)malloc(sizeof(char) * varSize); + CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final); + memset(value_str, 0, varSize); + char tmp_value[varSize] = {0}; + VSI_UNREFERENCED(tmp_value); + vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options; + switch (vsi_nn_GetVariable(key)) + { + case VIV_VX_ENABLE_SHADER: + value =options->enable_shader; + break; + case VSI_NN_ENABLE_OPCHECK: + value = options->enable_opcheck; + break; + case VSI_NN_ENABLE_I8TOU8: + value = options->enable_i8_to_u8; + break; + case VSI_VX_ENABLE_STREAM_PROCESSOR: + value = options->enable_stream_processor; + break; + case VSI_VX_ENABLE_BATCH_OPT: + value = options->enable_batch_opt; + break; + case VSI_NN_FORCE_RGB888_OUT_NHWC: + value = options->enable_rgb88_planar_nhwc; + break; + case VSI_SAVE_FILE_TYPE: + value = options->enable_save_file_type; + break; + case VSI_NN_ENABLE_CONCAT_OPTIMIZE: + value = options->enable_concat_optimize; + break; + case VSI_NN_ENABLE_SLICE_OPTIMIZE: + value = options->enable_slice_optimize; + break; + case VSI_USE_IMAGE_PROCESS: + if (options->enable_use_image_process != -1) + { + value = options->enable_use_image_process; + } + else + { + isVaid = 0; + } + break; + case VSI_USE_FROM_HANDLE: + if (options->enable_use_from_handle != 
-1) + { + value = options->enable_use_from_handle; + } + else + { + isVaid = 0; + } + break; + default: + isVaid = 0; + VSILOGE("Not support this key: %s.", key); + } + if (isVaid == 1) + { + snprintf(tmp_value, varSize, "%d", value); + memcpy(value_str, tmp_value, varSize); + } else + { + goto final; + } +#undef varSize + return value_str; +final: +#undef varSize + vsi_nn_safe_free(value_str); + return value_str; +} -vsi_status vsi_nn_SetGraphTransformOption +vsi_status vsi_nn_SetRunTimeVariable ( vsi_nn_graph_t* graph, - const char* ctrl_str, - size_t size - ) + const char* key, + const char* value + ) { - vsi_status status = VSI_FAILURE; - VSI_UNREFERENCED(graph); - VSI_UNREFERENCED(ctrl_str); + vsi_status status = VSI_SUCCESS; + size_t size = 1; // placeholder, not used in vxSetGraphAttribute. + if (graph == NULL) + { + status = VSI_FAILURE; + return status; + } + vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options; VSI_UNREFERENCED(size); + if (vsi_nn_getenv(key) == NULL) + { + switch (vsi_nn_GetVariable(key) ) + { + case VIV_VX_ENABLE_SHADER: + options->enable_shader = atoi(value); + break; + case VSI_NN_ENABLE_OPCHECK: + options->enable_opcheck = atoi(value); + break; + case VSI_NN_ENABLE_I8TOU8: + options->enable_i8_to_u8 = atoi(value); + break; + case VSI_VX_ENABLE_STREAM_PROCESSOR: + options->enable_stream_processor = atoi(value); + break; + case VSI_VX_ENABLE_BATCH_OPT: + options->enable_batch_opt = atoi(value); + break; + case VSI_NN_FORCE_RGB888_OUT_NHWC: + options->enable_rgb88_planar_nhwc = atoi(value); + break; + case VSI_NN_ENABLE_CONCAT_OPTIMIZE: + options->enable_concat_optimize = atoi(value); + break; + case VSI_NN_ENABLE_DATACONVERT_OPTIMIZE: + options->enable_dataconvert_optimize = atoi(value); + break; + case VSI_NN_ENABLE_SLICE_OPTIMIZE: + options->enable_slice_optimize = atoi(value); + break; + case VSI_SAVE_FILE_TYPE: + options->enable_save_file_type = atoi(value); + break; + case VSI_USE_IMAGE_PROCESS: + options->enable_use_image_process = atoi(value); + break; + case VSI_USE_FROM_HANDLE: + options->enable_use_from_handle = atoi(value); + break; + case VIV_VX_ENABLE_GRAPH_TRANSFORM: #ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT + if (graph && graph->g) { + status = vxSetGraphAttribute( + graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, value, size); + } +#else + status = VSI_FAILURE; + VSILOGE("VX_GRAPH_TRANSFORM_OPTION_SUPPORT is not defined, please check driver version."); +#endif + break; + default: +#ifdef VX_GRAPH_ENV_SUPPORT + status = vxSetGraphEnv(graph->g, key, value); +#else + status = VSI_FAILURE; + VSILOGE("VX_GRAPH_ENV_SUPPORT is not defined, please check driver version."); +#endif + break; + } + } + return status; +} - if(graph && graph->g) +int32_t vsi_nn_GetVariable(const char* variableKey) { + keyValuePair dict[] = { + {"VSI_NN_ENABLE_I8TOU8", VSI_NN_ENABLE_I8TOU8}, + {"VSI_NN_ENABLE_OPCHECK", VSI_NN_ENABLE_OPCHECK}, + {"VSI_SAVE_FILE_TYPE", VSI_SAVE_FILE_TYPE}, + {"VSI_USE_IMAGE_PROCESS", VSI_USE_IMAGE_PROCESS}, + {"VSI_NN_ENABLE_CONCAT_OPTIMIZE", VSI_NN_ENABLE_CONCAT_OPTIMIZE}, + {"VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", VSI_NN_ENABLE_DATACONVERT_OPTIMIZE}, + {"VSI_VX_ENABLE_STREAM_PROCESSOR", VSI_VX_ENABLE_STREAM_PROCESSOR}, + {"VSI_NN_FORCE_RGB888_OUT_NHWC", VSI_NN_FORCE_RGB888_OUT_NHWC}, + {"VSI_NN_ENABLE_SLICE_OPTIMIZE", VSI_NN_ENABLE_SLICE_OPTIMIZE}, + {"VSI_VX_ENABLE_BATCH_OPT", VSI_VX_ENABLE_BATCH_OPT}, + {"VIV_VX_ENABLE_SHADER", VIV_VX_ENABLE_SHADER}, + {"VSI_USE_FROM_HANDLE", VSI_USE_FROM_HANDLE}, + 
{"VIV_VX_ENABLE_GRAPH_TRANSFORM", VIV_VX_ENABLE_GRAPH_TRANSFORM}, + {NULL, -1} + }; + for (int32_t i = 0; dict[i].key != NULL; i++) { + if (strcmp(dict[i].key, variableKey) == 0) { + return dict[i].value; + } + } + return -1; +} + +OVXLIB_API char* vsi_nn_GenerateGraphJson + ( + vsi_nn_graph_t* graph + ) +{ + char* json = NULL; + VSI_UNREFERENCED(graph); +#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT + if (graph && graph->g) { - status = vxSetGraphAttribute(graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, ctrl_str, size); + json = vxGenerateGraphJson(graph->g); } #endif + return json; +} + +OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson + ( + char* json + ) +{ + vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(json); +#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT + if (json) { + status = vxReleaseGraphJson(json); + } +#endif + return status; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index aafc8903..c017ea50 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -26,6 +26,7 @@ #include "vsi_nn_graph_optimization.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_graph.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_error.h" @@ -37,14 +38,50 @@ static vsi_bool _is_asymm_int8_norm_tensor { vsi_bool ret = FALSE; - ret = ( tensor != NULL - && tensor->attr.vtl == FALSE && tensor->attr.is_const == FALSE - && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + ret = ( tensor != NULL && + tensor->attr.vtl == FALSE && + tensor->attr.is_const == FALSE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ); return ret; }/* _is_asymm_int8_norm_tensor() */ +static vsi_bool _is_symm_int8_norm_tensor +( + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + + ret = (tensor != NULL && + tensor->attr.vtl == FALSE && + tensor->attr.is_const == FALSE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC + ); + + return ret; +}/* _is_symm_int8_norm_tensor() */ + +static vsi_bool _is_int8_norm_tensor +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + vsi_bool support_symi8 = + ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2; + + + ret = _is_asymm_int8_norm_tensor(tensor); + ret = ret || (support_symi8 && _is_symm_int8_norm_tensor(tensor)); + + return ret; +}/* _is_int8_norm_tensor() */ + static vsi_bool _is_asymm_int8_const_tensor ( vsi_nn_tensor_t * tensor @@ -52,14 +89,47 @@ static vsi_bool _is_asymm_int8_const_tensor { vsi_bool ret = FALSE; - ret = ( tensor != NULL - && tensor->attr.is_const == TRUE - && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + ret = ( tensor != NULL && + tensor->attr.is_const == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ); return ret; }/* _is_asymm_int8_const_tensor() */ +static vsi_bool _is_symm_int8_const_tensor +( + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + + ret = (tensor != NULL && + tensor->attr.is_const == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC + ); + + return ret; +}/* 
_is_symm_int8_const_tensor() */ + +static vsi_bool _is_int8_const_tensor +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + vsi_bool support_symi8 = + ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2; + + ret = _is_asymm_int8_const_tensor(tensor); + ret = ret || (support_symi8 && _is_symm_int8_const_tensor(tensor)); + + return ret; +}/* _is_int8_const_tensor() */ + static vsi_bool _is_asymm_int8_virtual_tensor ( vsi_nn_tensor_t * tensor @@ -67,14 +137,47 @@ static vsi_bool _is_asymm_int8_virtual_tensor { vsi_bool ret = FALSE; - ret = ( tensor != NULL - && tensor->attr.vtl == TRUE - && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + ret = ( tensor != NULL && + tensor->attr.vtl == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ); return ret; }/* _is_asymm_int8_virtual_tensor() */ +static vsi_bool _is_symm_int8_virtual_tensor +( + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + + ret = (tensor != NULL && + tensor->attr.vtl == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC + ); + + return ret; +}/* _is_symm_int8_virtual_tensor() */ + +static vsi_bool _is_int8_virtual_tensor +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + vsi_bool support_symi8 = + ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2; + + ret = _is_asymm_int8_virtual_tensor(tensor); + ret = ret || (support_symi8 && _is_symm_int8_virtual_tensor(tensor)); + + return ret; +}/* _is_int8_virtual_tensor() */ + static vsi_status _add_forward_node ( vsi_nn_graph_t* graph, @@ -199,7 +302,7 @@ static void _get_graph_input_asymm_int8_norm_tensor vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - if (_is_asymm_int8_norm_tensor(tensor)) + if (_is_int8_norm_tensor(graph, tensor)) { if(tensor_ids != NULL) { @@ -251,7 +354,7 @@ static void _get_graph_output_asymm_int8_norm_tensor vsi_nn_tensor_id_t id = node->output.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - if (_is_asymm_int8_norm_tensor(tensor)) + if (_is_int8_norm_tensor(graph, tensor)) { if(tensor_ids != NULL) { @@ -360,6 +463,7 @@ static vsi_status _add_graph_dataconvert_for_int8 { memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr.dtype.zero_point += 128; attr.vtl = TRUE; output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); @@ -383,6 +487,7 @@ static vsi_status _add_graph_dataconvert_for_int8 { memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr.dtype.zero_point += 128; attr.vtl = TRUE; input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); @@ -788,6 +893,7 @@ static void _convert_const_I8toU8 } attr->dtype.vx_type = VSI_NN_TYPE_UINT8; + attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr->dtype.zero_point += 128; if ( tensor->t ) vxReleaseTensor(&tensor->t); @@ -818,7 +924,7 @@ static vsi_status _convert_graph_const_tensor vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - if (_is_asymm_int8_const_tensor(tensor)) + if (_is_int8_const_tensor(graph, tensor)) { 
_convert_const_I8toU8(graph, id); } @@ -835,11 +941,9 @@ static vsi_status _convert_virtual_tensor_attr vsi_nn_tensor_t * tensor ) { - if (_is_asymm_int8_virtual_tensor(tensor)) - { - tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8; - tensor->attr.dtype.zero_point += 128; - } + tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + tensor->attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + tensor->attr.dtype.zero_point += 128; return VSI_SUCCESS; }/* _convert_virtual_tensor_attr() */ @@ -849,7 +953,7 @@ static vsi_status _convert_graph_virtual_tensor vsi_nn_graph_t* graph ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; uint32_t node_num = graph->node_num; vsi_nn_node_t* node = NULL; uint32_t i = 0; @@ -865,7 +969,10 @@ static vsi_status _convert_graph_virtual_tensor vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + if (_is_int8_virtual_tensor(graph, tensor)) + { + status = _convert_virtual_tensor_attr(tensor); + } } for(j = 0; j < node->output.num; j++) @@ -873,7 +980,10 @@ static vsi_status _convert_graph_virtual_tensor vsi_nn_tensor_id_t id = node->output.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + if (_is_int8_virtual_tensor(graph, tensor)) + { + status = _convert_virtual_tensor_attr(tensor); + } } } @@ -925,7 +1035,7 @@ vsi_status vsi_nn_OptimizeGraph status = VSI_SUCCESS; - if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8) + if (!nbg_flag &&((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8) { status = _graph_optimization_convert_int8_to_uint8(graph, dirty); CHECK_STATUS_FAIL_GOTO(status, final); diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index c240d3be..b8f43111 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -452,7 +452,8 @@ void vsi_nn_internal_init_tensor_attr if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE && ( dtype->vx_type != VSI_NN_TYPE_FLOAT16 && dtype->vx_type != VSI_NN_TYPE_FLOAT32 && - dtype->vx_type != VSI_NN_TYPE_BFLOAT16 ) ) + dtype->vx_type != VSI_NN_TYPE_BFLOAT16 && + dtype->vx_type != VSI_NN_TYPE_INT32) ) { attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16; diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 4a9caeaf..eb51f99e 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -208,6 +208,10 @@ static _node_template s_template[] = /* RESIZE_3D */ NULL, /* REDUCEL2 */ NULL, /* CROP_AND_RESIZE */ NULL, + /* BITCAST */ NULL, + /* GROUPED_CONV3D */ NULL, + /* CO2IM */ NULL, + /* L1_LAYER_NORM */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index b706240c..950f9570 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -26,6 +26,7 @@ #include "vsi_nn_client_op.h" #include "vsi_nn_node.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -281,7 +282,7 @@ vsi_bool vsi_nn_OpCheck if ( NULL != proc ) { ret = TRUE; - if ( proc->check && node->graph->ctx->options.enable_opcheck) + if ( proc->check && 
((vsi_nn_graph_prv_t*)(node->graph))->options->enable_opcheck) { ret = proc->check( node, inputs, outputs ); } diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 4d102225..179755f9 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -144,6 +144,17 @@ static void print_tensor tensor->attr.dtype.scale_dim); ext_attr[count] = 0; break; +#endif +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: + count = snprintf(&ext_attr[0], + _EXT_ATTR_BUF_SZ, + "SYM GPTQ axis=%d, count=%d, group_size=%d", + tensor->attr.dtype.group_channel_dim, + tensor->attr.dtype.group_count, + tensor->attr.dtype.group_size); + ext_attr[count] = 0; + break; #endif default: vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); @@ -430,6 +441,25 @@ static vsi_bool _init_tensor VSILOGE( "can't support qnt_type " "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC."); +#endif + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP; + // This is a hack that driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); + memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float)); + params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim; + params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size; + params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count; + params.quant_data.affinePerGroup.scales = scales; + params.quant_data.affinePerGroup.zero_points = NULL; + params.quant_data.affinePerGroup.zero_point_group_count = 0; + break; +#else + VSILOGE( + "can't support qnt_type " + "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC."); #endif default: break; diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h index 00b55fd2..4f9fd0bf 100644 --- a/src/tim/vx/internal/src/vsi_nn_types_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -58,6 +58,7 @@ typedef struct _vsi_nn_graph_prv // Add graph internal attribute here... vsi_nn_swap_handle_cache_t swap_handle_cache; + vsi_nn_runtime_option_t* options; } vsi_nn_graph_prv_t; /** Internal Node structure, internal use only. */
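
The hunks above replace the old vsi_nn_SetGraphTransformOption entry point with a generic per-graph key/value interface (vsi_nn_SetRunTimeVariable / vsi_nn_GetRunTimeVariable), backed by the vsi_nn_runtime_option_t block now hung off vsi_nn_graph_prv_t, plus graph-JSON helpers guarded by VX_GENERATE_GRAPH_JSON_API_SUPPORT. A minimal usage sketch follows; the demo_runtime_options wrapper, the vsi_nn_pub.h umbrella include, and the assumption that the caller frees the getter's malloc'd string are mine, not part of the patch.

#include <stdio.h>
#include <stdlib.h>

#include "vsi_nn_pub.h" /* assumed umbrella header for the ovxlib public API */

void demo_runtime_options(vsi_nn_graph_t* graph)
{
    char* value = NULL;
    char* json = NULL;

    /* Disable the concat tensor-view optimization for this graph only.
     * Per the patch, the setter overrides the option unless the matching
     * environment variable is already set. */
    if (vsi_nn_SetRunTimeVariable(graph, "VSI_NN_ENABLE_CONCAT_OPTIMIZE", "0") != VSI_SUCCESS)
    {
        printf("failed to set runtime variable\n");
        return;
    }

    /* Read the option back; the getter formats it as a decimal string. */
    value = vsi_nn_GetRunTimeVariable(graph, "VSI_NN_ENABLE_CONCAT_OPTIMIZE");
    if (value != NULL)
    {
        printf("enable_concat_optimize = %s\n", value);
        free(value); /* assumption: the caller owns the malloc'd buffer */
    }

    /* Optionally dump the graph as JSON when the driver exposes the API. */
    json = vsi_nn_GenerateGraphJson(graph);
    if (json != NULL)
    {
        printf("%s\n", json);
        vsi_nn_ReleaseGraphJson(json);
    }
}
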