From 94f5fd3969919213740e2b7e5f8e193333e5932a Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Sun, 20 Aug 2023 04:07:34 +0000 Subject: [PATCH 01/23] Squash commits together replace constant with appropriate block dimension WIP: add strides as a kernel parameter for non-packed tensors WIP: created non-packed variants for remaining 3D conv kernels fix: treat input and output as 6D NDHWGC tensor use std::call_once for initializing random seed revamp naive kernels to use strides in the non-packed tensors case (controlled by a bool flag) WIP: fixed kernel compilation issues but unable to load kernel code object WIP: fixed issue with hip rtc split channel strides into group, channels_per_group in solver fix indexing to left-to-right order fix bug with too much padding between kernel args num channels should be a multiple of num groups re-enable naive ref kernels with strides array 2D forward tests are all working now WIP: debugging bwd tests WIP: tests up till 3D bwd conv passing fix bug in bwd ndhwc kernel fix formatting disable prints fix readability-inconsistent-declaration-parameter-name fix clang-format fix hip tidy issue reverting the change to static init of random seed address comments and tidy issues. Remove extra print removed blank line change remove unneeded include env var for choosing packed vs non-packed reference kernel fix warnings from hip-tidy address comment about array initialization clear a tiny hip tidy issue --- src/include/miopen/hipoc_kernel.hpp | 15 +- .../miopen/solver/conv_direct_naive_conv.hpp | 159 +- .../gpu_reference_kernel/naive_conv.cpp | 1927 ++++++++++++----- src/solver/conv_direct_naive_conv.cpp | 58 +- src/solver/conv_direct_naive_conv_bwd.cpp | 47 +- src/solver/conv_direct_naive_conv_fwd.cpp | 32 + src/solver/conv_direct_naive_conv_wrw.cpp | 36 + test/gpu_reference_kernel.cpp | 3 +- 8 files changed, 1696 insertions(+), 581 deletions(-) diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index ba9992bab3..b18955b5c2 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -47,7 +47,7 @@ inline HipEventPtr make_hip_event() #if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 -#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 +#if 0 // Keep around other storage techinques -- @pfultz2 27.03.2017 template struct KernelArgsPair { @@ -65,9 +65,16 @@ struct KernelArgsPair template struct KernelArgsPair { - KernelArgsPair(T x, U y) : first(x), second(y) {} - T first; - U second; + static const int alignment = alignof(U); + static const int padding = (alignment - (sizeof(T) % alignment)) % alignment; + static_assert(padding >= 0, "padding cannot be negative"); + static const int second_index = sizeof(T) + padding; + KernelArgsPair(T x, U y) + { + new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) + new(buffer + second_index) U(y); + } + alignas(U) char buffer[second_index + sizeof(U)] = {}; }; #endif diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index f05bbdf712..927a64feff 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -25,9 +25,14 @@ *******************************************************************************/ #pragma once -#include #include +#include +#include +#include +#include +#include + namespace miopen { namespace solver { @@ -53,5 +58,157 @@ bool IsOutputBfp16(const 
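// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The new KernelArgsPair above packs argument U directly after T, inserting
// only as many pad bytes as U's alignment requires. Below is a self-contained
// restatement of that padding rule with hypothetical 1-byte/4-byte argument
// types; the trailing "% alignment" is what avoids inserting a full alignment
// quantum when sizeof(T) is already aligned ("fix bug with too much padding
// between kernel args" in the commit message above).
#include <cstdint>
template <typename T, typename U>
struct PairLayout
{
    static constexpr int alignment    = alignof(U);
    static constexpr int padding      = (alignment - (sizeof(T) % alignment)) % alignment;
    static constexpr int second_index = sizeof(T) + padding; // byte offset of U
};
static_assert(PairLayout<std::uint8_t, std::uint32_t>::padding == 3, "1B arg needs 3 pad bytes");
static_assert(PairLayout<std::uint32_t, std::uint32_t>::padding == 0, "aligned arg needs none");
// ------------------------------------------------------------------------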
ProblemDescription&); bool IsOutputInt8(const ProblemDescription&); bool IsOutputInt32(const ProblemDescription&); +int GetGroupStrideIndex(const ProblemDescription& problem); + +void printTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc); + +// TODO(Amber): Uncomment when hip RTC accepts std::array +// using StrideIndexType = int; +// using Strides3D = std::array; +// using Strides4D = std::array; +// using Strides5D = std::array; +// using Strides6D = std::array; +#if 1 +template +class MyArray +{ + T data_[N] = {}; + +public: + constexpr static const unsigned SIZE = N; + __host__ __device__ constexpr unsigned size() const { return N; } + + __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } + + __host__ T& operator[](unsigned i) { return data_[i]; } + + __host__ __device__ MyArray() = default; + __host__ __device__ MyArray(const MyArray&) = default; + __host__ __device__ MyArray(MyArray&&) noexcept = default; + __host__ __device__ MyArray& operator=(const MyArray&) = default; + __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; + __host__ __device__ ~MyArray() = default; +}; + +using StrideIndexType = int; +using Strides5D = MyArray; +using Strides6D = MyArray; + +#else + +extern "C" typedef int StrideIndexType; + +extern "C" typedef struct +{ + StrideIndexType v[5]; +} Strides5D; + +extern "C" typedef struct +{ + StrideIndexType v[6]; +} Strides6D; + +#endif + +namespace internal { +template +struct ChooseStride +{ +}; + +template <> +struct ChooseStride<5u> +{ + using type = Strides5D; +}; + +template <> +struct ChooseStride<6u> +{ + using type = Strides6D; +}; + +} // end namespace internal + +template +auto MakeStrideArray(V vec) +{ + typename internal::ChooseStride::type ret; + assert(vec.size() == N); + + // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. + // We sort the input stride vector so that smallest stride is at index 0. This + // (little-endian) order is what naive convolution kernel expects for strides + std::sort(vec.begin(), vec.end()); + + for(unsigned i = 0; i < N; ++i) + { + ret[i] = static_cast(vec[i]); + } + return ret; +} + +/** + * split the strides for C dimension in a tensor descriptor into (G, C_per_group). + * Normally, (in packed case) num channels is a multiplying factor in the stride of + * whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a + * factor. We output NGCHW for NCHW (and NHWGC for NHWC) + * where the stride[G] = stride[N] / num_groups + */ +template +V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx) +{ + assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size()); + // (G_stride_idx - 1) is the stride index of whatever lies to the left and + // contains C or K as a multiplying factor. 
We divide this value by num_groups + // to get G_stride_val + assert(orig_strides[G_stride_idx - 1] % num_groups == 0); + + V ret{orig_strides}; + auto G_stride_val = orig_strides[G_stride_idx - 1] / num_groups; + + ret.insert(ret.begin() + G_stride_idx, G_stride_val); + + return ret; +} + +/** + * Weight tensor has original dims: [K, C_per_group, Y, X] (2D case) + * We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X] + * Stride for G is computed as stride[C_per_group] * K_per_group and inserted at + * left most position + */ +template +V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) +{ + V ret{wei_strides}; + ret.insert(ret.begin(), wei_strides[0] * k_per_group); + return ret; +} + +template +void printStrideArray(const char* name, const StrideArray& sarr) +{ + printf("%s = [", name); + for(unsigned i = 0; i < StrideArray::SIZE; ++i) + { + printf("%d,", sarr[i]); + } + printf("]\n"); +} + +template +void printStrideArrays(const StrideArray& in_strides, + const StrideArray& wei_strides, + const StrideArray& out_strides) +{ + + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); +} + } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 24d7cd489e..6ffb0789c4 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -114,10 +114,120 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) return static_cast(val & 0xff); } -template +// TODO(Amber): this file is compiled via HIP RTC and includes don't work easily +// so currently duplicating content from miopen/common.hpp +// #include "miopen/common.hpp" +// #include +// TODO(Amber): HIP RTC redefines stuff from std library (I don't know why) +// #include +#if 1 +template +class MyArray +{ + T data_[N] = {}; + +public: + constexpr static const unsigned SIZE = N; + + __host__ __device__ constexpr unsigned size() const { return N; } + + __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } + + __host__ __device__ T& operator[](unsigned i) { return data_[i]; } + + __host__ __device__ MyArray() = default; + __host__ __device__ MyArray(const MyArray&) = default; + __host__ __device__ MyArray(MyArray&&) noexcept = default; + __host__ __device__ MyArray& operator=(const MyArray&) = default; + __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; + __host__ __device__ ~MyArray() = default; +}; + +using StrideIndexType = int; +using Strides5D = MyArray; +using Strides6D = MyArray; +#else + +extern "C" typedef int StrideIndexType; + +extern "C" typedef struct +{ + StrideIndexType v[5]; +} Strides5D; + +extern "C" typedef struct +{ + StrideIndexType v[6]; +} Strides6D; + +extern "C" __global__ void testKernel(void* ptr_a, + void* ptr_b, + void* ptr_c, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides) +{ + + if(blockIdx.x == 0 && threadIdx.x == 0) + { + printf("sizeof(Strides5D) = %lu\n", sizeof(Strides5D)); + printf("%p, %p, %p, %p\n", &ptr_a, &ptr_b, &ptr_c, &in_strides); + printf("in_strides = ["); + for(int i = 0; i < 5; ++i) + { + // printf("%d,", in_strides.v[i]); + printf("%d,", in_strides[i]); + } + printf("]\n"); + printf("wei_strides = ["); + for(int i = 0; i < 5; ++i) + { + // printf("%d,", wei_strides.v[i]); + printf("%d,", wei_strides[i]); + } + printf("]\n"); + 
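// ---- [editorial aside: worked example, not part of the patch] ----
// How the host-side stride helpers in conv_direct_naive_conv.hpp cooperate,
// on hypothetical sizes. A packed NCHW input with N=2, C=8, H=W=4, group=2
// has descriptor (big-endian) strides [C*H*W, H*W, W, 1] = [128, 16, 4, 1]:
#include <cassert>
#include <vector>
inline void strideHelperExample()
{
    std::vector<int> nchw{128, 16, 4, 1};
    // N's stride contains C as a factor, so G is split off at index 1 with
    // stride 128 / num_groups = 64, yielding NGCHW:
    auto ngchw = miopen::solver::SplitStrideCtoGC(/*num_groups=*/2, nchw, /*G_stride_idx=*/1);
    assert((ngchw == std::vector<int>{128, 64, 16, 4, 1}));
    // MakeStrideArray sorts ascending into the little-endian order the
    // kernels expect: index 0 is the fastest-moving dimension.
    auto arr = miopen::solver::MakeStrideArray<5>(ngchw); // [1, 4, 16, 64, 128]
    assert(arr[0] == 1 && arr[4] == 128);
    // Weights [K=4, C_per_group=4, 3, 3] with k_per_group=2: the G stride is
    // prepended as wei_strides[0] * k_per_group = 36 * 2 = 72.
    std::vector<int> wei{36, 9, 3, 1};
    auto gwei = miopen::solver::SplitWeiStrideKtoGK(/*k_per_group=*/2, wei);
    assert((gwei == std::vector<int>{72, 36, 9, 3, 1}));
}
// ------------------------------------------------------------------------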
printf("out_strides = ["); + for(int i = 0; i < 5; ++i) + { + // printf("%d,", out_strides.v[i]); + printf("%d,", out_strides[i]); + } + printf("]\n"); + } +} +#endif + +template +__device__ void printStrideArray(const char* name, const StrideArray& sarr) +{ + printf("%s = [", name); + for(int i = 0; i < StrideArray::SIZE; ++i) + { + printf("%d,", sarr[i]); + } + printf("]\n"); +} + +// TODO(Amber): remove template parameter 'bool ASSUME_PACKED' in a follow up PR +// Notes (Amber): +// * The following code used to assume that group (G) is an implicit +// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not +// true for non-packed case because group (G) dimension needs to have its stride +// explicitly specified for address math to make sense. This is also how +// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, +// and nhwc should be nhwgc. Same follows for the 3D case. +// * strides here are in the little-endian order, i.e., for NHWC, stride for N is +// at index 3 while stride for C is at index 0. This is reverse of how strides are +// stored in tensor descriptors, which are big-endian. + +// TODO(Amber): Rename nchw to ngchw +template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -135,6 +245,43 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int fx, int group) { + + // TODO(Amber): Remove this code + /* + if (blockIdx.x == 0 && threadIdx.x == 0) { + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); + + printf("modified strides\n"); + Strides5D in_strd; + Strides5D wei_strd; + Strides5D out_strd; + + in_strd[0] = 1; + in_strd[1] = wi; + in_strd[2] = hi * wi; + in_strd[3] = c_per_group * hi * wi; + in_strd[4] = group * c_per_group * hi * wi; + + wei_strd[0] = 1; + wei_strd[1] = fx; + wei_strd[2] = fy * fx; + wei_strd[3] = c_per_group * fy * fx; + wei_strd[4] = k_per_group * c_per_group * fy * fx; + + out_strd[0] = 1; + out_strd[1] = wo; + out_strd[2] = ho * wo; + out_strd[3] = k_per_group * ho * wo; + out_strd[4] = group * k_per_group * ho * wo; + + printStrideArray("in_strd", in_strd); + printStrideArray("wei_strd", wei_strd); + printStrideArray("out_strd", out_strd); + } + */ + /* * need to compute total output pixel: `group * n * k_per_group * ho * wo`. 
* to distribute this workload, let one workgroup compute `ho * wo` pixel, @@ -148,18 +295,37 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(in) * k * ho * wo + - static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += + static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * ho * wo + + static_cast(ig) * k_per_group * ho * wo + + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(ig) * out_strides[3] + + static_cast(ik) * out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iho = tid / wo; int iwo = tid % wo; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -178,25 +344,58 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t f_idx = static_cast(ic) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iho) * wo + static_cast(iwo); - p_out[o_idx] = cast_to(value); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -227,19 +426,36 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * hi * wi + - static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ic) * fy * fx; - p_out += - static_cast(in) * k * ho * wo + static_cast(ig) * 
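// ---- [editorial aside: sanity check, not part of the patch] ----
// For a packed NGCHW tensor the little-endian strides the solver passes in
// are {1, wi, hi*wi, c_per_group*hi*wi, group*c_per_group*hi*wi}, so the
// strided branch of the address math in these kernels reduces exactly to the
// packed branch. Spot-checking the base-pointer math with hypothetical sizes:
namespace naive_conv_stride_check {
constexpr long hi = 7, wi = 5, group = 2, c_per_group = 3;
constexpr long c  = group * c_per_group;
constexpr long in_strides[5] = {
    1, wi, hi * wi, c_per_group * hi * wi, group * c_per_group * hi * wi};
constexpr long in = 4, ig = 1; // example batch / group indices
static_assert(in * in_strides[4] + ig * in_strides[3] ==
                  in * c * hi * wi + ig * c_per_group * hi * wi,
              "strided math equals packed math on packed layouts");
} // namespace naive_conv_stride_check
// ------------------------------------------------------------------------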
k_per_group * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * hi * wi + + static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ic) * fy * fx; + + p_out += + static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3] + + static_cast(ic) * in_strides[2]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ic) * wei_strides[2]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ihi = tid / wi; int iwi = tid % wi; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) { @@ -264,26 +480,59 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(ik) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = static_cast(ik) * c_per_group * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(ihi) * in_strides[1] + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -315,18 +564,35 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += + static_cast(ig) * out_strides[3] + static_cast(ik) * 
out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int ic = tid / (fx * fy); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -345,28 +611,64 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * hi * wi + - static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t o_idx = static_cast(in) * k * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * hi * wi + + static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * ho * wo + + static_cast(iho) * wo + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + - static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -405,21 +707,38 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += static_cast(in) * out_strides[5] + + static_cast(ig) * out_strides[4] + + static_cast(ik) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid % wo; int iho = (tid / wo) % ho; int ido = tid / (ho * wo); - double 
value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -444,30 +763,67 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(ic) * di * hi * wi + - static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t f_idx = static_cast(ic) * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + - static_cast(iwo); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -506,21 +862,38 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi + - static_cast(ic) * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ic) * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi + + static_cast(ic) * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ic) * fz * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4] + + static_cast(ic) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ic) * wei_strides[3]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[4]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid % wi; int ihi = (tid / wi) % hi; int idi = tid / (hi * wi); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; 
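// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The "tid += blockDim.x" change in these loops (replacing the hard-coded
// 256) makes the work loops correct for any launch-time block size: with a
// constant stride, launching fewer than 256 threads would leave elements
// unprocessed, and launching more would process some elements redundantly.
// The block-stride idiom in isolation, as a minimal HIP kernel:
__global__ void block_stride_zero(float* __restrict__ dst, int tile_len)
{
    // each block owns one contiguous tile; its threads sweep the tile in
    // strides of the actual block size
    for(int i = threadIdx.x; i < tile_len; i += blockDim.x)
        dst[static_cast<size_t>(blockIdx.x) * tile_len + i] = 0.0f;
}
// ------------------------------------------------------------------------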
ik++) { @@ -554,30 +927,67 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(ik) * do_ * ho * wo + - static_cast(cur_do) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * do_ * ho * wo + + static_cast(cur_do) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = + static_cast(ik) * c_per_group * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[3] + + static_cast(cur_do) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + - static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(idi) * in_strides[2] + + static_cast(ihi) * in_strides[1] + + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -615,20 +1025,36 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; + + p_out += static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[4] + static_cast(ik) * out_strides[3]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int iz = (tid / (fx * fy)) % fz; int ic = tid / (fx * fy * fz); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -653,33 +1079,73 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * di * hi * wi + - static_cast(ic) * di * hi * wi + - 
static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t o_idx = static_cast(in) * k * do_ * ho * wo + - static_cast(ido) * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * di * hi * wi + + static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * do_ * ho * wo + + static_cast(ido) * ho * wo + + static_cast(iho) * wo + + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(ic) * fz * fy * fx + static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } /***************************** nhwc *****************************/ // design block_size 256 -template +template inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -711,17 +1177,33 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, int in = (bid / ho) % n; int ig = bid / (n * ho); - p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group + - static_cast(iho) * wo * k; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * ho * wo * k + static_cast(iho) * wo * k + + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid / k_per_group; int ik = tid % k_per_group; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -740,27 +1222,61 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fy * fx * 
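// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// In the nhwc kernels the sorted little-endian stride array maps indices to
// dimensions as [0]=C, [1]=G, [2]=W, [3]=H, [4]=N, which is why the strided
// branches read e.g. in_strides[0] for the channel step. Packed NHWGC
// strides for hypothetical sizes:
namespace nhwgc_stride_check {
constexpr int hi = 4, wi = 4, group = 2, c_per_group = 3;
constexpr int strides[5] = {1,                              // C
                            c_per_group,                    // G
                            group * c_per_group,            // W
                            group * c_per_group * wi,       // H
                            group * c_per_group * wi * hi}; // N
static_assert(strides[2] == 6 && strides[3] == 24 && strides[4] == 96,
              "packed NHWGC layout");
} // namespace nhwgc_stride_check
// ------------------------------------------------------------------------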
c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t f_idx = static_cast(ik) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iwo) * k + static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iwo) * k + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -778,6 +1294,41 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int fx, int group) { + /* + if (blockIdx.x == 0 && threadIdx.x == 0) { + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); + + printf("modified strides\n"); + Strides5D in_strd; + Strides5D wei_strd; + Strides5D out_strd; + + in_strd[0] = 1; + in_strd[1] = k_per_group; + in_strd[2] = group * k_per_group; + in_strd[3] = group * k_per_group * wo; + in_strd[4] = group * k_per_group * wo * ho; + + wei_strd[0] = 1; + wei_strd[1] = c_per_group; + wei_strd[2] = c_per_group * fx; + wei_strd[3] = c_per_group * fx * fy; + wei_strd[4] = c_per_group * fx * fy * k_per_group; + + out_strd[0] = 1; + out_strd[1] = c_per_group; + out_strd[2] = group * c_per_group; + out_strd[3] = group * c_per_group * wi; + out_strd[4] = group * c_per_group * wi * hi; + + printStrideArray("in_strd", in_strd); + printStrideArray("wei_strd", wei_strd); + printStrideArray("out_strd", out_strd); + } + */ + /* * need to compute total input pixel: `group * n * hi * wi * c_per_group`. 
* to distribute this workload, let one workgroup compute `wi * @@ -792,17 +1343,33 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int in = (bid / hi) % n; int ig = bid / (n * hi); - p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ihi) * in_strides[3] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; + + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid / c_per_group; int ic = tid % c_per_group; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -827,27 +1394,61 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + static_cast(ik); - size_t f_idx = static_cast(ik) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = static_cast(ik) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(iwi) * c + static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(iwi) * c + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(iwi) * in_strides[2] + static_cast(ic) * in_strides[0]; + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -879,18 +1480,34 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group + - static_cast(ik) * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * 
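// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The backward kernels invert the forward index map
// h_in = h_out * sy - py + iy * dy: a filter tap iy contributes to input row
// ihi only if (ihi + py - iy * dy) is non-negative, divisible by the stride,
// and lands inside the output extent. A hypothetical host-side restatement
// of the validity predicate these kernels evaluate per tap:
inline bool output_tap_contributes(int ihi, int iy, int py, int sy, int dy, int ho)
{
    const int cur_h = ihi + py - iy * dy; // position before stride division
    if(cur_h < 0 || cur_h % sy != 0)      // must sit on an output grid point
        return false;
    return (cur_h / sy) < ho;             // and inside the output
}
// ------------------------------------------------------------------------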
k_per_group * fy * fx * c_per_group + + static_cast(ik) * fy * fx * c_per_group; + + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = tid / (c_per_group * fx); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -909,29 +1526,65 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t o_idx = static_cast(in) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + static_cast(ic); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -970,18 +1623,37 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, int in = (bid / do_) % n; int ig = bid / (n * do_); - p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ido) * ho * wo * k + - static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; + + p_out += static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + // dim order NDHWGC + // replace C and K with G * C_per_G and G * K_per_G + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[1]; + + // Assumes that group G is the highest 
dimension in the layout + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(ig) * out_strides[1]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ik = tid % k_per_group; int iwo = (tid / k_per_group) % wo; int iho = tid / (k_per_group * wo); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1005,30 +1677,69 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, { if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + - static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template + +template inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1052,6 +1763,44 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int fx, int group) { + /* + if (blockIdx.x == 0 && threadIdx.x == 0) { + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); + + printf("modified strides\n"); + Strides6D in_strd; + Strides6D wei_strd; + Strides6D out_strd; + + in_strd[0] = 1; + in_strd[1] = k_per_group; + in_strd[2] = group * k_per_group; + in_strd[3] = group * k_per_group * wo; + in_strd[4] = group * k_per_group * wo * ho; + in_strd[5] = group * k_per_group * wo * ho * do_; + + wei_strd[0] = 1; + wei_strd[1] = c_per_group; + wei_strd[2] = c_per_group * fx; + wei_strd[3] = c_per_group * fx * fy; + wei_strd[4] = c_per_group * fx * fy * fz; + wei_strd[5] = k_per_group * c_per_group * fx * fy * fz; + + out_strd[0] = 
1; + out_strd[1] = c_per_group; + out_strd[2] = group * c_per_group; + out_strd[3] = group * c_per_group * wi; + out_strd[4] = group * c_per_group * wi * hi; + out_strd[5] = group * c_per_group * wi * hi * di; + + printStrideArray("in_strd", in_strd); + printStrideArray("wei_strd", wei_strd); + printStrideArray("out_strd", out_strd); + } + */ + /* * need to compute total input pixel: `group * n * di * hi * wi * * c_per_group`. @@ -1067,18 +1816,35 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int in = (bid / di) % n; int ig = bid / (n * di); - p_in += static_cast(in) * di * hi * wi * c + static_cast(idi) * hi * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + + static_cast(idi) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(idi) * in_strides[4] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int iwi = (tid / c_per_group) % wi; int ihi = (tid / (c_per_group * wi)); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1111,32 +1877,69 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, { if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(cur_do) * ho * wo * k + - static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + - static_cast(ik); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_do) * ho * wo * k + + static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_do) * out_strides[4] + + static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + - static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(ihi) * in_strides[3] + + static_cast(iwi) * in_strides[2] 
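// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The 3D (ndhwc) kernels extend the same little-endian convention to six
// dimensions: [0]=C, [1]=G, [2]=W, [3]=H, [4]=D, [5]=N, matching the NDHWGC
// ordering mentioned in the comments above. Packed example with hypothetical
// sizes:
namespace ndhwgc_stride_check {
constexpr int di = 2, hi = 4, wi = 4, group = 2, c_per_group = 3;
constexpr int gc = group * c_per_group;
constexpr int strides[6] = {1, c_per_group, gc, gc * wi, gc * wi * hi, gc * wi * hi * di};
static_assert(strides[5] == 192, "packed NDHWGC layout: N stride = G*C*W*H*D");
} // namespace ndhwgc_stride_check
// ------------------------------------------------------------------------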
+ + static_cast(ic) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1174,19 +1977,35 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + - static_cast(ik) * fz * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + + static_cast(ik) * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = (tid / (c_per_group * fx)) % fy; int iz = (tid / (c_per_group * fx * fy)); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -1211,374 +2030,336 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * di * hi * wi * c + - static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * do_ * ho * wo * k + - static_cast(ido) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * di * hi * wi * c + + static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t o_idx = static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + static_cast(ic); - p_wei[f_idx] = cast_to(value); - } -} - -#define DEFINE_2D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int 
c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } -#define DEFINE_2D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); -#define DEFINE_2D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; -#define DEFINE_3D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ + p_wei[f_idx] = cast_to(value); + } } +} -#define DEFINE_3D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int 
c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define DEFINE_2D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ } -#define DEFINE_3D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define DEFINE_3D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int 
k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ } -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, float) - -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, int8_t, int32_t, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, float) - -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, float, 
double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, float) + +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, ushort, double, ushort) + +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, ushort, double, ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, float) + +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, ushort, double, ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort) diff --git a/src/solver/conv_direct_naive_conv.cpp 
b/src/solver/conv_direct_naive_conv.cpp index 64c95257e6..425b65c6de 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "miopen/env.hpp" #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include namespace miopen { @@ -108,7 +110,15 @@ bool IsOutputInt32(const ProblemDescription& problem) std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; - kernel_name << "naive_conv_"; + if(miopen::IsEnvvarValueEnabled("MIOPEN_USE_PACKED_CONV_REF_KERNEL")) + { + kernel_name << "naive_conv_packed_"; + } + else + { + kernel_name << "naive_conv_nonpacked_"; + } + if(problem.direction.IsForward()) kernel_name << "fwd_"; else if(problem.direction.IsBackwardData()) @@ -173,6 +183,8 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) else MIOPEN_THROW("unsupported data type:"); + // TODO(Amber): Left for debugging. Will remove in the future. + // std::cout << "############ kernel_name = " << kernel_name.str() << std::endl; return kernel_name.str(); } @@ -244,5 +256,49 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx, return true; } +// figure out the index of C (channel) stride so we can expand it into +// (G, C_per_group). Return value G_stride_idx is the position of G stride +// in the stride vector, such that the (G_stride_idx - 1) is the index that +// contains C's stride as a multiplying factor +int GetGroupStrideIndex(const ProblemDescription& problem) +{ + int G_stride_idx = -1; + if(problem.IsLayoutDefault()) + { + G_stride_idx = 1; + } + else + { + assert(problem.IsLayoutNHWC()); + assert(problem.Is2d() || problem.Is3d()); + // + // G_stride_idx = problem.Is2d() ? 3 : 4; + // For NHWC, MIOpen stores strides in NCHW order, so we are interested in 1 + W's + // stride as that will be the value of G_stride_idx; + G_stride_idx = problem.Is2d() ? 
4 : 5; + } + assert(G_stride_idx != -1); + return G_stride_idx; +} + +void printTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc) +{ + + auto printOneStrideVec = [](const char* name, const auto& vec) { + printf("%s = [", name); + for(const size_t v : vec) + { + printf("%zu,", v); + } + printf("]\n"); + }; + + printOneStrideVec("inDesc = ", inDesc.GetStrides()); + printOneStrideVec("wDesc = ", wDesc.GetStrides()); + printOneStrideVec("outDesc = ", outDesc.GetStrides()); +} + } // namespace solver } // namespace miopen diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index c5d793860c..d0687b64ba 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -142,17 +142,35 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, }(); kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - if(is_f8) + auto in_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<5>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + // TODO(Amber): Someone made the silly decision of swapping in and + // out pointers in ConvTensors for backward pass, so now I have to + // pass out in place of in, out_strides in place of in_strides and + // vice-versa + if(is_f8) + { handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, hi, wi, n, @@ -172,10 +190,15 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, problem.GetConv().attribute.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic, problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } else + { handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, hi, wi, n, @@ -192,6 +215,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, fy, fx, group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); @@ -202,7 +226,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -210,9 +236,27 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<6>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + + // 
printTensorStrides(tensors.inDesc, tensors.wDesc, tensors.outDesc); + // printStrideArrays(in_strides, wei_strides, out_strides); + + // TODO(Amber): Someone made the silly decision of swapping in and + // out pointers in ConvTensors for backward pass, so now I have to + // pass out in place of in, out_strides in place of in_strides and + // vice-versa handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, di, hi, wi, @@ -245,6 +289,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index fc8d8e77fd..6f06a2251b 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -28,6 +28,7 @@ #include #include #include +#include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD) @@ -142,18 +143,33 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<5>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + if(is_f8) { handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -179,6 +195,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -206,7 +225,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -214,9 +235,19 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<6>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -249,6 +280,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 2c85949ad4..60bbe90411 100644 --- 
a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -129,17 +129,33 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, return false; }(); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<5>( + SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + if(is_f8) + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -159,10 +175,15 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, problem.GetConv().attribute.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic, problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } else + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -179,6 +200,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, fy, fx, group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); @@ -189,7 +211,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -197,9 +221,20 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<6>( + SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -232,6 +267,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index c3b26a80a9..aa3dda788d 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -95,7 +95,8 @@ struct gpu_reference_kernel_base static std::vector get_image_size() { return {9, 14}; } - static std::vector get_channel_size() { return {3, 8}; } + // Warning: Channel size must be multiple of group size + static std::vector get_channel_size() { return {4, 8}; } static std::vector get_filter_depth() { return {1, 3}; } From 5ab82f31807caf2c3710677732be0f34fb16b145 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Sat, 23 Sep 2023 11:51:34 +0000 Subject: [PATCH 02/23] fix formatting. 
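Each DEFINE_2D_NAIVE_CONV_KERNEL / DEFINE_3D_NAIVE_CONV_KERNEL instantiation above emits two extern "C" entry points: a naive_conv_packed_* variant that keeps the direct-index arithmetic, and a naive_conv_nonpacked_* variant that walks the Strides5D / Strides6D arrays. ConvDirectNaiveConvKernelName() then selects between the two at run time via the MIOPEN_USE_PACKED_CONV_REF_KERNEL environment variable, while the fp8 reference kernels (renamed below) keep only the packed variant for now. A minimal sketch of how one final name is assembled, for a forward fp32 NCHW problem; RefKernelName is a hypothetical helper, and the real function also covers bwd/wrw, the other layouts, and the remaining data types:

#include <sstream>
#include <string>

// Sketch only: mirrors the naming scheme of the instantiations above,
// e.g. naive_conv_packed_fwd_nchw_float_double_float.
std::string RefKernelName(bool use_packed_variant)
{
    std::ostringstream name;
    name << (use_packed_variant ? "naive_conv_packed_" : "naive_conv_nonpacked_");
    name << "fwd_"                 // direction: fwd_, bwd_ or wrw_
         << "nchw_"                // tensor layout
         << "float_double_float";  // src, accumulator and dst data types
    return name.str();
}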
disable strides for fp8 kernel for now --- src/kernels/gpu_reference_kernel/fp8_kern_types.h | 6 +++--- src/solver/conv_direct_naive_conv_bwd.cpp | 7 ++----- src/solver/conv_direct_naive_conv_fwd.cpp | 3 --- src/solver/conv_direct_naive_conv_wrw.cpp | 9 +++------ 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h index 3bac0a31f7..b14302e0c2 100644 --- a/src/kernels/gpu_reference_kernel/fp8_kern_types.h +++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h @@ -58,6 +58,6 @@ #define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE)) -#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX) -#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX) -#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX) +#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX) +#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX) +#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX) diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index d0687b64ba..3fe005e994 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -152,7 +152,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( + auto in_strides = MakeStrideArray<5>( SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 auto wei_strides = MakeStrideArray<5>( @@ -163,14 +163,11 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, // out pointers in ConvTensors for backward pass, so now I have to // pass out in place of in, out_strides in place of in_strides and // vice-versa - if(is_f8) + if(is_f8) { handle.Run(kern)(tensors.out, tensors.w, tensors.in, - out_strides, - wei_strides, - in_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 6f06a2251b..3d953de173 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -167,9 +167,6 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, - in_strides, - wei_strides, - out_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 60bbe90411..6d4f5e3126 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -153,9 +153,6 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, - in_strides, - wei_strides, - out_strides, hi, wi, n, @@ -181,9 +178,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, - in_strides, - wei_strides, - out_strides, + in_strides, + wei_strides, + out_strides, hi, wi, n, From be93522f0f111622a7b879bb7fd6d75c2ae51344 Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Sat, 23 Sep 2023 19:14:08 +0000 Subject: [PATCH 03/23] fix the lengths of weight tensor --- test/gtest/conv3d_test_case.hpp | 112 ++++++++++++++++++++++++++++++++ test/gtest/group_conv3d_bwd.cpp | 2 +- test/gtest/group_conv3d_bwd.hpp | 88 ++----------------------- test/gtest/group_conv3d_fwd.cpp | 2 +- test/gtest/group_conv3d_fwd.hpp | 88 ++----------------------- test/gtest/group_conv3d_wrw.cpp | 2 +- test/gtest/group_conv3d_wrw.hpp | 88 ++----------------------- test/gtest/group_solver.hpp | 6 +- 8 files changed, 132 insertions(+), 256 deletions(-) create mode 100644 test/gtest/conv3d_test_case.hpp diff --git a/test/gtest/conv3d_test_case.hpp b/test/gtest/conv3d_test_case.hpp new file mode 100644 index 0000000000..242615077f --- /dev/null +++ b/test/gtest/conv3d_test_case.hpp @@ -0,0 +1,112 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include + +#include "get_handle.hpp" +#include + +#include "../driver/tensor_driver.hpp" +#include "conv_common.hpp" + +template +miopenDataType_t GetDataType(); + +template <> +miopenDataType_t GetDataType() +{ + return miopenFloat; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenHalf; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenInt8; +} + +struct Conv3DTestCase +{ + size_t G; + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + size_t k; + size_t z; + size_t y; + size_t x; + size_t pad_x; + size_t pad_y; + size_t pad_z; + size_t stride_x; + size_t stride_y; + size_t stride_z; + size_t dilation_x; + size_t dilation_y; + size_t dilation_z; + miopenConvolutionMode_t conv_mode; + friend std::ostream& operator<<(std::ostream& os, const Conv3DTestCase& tc) + { + return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D + << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z + << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z + << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z + << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x + << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y + << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; + } + + std::vector GetInput() { return {N, C, D, H, W}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {k, C / G, z, y, x}; + } + + miopen::ConvolutionDescriptor GetConv() + { + return miopen::ConvolutionDescriptor{ + 3, + miopenConvolution, + miopenPaddingDefault, + {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, + {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, + {static_cast(dilation_z), + static_cast(dilation_y), + static_cast(dilation_x)}, + {0, 0, 0}, + static_cast(G), + 1.0}; + } +}; diff --git a/test/gtest/group_conv3d_bwd.cpp b/test/gtest/group_conv3d_bwd.cpp index 8e794749dd..6d0c3c678b 100644 --- a/test/gtest/group_conv3d_bwd.cpp +++ b/test/gtest/group_conv3d_bwd.cpp @@ -44,7 +44,7 @@ void SolverBwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_bwd.hpp b/test/gtest/group_conv3d_bwd.hpp index 410d71e6d0..71702c5808 100644 --- a/test/gtest/group_conv3d_bwd.hpp +++ b/test/gtest/group_conv3d_bwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t 
conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -133,7 +53,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvBwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -188,7 +108,7 @@ struct ConvBwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_fwd.cpp b/test/gtest/group_conv3d_fwd.cpp index 876f513258..d21bf6f67b 100644 --- a/test/gtest/group_conv3d_fwd.cpp +++ b/test/gtest/group_conv3d_fwd.cpp @@ -44,7 +44,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, Data_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_fwd.hpp b/test/gtest/group_conv3d_fwd.hpp index 983f897d78..c8767399a7 100644 --- a/test/gtest/group_conv3d_fwd.hpp +++ b/test/gtest/group_conv3d_fwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, 
const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -139,7 +59,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvFwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -195,7 +115,7 @@ struct ConvFwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_wrw.cpp b/test/gtest/group_conv3d_wrw.cpp index 0fae3402d8..398bc3becd 100644 --- a/test/gtest/group_conv3d_wrw.cpp +++ b/test/gtest/group_conv3d_wrw.cpp @@ -44,7 +44,7 @@ void SolverWrw(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, // dy const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { diff --git a/test/gtest/group_conv3d_wrw.hpp b/test/gtest/group_conv3d_wrw.hpp index 76d8ae5d90..bf5824b4fa 100644 --- a/test/gtest/group_conv3d_wrw.hpp +++ b/test/gtest/group_conv3d_wrw.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << 
tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -135,7 +55,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvWrwSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -191,7 +111,7 @@ struct ConvWrwSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_solver.hpp b/test/gtest/group_solver.hpp index 6fe02e00da..3d9ebddca3 100644 --- a/test/gtest/group_solver.hpp +++ b/test/gtest/group_solver.hpp @@ -80,7 +80,11 @@ struct ConvTestCase } std::vector GetInput() { return {N, C, H, W}; } - std::vector GetWeights() { return {k, C, y, x}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {k, C / G, y, x}; + } miopen::ConvolutionDescriptor GetConv() { From d85785b525435bbcc6f9e5d41dec5e3f0fabc521 Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Mon, 25 Sep 2023 15:09:57 +0000 Subject: [PATCH 04/23] use 64-bit integers for stride value --- src/include/miopen/solver/conv_direct_naive_conv.hpp | 2 +- src/kernels/gpu_reference_kernel/naive_conv.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 927a64feff..36ba8d6bc3 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -92,7 +92,7 @@ class MyArray __host__ __device__ ~MyArray() = default; }; -using StrideIndexType = int; +using StrideIndexType = size_t; using Strides5D = MyArray; using Strides6D = MyArray; diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 6ffb0789c4..8f2ab87cf6 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -143,7 +143,7 @@ class MyArray __host__ __device__ ~MyArray() = default; }; -using StrideIndexType = int; +using StrideIndexType = size_t; using Strides5D = MyArray; using Strides6D = MyArray; #else From ee6abb3d0e177bff55695fc7ad2ab9cf8f904aca Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Sun, 20 Aug 2023 04:07:34 +0000 Subject: [PATCH 05/23] Squash commits together replace constant with appropriate block dimension WIP: add strides as a kernel parameter for non-packed tensors WIP: created non-packed variants for remaining 3D conv kernels fix: treat input and output as 6D NDHWGC tensor use std::call_once for initializing random seed revamp naive kernels to use strides in the non-packed tensors case (controlled by a bool flag) WIP: fixed kernel compilation issues but unable to load kernel code object WIP: fixed issue with hip rtc split channel strides into group, channels_per_group in solver fix indexing to left-to-right order fix bug with too much padding between kernel args num channels should be a multiple of num groups re-enable naive ref kernels with strides array 2D forward tests are all working now WIP: debugging bwd tests WIP: tests up till 3D bwd conv passing fix bug in bwd ndhwc kernel fix formatting disable prints fix readability-inconsistent-declaration-parameter-name fix clang-format fix hip tidy issue reverting the change to static init of random seed address comments and tidy issues. 
Remove extra print removed blank line change remove unneeded include env var for choosing packed vs non-packed reference kernel fix warnings from hip-tidy address comment about array initialization clear a tiny hip tidy issue --- src/solver/conv_direct_naive_conv_bwd.cpp | 2 ++ src/solver/conv_direct_naive_conv_fwd.cpp | 2 ++ src/solver/conv_direct_naive_conv_wrw.cpp | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index 3fe005e994..cc30de3dff 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -144,6 +144,8 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 3d953de173..89f9e293dc 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -145,6 +145,8 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 6d4f5e3126..dc01ed93d4 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -131,6 +131,8 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { From 4fbcd7716d589ff06976675946588bc74e0babbf Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Mon, 18 Sep 2023 13:23:11 +0000 Subject: [PATCH 06/23] fix test for non-packed strides --- driver/random.hpp | 1 + test/gpu_reference_kernel.cpp | 105 ++++++++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/driver/random.hpp b/driver/random.hpp index 6398048dde..45b46ce571 100644 --- a/driver/random.hpp +++ b/driver/random.hpp @@ -91,6 +91,7 @@ inline T gen_0_to_B(T B) template inline T gen_A_to_B(T A, T B) { + assert(B >= A); return gen_0_to_B(B - A) + A; } diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index aa3dda788d..6de633f776 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -24,6 +24,8 @@ * *******************************************************************************/ +#include +#include #include #include #include @@ -310,6 +312,26 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "n/a"; } +// input: a vector of lengths of dims in a tensor +// multiply each element with a random constant integer +void pad_tensor_strides(std::vector& strides) { + auto pvec = [] (const char* name, const auto& vec) { + std::cout << name << ": ["; + for (const auto& v: vec) { + std::cout << v << ", "; + } + std::cout << "]\n"; + }; + + pvec("orig strides", strides); + auto c = prng::gen_A_to_B(1, 3); + // int c = 2; + for (auto& v: strides) { + v = v * c; + } + pvec("new strides", strides); +} + template in_len({n, c, hi, wi}); std::vector wei_len({k, c_per_group, fy, fx}); @@ -362,9 +384,18 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); @@ -447,6 +478,13 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); #endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, @@ -518,9 +556,16 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(in_dev, + in.data.data(), + sizeof(TRef) * in_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), @@ -600,9 +645,16 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be 
packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); #endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), @@ -719,11 +771,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base int wo = conv_out_size(wi, px, dx, fx, sx); int do_ = conv_out_size(di, pz, dz, fz, sz); int c_per_group = c / g; - int k_per_group = k / g; + // int k_per_group = k / g; - int in_sz = g * n * c_per_group * di * hi * wi; - int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; - int out_sz = g * n * k_per_group * do_ * ho * wo; + // int in_sz = g * n * c_per_group * di * hi * wi; + // int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; + // int out_sz = g * n * k_per_group * do_ * ho * wo; std::vector in_len({n, c, di, hi, wi}); std::vector wei_len({k, c_per_group, fz, fy, fx}); @@ -740,9 +792,18 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); @@ -821,6 +882,13 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, @@ -898,9 +966,15 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), @@ -980,9 +1054,16 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); #endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), From 82e0ccfda5f1de7a873ba7e6f60450cddb373174 Mon Sep 17 
00:00:00 2001 From: "M. Amber Hassaan" Date: Mon, 18 Sep 2023 13:24:43 +0000 Subject: [PATCH 07/23] fix format --- test/gpu_reference_kernel.cpp | 44 ++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 6de633f776..9dca52669c 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -314,22 +314,25 @@ static std::string miopen_type_to_string(miopenDataType_t type) // input: a vector of lengths of dims in a tensor // multiply each element with a random constant integer -void pad_tensor_strides(std::vector& strides) { - auto pvec = [] (const char* name, const auto& vec) { - std::cout << name << ": ["; - for (const auto& v: vec) { - std::cout << v << ", "; +void pad_tensor_strides(std::vector& strides) +{ + auto pvec = [](const char* name, const auto& vec) { + std::cout << name << ": ["; + for(const auto& v : vec) + { + std::cout << v << ", "; + } + std::cout << "]\n"; + }; + + pvec("orig strides", strides); + auto c = prng::gen_A_to_B(1, 3); + // int c = 2; + for(auto& v : strides) + { + v = v * c; } - std::cout << "]\n"; - }; - - pvec("orig strides", strides); - auto c = prng::gen_A_to_B(1, 3); - // int c = 2; - for (auto& v: strides) { - v = v * c; - } - pvec("new strides", strides); + pvec("new strides", strides); } template wei(wei_len, wei_strides); tensor out(out_len, out_strides); - auto in_sz = in.data.size(); + auto in_sz = in.data.size(); auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); @@ -559,10 +562,9 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base // TODO(Amber): copy output before computation because output may // be not be packed, and convolution may update only a subset of // indices - EXPECT(hipMemcpy(in_dev, - in.data.data(), - sizeof(TRef) * in_sz, - hipMemcpyHostToDevice) == hipSuccess); + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), sizeof(Tout) * out_sz, @@ -800,7 +802,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); - auto in_sz = in.data.size(); + auto in_sz = in.data.size(); auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); From 7e8a258abd57e9873d4cfa401e7913e3900683d1 Mon Sep 17 00:00:00 2001 From: amberhassaan Date: Mon, 18 Sep 2023 11:18:39 -0400 Subject: [PATCH 08/23] Fix assertion check. 
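The assertion corrected here guards the call made by pad_tensor_strides in the test changes above: prng::gen_A_to_B(1, 3) draws the stride-padding constant, and gen_0_to_B(B - A) needs a positive span, hence B > A rather than B >= A. The helper itself scales every stride by one shared constant, so dimension order is preserved while gaps open up in memory. A sketch with the constant pinned to 2 (pad_strides is a hypothetical stand-in; the real constant is random):

#include <cstddef>
#include <vector>

// Packed NCHW lengths {2, 8, 4, 4} have strides {128, 16, 4, 1};
// scaling by c = 2 yields {256, 32, 8, 2}: same dimension ordering,
// with the tensor now occupying roughly c times the memory.
std::vector<std::size_t> pad_strides(std::vector<std::size_t> strides, std::size_t c)
{
    for(auto& v : strides)
        v *= c;
    return strides;
}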
Co-authored-by: Alex Eremin --- driver/random.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/random.hpp b/driver/random.hpp index 45b46ce571..b3be81f56e 100644 --- a/driver/random.hpp +++ b/driver/random.hpp @@ -91,7 +91,7 @@ inline T gen_0_to_B(T B) template inline T gen_A_to_B(T A, T B) { - assert(B >= A); + assert(B > A); return gen_0_to_B(B - A) + A; } From cadfb9583040ce36ed5c7b8960bacffa053bcb8a Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Tue, 19 Sep 2023 18:13:40 -0700 Subject: [PATCH 09/23] suppress cppcheck warning to test CI --- test/gpu_reference_kernel.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 9dca52669c..31061c346c 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -330,6 +330,7 @@ void pad_tensor_strides(std::vector& strides) // int c = 2; for(auto& v : strides) { + // cppcheck-suppress useStlAlgorithm v = v * c; } pvec("new strides", strides); From cde6e22cfef5e611a90925461ecec0698e7e53c6 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Mon, 25 Sep 2023 15:52:56 +0000 Subject: [PATCH 10/23] fix build and remove a check that prevents non-strided inputs --- src/ocl/convolutionocl.cpp | 3 ++- src/solver/conv_direct_naive_conv_bwd.cpp | 10 ++-------- src/solver/conv_direct_naive_conv_fwd.cpp | 10 ++-------- src/solver/conv_direct_naive_conv_wrw.cpp | 10 ++-------- 4 files changed, 8 insertions(+), 25 deletions(-) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 07e6f28b19..d012125499 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -417,11 +417,12 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, size_t workSpaceSize) const { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); - +#if 0 if(!(xDesc.IsPacked() && wDesc.IsPacked() && yDesc.IsPacked())) { MIOPEN_THROW(miopenStatusNotImplemented, "Only fully packed tensors are supported"); } +#endif const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; ValidateConvTensors(tensors); diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index cc30de3dff..5bb765c56a 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -134,15 +134,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); - int G_stride_idx = GetGroupStrideIndex(problem); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 89f9e293dc..12ba295934 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -124,12 +124,6 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); 
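// Context for the `#if 0` in ConvolutionForward above: with the stride-aware
// reference kernels in place, non-packed descriptors become legal inputs, so
// the "fully packed tensors only" guard is retired. The property that guard
// relied on can be sketched as follows. This is an illustration, not
// MIOpen's actual IsPacked() implementation, and it assumes dimensions are
// listed slowest-varying first: a descriptor is packed when each stride
// equals the product of all faster-varying lengths.
#include <cstddef>
#include <vector>

inline bool is_packed_sketch(const std::vector<std::size_t>& lens,
                             const std::vector<std::size_t>& strides)
{
    std::size_t expected = 1;
    for(int i = static_cast<int>(lens.size()) - 1; i >= 0; --i)
    {
        if(strides[i] != expected)
            return false; // found a gap, e.g. strides padded by the tests
        expected *= lens[i];
    }
    return true;
}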
kernel.g_wk.clear(); @@ -141,9 +135,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); - int G_stride_idx = GetGroupStrideIndex(problem); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index dc01ed93d4..e9fb0dee57 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -121,15 +121,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); - int G_stride_idx = GetGroupStrideIndex(problem); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = GetGroupStrideIndex(problem); From e06c523aaccd7a460ef6b78678df9b348917c91b Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Tue, 26 Sep 2023 22:21:43 +0000 Subject: [PATCH 11/23] addressed comments. Moved common code into an include file --- src/CMakeLists.txt | 1 + src/include/miopen/hipoc_kernel.hpp | 31 +-- .../miopen/solver/conv_direct_naive_conv.hpp | 73 +----- .../gpu_reference_kernel/naive_conv.cpp | 219 ++---------------- src/kernels/stride_array.hpp | 86 +++++++ src/solver/conv_direct_naive_conv.cpp | 21 +- src/solver/conv_direct_naive_conv_bwd.cpp | 17 +- src/solver/conv_direct_naive_conv_fwd.cpp | 2 - 8 files changed, 133 insertions(+), 317 deletions(-) create mode 100644 src/kernels/stride_array.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffed2b4c8..f64feac4e5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -388,6 +388,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/workaround_issue_1431.hpp kernels/hip_f8_impl.hpp kernels/hip_float8.hpp + kernels/stride_array.hpp ) set(MIOPEN_KERNELS diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index b18955b5c2..938d20d9e6 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -26,14 +26,15 @@ #ifndef GUARD_MIOPEN_HIPOC_KERNEL_HPP #define GUARD_MIOPEN_HIPOC_KERNEL_HPP -#include -#include #include #include #include #include + +#include +#include +#include #include -#include namespace miopen { @@ -47,36 +48,20 @@ inline HipEventPtr make_hip_event() #if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 -#if 0 // Keep around other storage techinques -- @pfultz2 27.03.2017 -template -struct KernelArgsPair -{ - static const int alignment = sizeof(U); - static const int padding = (alignment - sizeof(T) % alignment) % alignment; - static const int second_index = sizeof(T) + padding; - KernelArgsPair(T x, U y) - { - new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) - new(buffer + second_index) U(y); - } - alignas(U) char buffer[second_index + sizeof(U)] = {}; -}; -#else template struct KernelArgsPair { - static const int alignment = alignof(U); - static const int padding = (alignment - (sizeof(T) % 
alignment)) % alignment; - static_assert(padding >= 0, "padding cannot be negative"); - static const int second_index = sizeof(T) + padding; + constexpr static const auto alignU = alignof(U); + constexpr static const auto padding = (alignU - (sizeof(T) % alignU)) % alignU; + constexpr static const auto second_index = sizeof(T) + padding; KernelArgsPair(T x, U y) { new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) new(buffer + second_index) U(y); } + alignas(U) char buffer[second_index + sizeof(U)] = {}; }; -#endif template struct KernelArgsPack; diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 31b399c409..0d0029d814 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -27,12 +27,13 @@ #include #include +#include "miopen/../../kernels/stride_array.hpp" #include #include +#include #include #include -#include namespace miopen { @@ -65,54 +66,6 @@ void printTensorStrides(const TensorDescriptor& inDesc, const TensorDescriptor& wDesc, const TensorDescriptor& outDesc); -// TODO(Amber): Uncomment when hip RTC accepts std::array -// using StrideIndexType = int; -// using Strides3D = std::array; -// using Strides4D = std::array; -// using Strides5D = std::array; -// using Strides6D = std::array; -#if 1 -template -class MyArray -{ - T data_[N] = {}; - -public: - constexpr static const unsigned SIZE = N; - __host__ __device__ constexpr unsigned size() const { return N; } - - __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } - - __host__ T& operator[](unsigned i) { return data_[i]; } - - __host__ __device__ MyArray() = default; - __host__ __device__ MyArray(const MyArray&) = default; - __host__ __device__ MyArray(MyArray&&) noexcept = default; - __host__ __device__ MyArray& operator=(const MyArray&) = default; - __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; - __host__ __device__ ~MyArray() = default; -}; - -using StrideIndexType = size_t; -using Strides5D = MyArray; -using Strides6D = MyArray; - -#else - -extern "C" typedef int StrideIndexType; - -extern "C" typedef struct -{ - StrideIndexType v[5]; -} Strides5D; - -extern "C" typedef struct -{ - StrideIndexType v[6]; -} Strides6D; - -#endif - namespace internal { template struct ChooseStride @@ -189,27 +142,5 @@ V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) return ret; } -template -void printStrideArray(const char* name, const StrideArray& sarr) -{ - printf("%s = [", name); - for(unsigned i = 0; i < StrideArray::SIZE; ++i) - { - printf("%d,", sarr[i]); - } - printf("]\n"); -} - -template -void printStrideArrays(const StrideArray& in_strides, - const StrideArray& wei_strides, - const StrideArray& out_strides) -{ - - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); -} - } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 8f2ab87cf6..60c3583e77 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -46,6 +46,8 @@ typedef float float_t; #endif #endif // __HIPCC_RTC__ +#include "stride_array.hpp" + // hcc seems need __device__ __host__ together to compile, and no extern "C" typedef union value_bf16_fp32_t { @@ -114,13 +116,8 @@ inline __device__ __host__ int8_t 
cast_to(const int32_t& val) return static_cast(val & 0xff); } +#if 0 // TODO(Amber): this file is compiled via HIP RTC and includes don't work easily -// so currently duplicating content from miopen/common.hpp -// #include "miopen/common.hpp" -// #include -// TODO(Amber): HIP RTC redefines stuff from std library (I don't know why) -// #include -#if 1 template class MyArray { @@ -146,81 +143,22 @@ class MyArray using StrideIndexType = size_t; using Strides5D = MyArray; using Strides6D = MyArray; -#else - -extern "C" typedef int StrideIndexType; - -extern "C" typedef struct -{ - StrideIndexType v[5]; -} Strides5D; - -extern "C" typedef struct -{ - StrideIndexType v[6]; -} Strides6D; - -extern "C" __global__ void testKernel(void* ptr_a, - void* ptr_b, - void* ptr_c, - Strides5D in_strides, - Strides5D wei_strides, - Strides5D out_strides) -{ - - if(blockIdx.x == 0 && threadIdx.x == 0) - { - printf("sizeof(Strides5D) = %lu\n", sizeof(Strides5D)); - printf("%p, %p, %p, %p\n", &ptr_a, &ptr_b, &ptr_c, &in_strides); - printf("in_strides = ["); - for(int i = 0; i < 5; ++i) - { - // printf("%d,", in_strides.v[i]); - printf("%d,", in_strides[i]); - } - printf("]\n"); - printf("wei_strides = ["); - for(int i = 0; i < 5; ++i) - { - // printf("%d,", wei_strides.v[i]); - printf("%d,", wei_strides[i]); - } - printf("]\n"); - printf("out_strides = ["); - for(int i = 0; i < 5; ++i) - { - // printf("%d,", out_strides.v[i]); - printf("%d,", out_strides[i]); - } - printf("]\n"); - } -} #endif -template -__device__ void printStrideArray(const char* name, const StrideArray& sarr) -{ - printf("%s = [", name); - for(int i = 0; i < StrideArray::SIZE; ++i) - { - printf("%d,", sarr[i]); - } - printf("]\n"); -} +/// \todo remove template parameter 'bool ASSUME_PACKED' in a follow up PR +/// --amberhassaan +/// Notes (Amber): +/// - The following code used to assume that group (G) is an implicit +/// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not +/// true for non-packed case because group (G) dimension needs to have its stride +/// explicitly specified for address math to make sense. This is also how +/// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, +/// and nhwc should be nhwgc. Same follows for the 3D case. +/// +/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is reverse of how strides are +/// stored in tensor descriptors, which are big-endian. -// TODO(Amber): remove template parameter 'bool ASSUME_PACKED' in a follow up PR -// Notes (Amber): -// * The following code used to assume that group (G) is an implicit -// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not -// true for non-packed case because group (G) dimension needs to have its stride -// explicitly specified for address math to make sense. This is also how -// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, -// and nhwc should be nhwgc. Same follows for the 3D case. -// * strides here are in the little-endian order, i.e., for NHWC, stride for N is -// at index 3 while stride for C is at index 0. This is reverse of how strides are -// stored in tensor descriptors, which are big-endian. 
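To make the stride convention above concrete, here is a minimal editor's sketch (not part of the patch; the function name and tensor sizes are hypothetical) of how a kernel computes an element offset from a little-endian strides array for an NHWGC tensor:

#include <cstddef>

// Little-endian strides put the fastest-varying dimension first, so for
// NHWGC the order is C, G, W, H, N. For a packed tensor with n=2, h=3,
// w=4, g=2, c_per_group=8 the strides would be {1, 8, 16, 64, 192}.
inline size_t nhwgc_offset(
    const size_t strides[5], size_t in, size_t ih, size_t iw, size_t ig, size_t ic)
{
    return ic * strides[0] + ig * strides[1] + iw * strides[2] + ih * strides[3] +
           in * strides[4];
}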
- -// TODO(Amber): Rename nchw to ngchw template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, @@ -245,43 +183,6 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int fx, int group) { - - // TODO(Amber): Remove this code - /* - if (blockIdx.x == 0 && threadIdx.x == 0) { - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); - - printf("modified strides\n"); - Strides5D in_strd; - Strides5D wei_strd; - Strides5D out_strd; - - in_strd[0] = 1; - in_strd[1] = wi; - in_strd[2] = hi * wi; - in_strd[3] = c_per_group * hi * wi; - in_strd[4] = group * c_per_group * hi * wi; - - wei_strd[0] = 1; - wei_strd[1] = fx; - wei_strd[2] = fy * fx; - wei_strd[3] = c_per_group * fy * fx; - wei_strd[4] = k_per_group * c_per_group * fy * fx; - - out_strd[0] = 1; - out_strd[1] = wo; - out_strd[2] = ho * wo; - out_strd[3] = k_per_group * ho * wo; - out_strd[4] = group * k_per_group * ho * wo; - - printStrideArray("in_strd", in_strd); - printStrideArray("wei_strd", wei_strd); - printStrideArray("out_strd", out_strd); - } - */ - /* * need to compute total output pixel: `group * n * k_per_group * ho * wo`. * to distribute this workload, let one workgroup compute `ho * wo` pixel, @@ -324,7 +225,6 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int iho = tid / wo; int iwo = tid % wo; - // double value = .0f; acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) @@ -454,7 +354,6 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, int ihi = tid / wi; int iwi = tid % wi; - // double value = .0f; acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) @@ -591,7 +490,6 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, int iy = (tid / fx) % fy; int ic = tid / (fx * fy); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -737,7 +635,6 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, int iho = (tid / wo) % ho; int ido = tid / (ho * wo); - // double value = .0f; acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) @@ -892,7 +789,6 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, int ihi = (tid / wi) % hi; int idi = tid / (hi * wi); - // double value = .0f; acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) @@ -1053,7 +949,6 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, int iz = (tid / (fx * fy)) % fz; int ic = tid / (fx * fy * fz); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -1202,7 +1097,6 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, int iwo = tid / k_per_group; int ik = tid % k_per_group; - // double value = .0f; acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) @@ -1294,41 +1188,6 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int fx, int group) { - /* - if (blockIdx.x == 0 && threadIdx.x == 0) { - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); - - printf("modified strides\n"); - Strides5D in_strd; - Strides5D wei_strd; - Strides5D out_strd; - - in_strd[0] = 1; - in_strd[1] = k_per_group; - in_strd[2] = group * k_per_group; - in_strd[3] = group * k_per_group * wo; - in_strd[4] = group * 
k_per_group * wo * ho; - - wei_strd[0] = 1; - wei_strd[1] = c_per_group; - wei_strd[2] = c_per_group * fx; - wei_strd[3] = c_per_group * fx * fy; - wei_strd[4] = c_per_group * fx * fy * k_per_group; - - out_strd[0] = 1; - out_strd[1] = c_per_group; - out_strd[2] = group * c_per_group; - out_strd[3] = group * c_per_group * wi; - out_strd[4] = group * c_per_group * wi * hi; - - printStrideArray("in_strd", in_strd); - printStrideArray("wei_strd", wei_strd); - printStrideArray("out_strd", out_strd); - } - */ - /* * need to compute total input pixel: `group * n * hi * wi * c_per_group`. * to distribute this workload, let one workgroup compute `wi * @@ -1368,7 +1227,6 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int iwi = tid / c_per_group; int ic = tid % c_per_group; - // double value = .0f; acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) @@ -1506,7 +1364,6 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, int ix = (tid / c_per_group) % fx; int iy = tid / (c_per_group * fx); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -1652,7 +1509,6 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, int iwo = (tid / k_per_group) % wo; int iho = tid / (k_per_group * wo); - // double value = .0f; acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) @@ -1763,43 +1619,6 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int fx, int group) { - /* - if (blockIdx.x == 0 && threadIdx.x == 0) { - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); - - printf("modified strides\n"); - Strides6D in_strd; - Strides6D wei_strd; - Strides6D out_strd; - - in_strd[0] = 1; - in_strd[1] = k_per_group; - in_strd[2] = group * k_per_group; - in_strd[3] = group * k_per_group * wo; - in_strd[4] = group * k_per_group * wo * ho; - in_strd[5] = group * k_per_group * wo * ho * do_; - - wei_strd[0] = 1; - wei_strd[1] = c_per_group; - wei_strd[2] = c_per_group * fx; - wei_strd[3] = c_per_group * fx * fy; - wei_strd[4] = c_per_group * fx * fy * fz; - wei_strd[5] = k_per_group * c_per_group * fx * fy * fz; - - out_strd[0] = 1; - out_strd[1] = c_per_group; - out_strd[2] = group * c_per_group; - out_strd[3] = group * c_per_group * wi; - out_strd[4] = group * c_per_group * wi * hi; - out_strd[5] = group * c_per_group * wi * hi * di; - - printStrideArray("in_strd", in_strd); - printStrideArray("wei_strd", wei_strd); - printStrideArray("out_strd", out_strd); - } - */ /* * need to compute total input pixel: `group * n * di * hi * wi * @@ -1843,7 +1662,6 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int iwi = (tid / c_per_group) % wi; int ihi = (tid / (c_per_group * wi)); - // double value = .0f; acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) @@ -2004,7 +1822,6 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, int iy = (tid / (c_per_group * fx)) % fy; int iz = (tid / (c_per_group * fx * fy)); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -2363,3 +2180,7 @@ DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort) DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float) DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half) DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort) + +/// \todo discuss whether we should split the kernels into separate files, or +/// figure 
out a mechanism to compile each kernel separately to reduce hipRTC +/// compilation times. --amberhassaan diff --git a/src/kernels/stride_array.hpp b/src/kernels/stride_array.hpp new file mode 100644 index 0000000000..32cb1f85b6 --- /dev/null +++ b/src/kernels/stride_array.hpp @@ -0,0 +1,86 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#ifdef __HIPCC_RTC__ +#ifndef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE +#include +#endif +#endif // __HIPCC_RTC__ + +/// \todo Uncomment when hip RTC accepts std::array -- amberhassaan +// #include +// using StrideIndexType = int; +// using Strides3D = std::array; +// using Strides4D = std::array; +// using Strides5D = std::array; +// using Strides6D = std::array; +template +class MyArray +{ + T data_[N] = {}; + +public: + constexpr static const unsigned SIZE = N; + __host__ __device__ constexpr unsigned size() const { return N; } + + __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } + + __host__ T& operator[](unsigned i) { return data_[i]; } + + __host__ __device__ MyArray() = default; + __host__ __device__ MyArray(const MyArray&) = default; + __host__ __device__ MyArray(MyArray&&) noexcept = default; + __host__ __device__ MyArray& operator=(const MyArray&) = default; + __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; + __host__ __device__ ~MyArray() = default; +}; + +using StrideIndexType = size_t; +using Strides5D = MyArray; +using Strides6D = MyArray; + +template +__host__ __device__ void printStrideArray(const char* name, const StrideArray& sarr) +{ + printf("%s = [", name); + for(int i = 0; i < StrideArray::SIZE; ++i) + { + printf("%zu,", sarr[i]); + } + printf("]\n"); +} + +template +__host__ __device__ void printStrideArrays(const StrideArray& in_strides, + const StrideArray& wei_strides, + const StrideArray& out_strides) +{ + + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); +} diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index a0d6637978..e39ee06a66 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -33,7 +33,6 @@ #include #include 
#include -#include namespace miopen { @@ -107,10 +106,12 @@ bool IsOutputInt32(const ProblemDescription& problem) problem.GetOutDataType() == miopenInt32; } +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); + std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; - if(miopen::IsEnvvarValueEnabled("MIOPEN_USE_PACKED_CONV_REF_KERNEL")) + if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } @@ -183,8 +184,6 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) else MIOPEN_THROW("unsupported data type:"); - // TODO(Amber): Left for debugging. Will remove in the future. - // std::cout << "############ kernel_name = " << kernel_name.str() << std::endl; return kernel_name.str(); } @@ -256,10 +255,10 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx, return true; } -// figure out the index of C (channel) stride so we can expand it into -// (G, C_per_group). Return value G_stride_idx is the position of G stride -// in the stride vector, such that the (G_stride_idx - 1) is the index that -// contains C's stride as a multiplying factor +/// Figure out the index of C (channel) stride so we can expand it into +/// (G, C_per_group). Return value G_stride_idx is the position of G stride +/// in the stride vector, such that the (G_stride_idx - 1) is the index that +/// contains C's stride as a multiplying factor int GetGroupStrideIndex(const ProblemDescription& problem) { int G_stride_idx = -1; @@ -287,12 +286,12 @@ void printTensorStrides(const TensorDescriptor& inDesc, { auto printOneStrideVec = [](const char* name, const auto& vec) { - printf("%s = [", name); + MIOPEN_LOG_I(name << " = ["); for(const size_t v : vec) { - printf("%zu,", v); + MIOPEN_LOG_I(v << ","); } - printf("]\n"); + MIOPEN_LOG_I("]\n"); }; printOneStrideVec("inDesc = ", inDesc.GetStrides()); diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index d0d5f579a1..c0aa1385f8 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -159,10 +159,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); auto out_strides = MakeStrideArray<5>( SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); - // TODO(Amber): Someone made the silly decision of swapping in and - // out pointers in ConvTensors for backward pass, so now I have to - // pass out in place of in, out_strides in place of in_strides and - // vice-versa + /// \ref backward_tensors_reversed_why if(is_f8) { handle.Run(kern)(tensors.out, @@ -241,13 +238,11 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, auto out_strides = MakeStrideArray<6>( SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); - // printTensorStrides(tensors.inDesc, tensors.wDesc, tensors.outDesc); - // printStrideArrays(in_strides, wei_strides, out_strides); - - // TODO(Amber): Someone made the silly decision of swapping in and - // out pointers in ConvTensors for backward pass, so now I have to - // pass out in place of in, out_strides in place of in_strides and - // vice-versa + /// \anchor backward_tensors_reversed_why + /// \todo Someone made the silly decision of swapping in and + /// out pointers in ConvTensors for backward pass, so now I have to + /// pass out in 
place of in, out_strides in place of in_strides and + /// vice-versa --amberhassaan handle.Run(kern)(tensors.out, tensors.w, tensors.in, diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 49cbe49a1c..11ac4dd191 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -27,8 +27,6 @@ #include #include #include -#include -#include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD) From 67d9a77f0810e4672ecec65e1ebc591c06ff36f6 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Tue, 26 Sep 2023 22:44:33 +0000 Subject: [PATCH 12/23] address comments --- src/ocl/convolutionocl.cpp | 6 ------ test/gpu_reference_kernel.cpp | 37 +++++++++-------------------------- 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 741f499703..48e14e3f30 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -416,12 +416,6 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, size_t workSpaceSize) const { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); -#if 0 - if(!(xDesc.IsPacked() && wDesc.IsPacked() && yDesc.IsPacked())) - { - MIOPEN_THROW(miopenStatusNotImplemented, "Only fully packed tensors are supported"); - } -#endif const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; ValidateConvTensors(tensors); diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 31061c346c..13eae6f01e 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -367,11 +367,6 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base int ho = conv_out_size(hi, py, dy, fy, sy); int wo = conv_out_size(wi, px, dx, fx, sx); int c_per_group = c / g; - // int k_per_group = k / g; - - // int in_sz = g * n * c_per_group * hi * wi; - // int wei_sz = g * k_per_group * c_per_group * fy * fx; - // int out_sz = g * n * k_per_group * ho * wo; std::vector in_len({n, c, hi, wi}); std::vector wei_len({k, c_per_group, fy, fx}); @@ -482,9 +477,10 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \anchor copy_non_packed_output_before_convolution + /// \note copy output before computation because output may + /// be not be packed, and convolution may update only a subset of + /// indices EXPECT(hipMemcpy(out_dev, out.data.data(), sizeof(Tout) * out_sz, @@ -560,9 +556,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -648,9 +642,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy(wei_dev, wei.data.data(), 
sizeof(TRef) * wei_sz, @@ -774,11 +766,6 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base int wo = conv_out_size(wi, px, dx, fx, sx); int do_ = conv_out_size(di, pz, dz, fz, sz); int c_per_group = c / g; - // int k_per_group = k / g; - - // int in_sz = g * n * c_per_group * di * hi * wi; - // int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; - // int out_sz = g * n * k_per_group * do_ * ho * wo; std::vector in_len({n, c, di, hi, wi}); std::vector wei_len({k, c_per_group, fz, fy, fx}); @@ -885,9 +872,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy(out_dev, out.data.data(), sizeof(Tout) * out_sz, @@ -969,9 +954,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -1057,9 +1040,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, From 0f16c625af2debfb9e757661a22199a029c11b3a Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Thu, 28 Sep 2023 17:30:39 +0000 Subject: [PATCH 13/23] address review comments --- src/include/miopen/hipoc_kernel.hpp | 6 +- .../miopen/solver/conv_direct_naive_conv.hpp | 87 ++++++++++--------- .../gpu_reference_kernel/naive_conv.cpp | 29 ------- src/solver/conv_direct_naive_conv.cpp | 8 +- src/solver/conv_direct_naive_conv_bwd.cpp | 28 +++--- src/solver/conv_direct_naive_conv_fwd.cpp | 28 +++--- src/solver/conv_direct_naive_conv_wrw.cpp | 28 +++--- 7 files changed, 98 insertions(+), 116 deletions(-) diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index 938d20d9e6..73ac77f160 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -51,9 +51,9 @@ inline HipEventPtr make_hip_event() template struct KernelArgsPair { - constexpr static const auto alignU = alignof(U); - constexpr static const auto padding = (alignU - (sizeof(T) % alignU)) % alignU; - constexpr static const auto second_index = sizeof(T) + padding; + constexpr static auto alignU = alignof(U); + constexpr static auto padding = (alignU - (sizeof(T) % alignU)) % alignU; + constexpr static auto second_index = sizeof(T) + padding; KernelArgsPair(T x, U y) { new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 0d0029d814..6d935b249d 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -60,49 +60,18 @@ bool IsOutputBfp16(const ProblemDescription&); bool IsOutputInt8(const ProblemDescription&); bool IsOutputInt32(const ProblemDescription&); -int GetGroupStrideIndex(const ProblemDescription& problem); - -void printTensorStrides(const TensorDescriptor& inDesc, - const TensorDescriptor& wDesc, - const TensorDescriptor& outDesc); - -namespace internal { -template -struct ChooseStride -{ -}; - -template <> -struct ChooseStride<5u> -{ - using type = Strides5D; -}; - -template <> -struct ChooseStride<6u> -{ - using type = Strides6D; -}; - -} // end namespace internal +namespace conv_internal { -template -auto MakeStrideArray(V vec) -{ - typename internal::ChooseStride::type ret; - assert(vec.size() == N); - - // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. - // We sort the input stride vector so that smallest stride is at index 0. This - // (little-endian) order is what naive convolution kernel expects for strides - std::sort(vec.begin(), vec.end()); +void DebugPrintTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc); - for(unsigned i = 0; i < N; ++i) - { - ret[i] = static_cast(vec[i]); - } - return ret; -} +/** + * Get the index where group (G) stride should go. For NCHW, we want to convert + * its strides to NGCHW, and for NHWC, we want to convert its strides to NHWGC. + * Same applies for the 3D case. + */ +int GetGroupStrideIndex(const ProblemDescription& problem); /** * split the strides for C dimension in a tensor descriptor into (G, C_per_group). 
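As a worked example of the (G, C_per_group) split described above (an editor's sketch under the header's stated semantics, not the patch's exact code -- the real implementation is SplitStrideCtoGC below): a packed NCHW input with N=2, C=8, H=3, W=3 and num_groups=2 has descriptor strides {72, 9, 3, 1}; with G_stride_idx = 1 these become NGCHW strides {72, 36, 9, 3, 1}, where 36 = 72 / 2 is the new G stride.

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<size_t> split_stride_c_to_gc(int num_groups,
                                         const std::vector<size_t>& strides,
                                         int g_stride_idx)
{
    assert(g_stride_idx > 0 && static_cast<size_t>(g_stride_idx) <= strides.size());
    // keep everything left of C, ...
    std::vector<size_t> ret(strides.begin(), strides.begin() + g_stride_idx);
    // ... insert G, whose stride is the stride left of C divided by group count, ...
    ret.push_back(strides[g_stride_idx - 1] / num_groups);
    // ... then keep C (now C_per_group) and everything to its right unchanged.
    ret.insert(ret.end(), strides.begin() + g_stride_idx, strides.end());
    return ret;
}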
@@ -142,5 +111,41 @@ V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) return ret; } +template +struct ChooseStride +{ +}; + +template <> +struct ChooseStride<5u> +{ + using type = Strides5D; +}; + +template <> +struct ChooseStride<6u> +{ + using type = Strides6D; +}; + +template +auto MakeStrideArray(V vec) +{ + typename ChooseStride::type ret; + assert(vec.size() == N); + + // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. + // We sort the input stride vector so that smallest stride is at index 0. This + // (little-endian) order is what naive convolution kernel expects for strides + std::sort(vec.begin(), vec.end()); + + for(unsigned i = 0; i < N; ++i) + { + ret[i] = static_cast(vec[i]); + } + return ret; +} +} // end namespace conv_internal + } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 60c3583e77..b243b1234a 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -116,35 +116,6 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) return static_cast(val & 0xff); } -#if 0 -// TODO(Amber): this file is compiled via HIP RTC and includes don't work easily -template -class MyArray -{ - T data_[N] = {}; - -public: - constexpr static const unsigned SIZE = N; - - __host__ __device__ constexpr unsigned size() const { return N; } - - __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } - - __host__ __device__ T& operator[](unsigned i) { return data_[i]; } - - __host__ __device__ MyArray() = default; - __host__ __device__ MyArray(const MyArray&) = default; - __host__ __device__ MyArray(MyArray&&) noexcept = default; - __host__ __device__ MyArray& operator=(const MyArray&) = default; - __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; - __host__ __device__ ~MyArray() = default; -}; - -using StrideIndexType = size_t; -using Strides5D = MyArray; -using Strides6D = MyArray; -#endif - /// \todo remove template parameter 'bool ASSUME_PACKED' in a follow up PR /// --amberhassaan /// Notes (Amber): diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index e39ee06a66..63c308c0fb 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -259,7 +259,7 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx, /// (G, C_per_group). 
Return value G_stride_idx is the position of G stride /// in the stride vector, such that the (G_stride_idx - 1) is the index that /// contains C's stride as a multiplying factor -int GetGroupStrideIndex(const ProblemDescription& problem) +int conv_internal::GetGroupStrideIndex(const ProblemDescription& problem) { int G_stride_idx = -1; if(problem.IsLayoutDefault()) @@ -280,9 +280,9 @@ int GetGroupStrideIndex(const ProblemDescription& problem) return G_stride_idx; } -void printTensorStrides(const TensorDescriptor& inDesc, - const TensorDescriptor& wDesc, - const TensorDescriptor& outDesc) +void conv_internal::DebugPrintTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc) { auto printOneStrideVec = [](const char* name, const auto& vec) { diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index c0aa1385f8..1a28f8aae6 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -142,7 +142,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, }(); kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); if(problem.Is2d()) { @@ -152,13 +152,14 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<5>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); /// \ref backward_tensors_reversed_why if(is_f8) { @@ -230,13 +231,14 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<6>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); /// \anchor backward_tensors_reversed_why /// \todo Someone made the silly decision of swapping in and diff --git 
a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 11ac4dd191..a4656d929a 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -141,7 +141,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); if(problem.Is2d()) { @@ -152,13 +152,14 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<5>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); if(is_f8) { @@ -230,13 +231,14 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<6>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.in, tensors.w, tensors.out, diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index d02e1fbdb5..dfe1c342b0 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -129,7 +129,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, return false; }(); - int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); if(problem.Is2d()) { @@ -140,13 +140,14 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = 
MakeStrideArray<5>( - SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); - auto out_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); if(is_f8) { @@ -218,13 +219,14 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<6>( - SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); - auto out_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.x, tensors.dw, From dec88a7ca57fa71f645c5ba81a601e0a2a273c4c Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Fri, 29 Sep 2023 08:28:03 +0000 Subject: [PATCH 14/23] add more checks for strides --- src/include/miopen/convolution.hpp | 4 ++ src/ocl/convolutionocl.cpp | 103 ++++++++++++++++++++++------- test/gpu_reference_kernel.cpp | 41 +++++++----- 3 files changed, 106 insertions(+), 42 deletions(-) diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp index bac0133106..d94211ed2e 100644 --- a/src/include/miopen/convolution.hpp +++ b/src/include/miopen/convolution.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -404,6 +405,9 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor friend void to_json(nlohmann::json& json, const ConvolutionDescriptor& conv); friend void from_json(const nlohmann::json& json, ConvolutionDescriptor& conv); + +private: + void ValidateConvTensors(const ConvTensors& conv_tensors) const; }; void ConvolutionBackwardBias(const Handle& handle, diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 48e14e3f30..b9d221f8a9 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -287,31 +287,6 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle, namespace { -void ValidateConvTensors(const ConvTensors& tensors) -{ - const auto invalid_buffers = - tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr; - - const auto tensor_sizes_not_matched = tensors.xDesc.GetSize() != tensors.yDesc.GetSize() || - tensors.xDesc.GetSize() != tensors.wDesc.GetSize(); - - const auto trivial_tensor_types_not_matched = - tensors.xDesc.GetType() != tensors.yDesc.GetType() && - tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4; - - // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { - // MIOPEN_THROW(miopenStatusBadParm); - //} - - const auto x_tensor_invalid = tensors.xDesc.GetSize() < 3; - - const auto bad_parameters = invalid_buffers || 
tensor_sizes_not_matched ||
-                                trivial_tensor_types_not_matched || x_tensor_invalid;
-
-    if(bad_parameters)
-        MIOPEN_THROW(miopenStatusBadParm);
-}
-
 void ValidateAlphaBeta(const void* alpha, const void* beta)
 {
     if(!float_equal(*(static_cast<const float*>(alpha)), 1.0) ||
@@ -402,6 +377,84 @@ static void ConvForwardCheckNumerics(const Handle& handle,
     }
 }

+void ConvolutionDescriptor::ValidateConvTensors(const ConvTensors& tensors) const
+{
+
+    // Group stride in current TensorDescriptor is implicit. When invoking kernels,
+    // we need to add the group dimension G and compute its stride. We want the stride
+    // left of C to be a multiple of group count G. e.g. for NCHW, the stride for N
+    // should be a multiple of G so that we can compute the strides for NGCHW
+    auto bad_group_stride = [this](const TensorDescriptor& td) {
+        auto l             = td.GetLayout_t();
+        int g_stride_index = -1;
+        if(l == miopenTensorNCHW || l == miopenTensorNCDHW)
+        {
+            g_stride_index = 0; // stride index for N;
+        }
+        else if(l == miopenTensorNHWC || l == miopenTensorNDHWC)
+        {
+            // stride index for W. Normally this would be 2nd-last stride but we store
+            // strides in NCHW order for some weird reason.
+            g_stride_index = td.GetStrides().size() - 1;
+        }
+
+        if(g_stride_index != -1)
+        {
+            return (td.GetStrides()[g_stride_index] % this->group_count) != 0;
+        }
+
+        return false;
+    };
+
+    // invalid_buffers
+    if(tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "One of the convolution tensors is null");
+    }
+
+    // x_tensor_invalid =
+    if(tensors.xDesc.GetSize() < 3)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "input tensor's number of dimensions is wrong");
+    }
+
+    // tensor_sizes_not_matched =
+    if(tensors.xDesc.GetSize() != tensors.yDesc.GetSize() ||
+       tensors.xDesc.GetSize() != tensors.wDesc.GetSize())
+    {
+        MIOPEN_THROW(miopenStatusBadParm,
+                     "number of dimensions mismatch between input, output and weights tensors");
+    }
+
+    // trivial_tensor_types_not_matched =
+    if(tensors.xDesc.GetType() != tensors.yDesc.GetType() &&
+       tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "input/output tensor data types do not match");
+    }
+
+    // check for bad_group_stride. This applies for input and output only. There
+    // is no check for weight tensor currently.
+    // no need to check for group_count == 1
+
+    if((this->group_count > 1) && bad_group_stride(tensors.xDesc))
+    {
+        MIOPEN_THROW(
+            miopenStatusBadParm,
+            "Invalid input tensor strides. Channel stride must be a multiple of group count");
+    }
+    if((this->group_count > 1) && bad_group_stride(tensors.yDesc))
+    {
+        MIOPEN_THROW(
+            miopenStatusBadParm,
+            "Invalid output tensor strides.
Channel stride must be a multiple of group count"); + } + + // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { + // MIOPEN_THROW(miopenStatusBadParm); + //} +} + void ConvolutionDescriptor::ConvolutionForward(Handle& handle, const void* alpha, const TensorDescriptor& xDesc, diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 13eae6f01e..f8bbaa8e86 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -312,28 +312,19 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "n/a"; } -// input: a vector of lengths of dims in a tensor -// multiply each element with a random constant integer +/// input: a vector of lengths of dims in a tensor +/// multiply each element with a random constant integer void pad_tensor_strides(std::vector& strides) { - auto pvec = [](const char* name, const auto& vec) { - std::cout << name << ": ["; - for(const auto& v : vec) - { - std::cout << v << ", "; - } - std::cout << "]\n"; - }; + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 5; - pvec("orig strides", strides); - auto c = prng::gen_A_to_B(1, 3); - // int c = 2; + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); for(auto& v : strides) { // cppcheck-suppress useStlAlgorithm v = v * c; } - pvec("new strides", strides); } template Date: Tue, 3 Oct 2023 14:30:51 +0000 Subject: [PATCH 15/23] fix test now that strides are supported --- test/gtest/conv_api_strided_tensors.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 2a59dcd696..9a2876b3f0 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,7 +139,6 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; -// This test should be replaced when strided tensors are fully implemented TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -178,9 +177,8 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) const float alpha = 1.f; const float beta = 0.f; - // miopenConvolutionForward() must return error if the format is not supported ASSERT_TRUE(device.Synchronize()); - ASSERT_NE(miopenConvolutionForward(handle, + ASSERT_EQ(miopenConvolutionForward(handle, &alpha, input_descr, d_input.Data(), From 39eee971062bdcaea82d48e00c6fe01e26842258 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 4 Oct 2023 15:30:15 +0000 Subject: [PATCH 16/23] use C++17 to compile HIP Kernels --- src/hip/hip_build_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip/hip_build_utils.cpp b/src/hip/hip_build_utils.cpp index 8f6f9f0c50..86cf3a7272 100644 --- a/src/hip/hip_build_utils.cpp +++ b/src/hip/hip_build_utils.cpp @@ -73,7 +73,7 @@ static boost::filesystem::path HipBuildImpl(boost::optional& tmp_dir, auto env = std::string(""); if(params.find("-std=") == std::string::npos) - params += " --std=c++11"; + params += " --std=c++17"; #if HIP_PACKAGE_VERSION_FLAT < 4001000000ULL params += " --cuda-gpu-arch=" + lots.device; From 57fdf6e40b351492633cf89bfa5d9c555273c6e9 Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Wed, 4 Oct 2023 16:59:02 +0000 Subject: [PATCH 17/23] address comments --- src/kernels/gpu_reference_kernel/naive_conv.cpp | 6 +++--- src/solver/conv_direct_naive_conv.cpp | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index b243b1234a..531c7fd7bb 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -126,9 +126,9 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, /// and nhwc should be nhwgc. Same follows for the 3D case. /// -/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is -/// at index 3 while stride for C is at index 0. This is reverse of how strides are -/// stored in tensor descriptors, which are big-endian. +/// - strides here are stored right to left, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is different from how the +/// tensor descriptors store strides, which is always NCHW order, left-to-right. template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 86a8a4161e..f4f2027293 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -111,11 +111,15 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; + + /// \todo remove packed reference convolution kernels --amberhassaan +#ifndef NDEBUG// enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } else +#endif { kernel_name << "naive_conv_nonpacked_"; } From b5d8b0f51193a1c93ee7ec9dd5c65f265ec607c7 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 4 Oct 2023 16:59:02 +0000 Subject: [PATCH 18/23] address comments --- src/kernels/gpu_reference_kernel/naive_conv.cpp | 6 +++--- src/solver/conv_direct_naive_conv.cpp | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index b243b1234a..531c7fd7bb 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -126,9 +126,9 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, /// and nhwc should be nhwgc. Same follows for the 3D case. /// -/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is -/// at index 3 while stride for C is at index 0. This is reverse of how strides are -/// stored in tensor descriptors, which are big-endian. +/// - strides here are stored right to left, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is different from how the +/// tensor descriptors store strides, which is always NCHW order, left-to-right. 
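For illustration of the two orders (editor's sketch; the sizes are hypothetical and the sort mirrors what MakeStrideArray does): a packed NHWC tensor with N=2, H=3, W=4, C=8 has descriptor strides {96, 1, 32, 8} in NCHW order, i.e. C's stride of 1 sits at index 1 even though C varies fastest; sorting ascending yields {1, 8, 32, 96} (C, W, H, N), the right-to-left order these kernels expect.

#include <algorithm>
#include <vector>

int main()
{
    std::vector<int> desc_strides = {96, 1, 32, 8};      // descriptor (NCHW) order
    std::sort(desc_strides.begin(), desc_strides.end()); // -> {1, 8, 32, 96}, kernel order
    return 0;
}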
template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 86a8a4161e..f4f2027293 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -111,11 +111,15 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; + + /// \todo remove packed reference convolution kernels --amberhassaan +#ifndef NDEBUG// enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } else +#endif { kernel_name << "naive_conv_nonpacked_"; } From 0fe1a55fb6c979d3b11ef69007f225cfe16577dd Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 4 Oct 2023 21:15:03 +0000 Subject: [PATCH 19/23] remove OpenCL code due to deprecation --- test/gpu_reference_kernel.cpp | 281 +++------------------------------- 1 file changed, 23 insertions(+), 258 deletions(-) diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index f8bbaa8e86..b3677487fe 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -75,17 +75,9 @@ std::string tensor_layout_to_string(tensor_layout_t layout) struct gpu_reference_kernel_base { miopenHandle_t handle{}; -#if MIOPEN_BACKEND_OPENCL - cl_command_queue q{}; -#endif - gpu_reference_kernel_base() - { - miopenCreate(&handle); -#if MIOPEN_BACKEND_OPENCL - miopenGetStream(handle, &q); -#endif - } + gpu_reference_kernel_base() { miopenCreate(&handle); } + ~gpu_reference_kernel_base() { miopenDestroy(handle); } static int conv_out_size(int in_size, int pad, int dilation, int ksize, int stride) @@ -386,25 +378,13 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 2, @@ -442,27 +422,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(out); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -482,7 +442,7 @@ struct 
gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -507,23 +467,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif + // we expect an exact match, since we use integers valid_result = verify_tensor(out_host, out); } @@ -534,28 +482,8 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -567,7 +495,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -592,23 +520,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(in_host, in); @@ -619,27 +535,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(out); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status |= clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -652,7 +548,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -677,23 +573,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + 
EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(wei_host, wei); @@ -715,15 +599,10 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(inDesc); miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP + hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_2d(run_conv_2d); @@ -795,25 +674,14 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; + EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 3, @@ -851,27 +719,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(out); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -884,7 +732,6 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, @@ -910,23 +757,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(out_host, out); @@ -938,27 +773,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - 
sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -971,7 +786,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -996,23 +811,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(in_host, in); @@ -1023,27 +826,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(out, 3, -2); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status |= clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -1056,7 +839,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -1081,23 +864,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(wei_host, wei, 8.0); // max possible int @@ -1123,15 +894,9 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_3d(run_conv_3d);
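With the OpenCL branches removed, every test case above follows a single HIP-only buffer lifecycle. The sketch below condenses that flow (a minimal sketch; types, sizes, and the function name are assumed for illustration, and the real tests run this per input, weight, and output tensor with the GPU reference kernel launched in the middle):

#include <hip/hip_runtime.h>
#include <cassert>
#include <vector>

void hip_buffer_roundtrip(const std::vector<float>& host_in, std::vector<float>& host_out)
{
    void* dev = nullptr;
    // allocate device memory
    hipError_t rc = hipMalloc(&dev, sizeof(float) * host_in.size());
    assert(rc == hipSuccess);
    // host -> device before the GPU reference kernel runs
    rc = hipMemcpy(dev, host_in.data(), sizeof(float) * host_in.size(), hipMemcpyHostToDevice);
    assert(rc == hipSuccess);
    // ... launch the convolution kernel here ...
    // device -> host so the result can be compared against the CPU reference
    rc = hipMemcpy(host_out.data(), dev, sizeof(float) * host_out.size(), hipMemcpyDeviceToHost);
    assert(rc == hipSuccess);
    rc = hipFree(dev);
    assert(rc == hipSuccess);
    (void)rc; // silences the unused-variable warning in release builds
}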
Amber Hassaan" Date: Mon, 9 Oct 2023 09:26:53 +0000 Subject: [PATCH 20/23] fix build --- src/solver/conv_direct_naive_conv_wrw.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 4bb7f9c657..a8c4d40e0b 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -127,8 +127,6 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); - int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); - if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { From c34c72a2ecf2fffcfb00fba176e7801e9dccbd35 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Mon, 9 Oct 2023 09:29:41 -0700 Subject: [PATCH 21/23] fix Clang Format issue --- src/kernels/gpu_reference_kernel/naive_conv.cpp | 2 +- src/solver/conv_direct_naive_conv.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 531c7fd7bb..125eff94f3 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -128,7 +128,7 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// /// - strides here are stored right to left, i.e., for NHWC, stride for N is /// at index 3 while stride for C is at index 0. This is different from how the -/// tensor descriptors store strides, which is always NCHW order, left-to-right. +/// tensor descriptors store strides, which is always NCHW order, left-to-right. template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index f4f2027293..fb3e42bfb1 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -113,7 +113,7 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) std::ostringstream kernel_name; /// \todo remove packed reference convolution kernels --amberhassaan -#ifndef NDEBUG// enable in debug mode only +#ifndef NDEBUG // enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; From 220e42323756426fdbddb4eb7ad76e340c69e355 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 11 Oct 2023 20:09:37 +0000 Subject: [PATCH 22/23] resolve conflicts and handle feedback --- src/ocl/convolutionocl.cpp | 6 +++++- test/gtest/conv_api_strided_tensors.cpp | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index bc60fc17f4..d66186577c 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -397,8 +397,12 @@ void ConvolutionDescriptor::ValidateTensors(const ConvTensors& tensors) const // strides in NCHW order for some weird reason. 
g_stride_index = td.GetStrides().size() - 1; } + else + { + MIOPEN_THROW(miopenStatusInternalError, "Layout not supported for grouped convolution"); + } - if(g_stride_index != 1) + if(g_stride_index != -1) { return (td.GetStrides()[g_stride_index] % this->group_count) != 0; } diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 9a2876b3f0..04d56ec908 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,6 +139,9 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; +/// \todo re-enable this test after NCDHW grouped convolution lands (PR 2429) +/// \todo add cpu reference convolution for verification --amberhassaan +#if 0 TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -194,3 +197,4 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) miopenStatusSuccess); ASSERT_TRUE(device.Synchronize()); } +#endif From baa2da45878f151db1ad1e517973aab109b5642f Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 11 Oct 2023 21:47:44 +0000 Subject: [PATCH 23/23] advance fin --- fin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fin b/fin index b2f3f4db3c..afc1a8d87e 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit b2f3f4db3c3d7dd757e6d9e68719a780d8114dfa +Subproject commit afc1a8d87e6d00c82903942007bb370ee1f6c760
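For reference, the validation rule that patch 22 tightens can be written as a standalone predicate (a minimal sketch with assumed names; in the library this logic lives inside ConvolutionDescriptor::ValidateTensors and reads the strides from a TensorDescriptor):

#include <cstddef>
#include <vector>

// Sketch of the patched check: g_stride_index == -1 is assumed to mean that no
// group-carrying stride was identified. Once an index is found, the stride
// holding the channel dimension must be divisible by the group count so the
// tensor can be viewed as (G, C_per_group).
bool bad_group_stride(const std::vector<std::size_t>& strides, int g_stride_index, int group_count)
{
    if(g_stride_index != -1)
        return (strides[g_stride_index] % group_count) != 0;
    return false; // unsupported layouts throw before reaching this check
}

The earlier revision compared the index against 1 instead of -1, which skipped the divisibility check exactly when the index was 1; the new code applies the check whenever a group stride index exists and throws miopenStatusInternalError for layouts where none can be identified.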