diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp deleted file mode 100644 index 02e675203d..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp +++ /dev/null @@ -1,210 +0,0 @@ -#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP -#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" - -namespace ck { - -// OriginalTensorDesc : ConstantTensorDescriptor_deprecated<...> -// it's the tensor whose dimensions are to be merged -// OriginalDimMergeSeqs : Sequence<...>... -// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged -template -struct ConstantMergedTensorDescriptor_deprecated -{ - using Type = ConstantMergedTensorDescriptor_deprecated; - - static constexpr auto mOriginalDimMergeSeqs = std::tuple{}; - - static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); - static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension(); - - __host__ __device__ constexpr ConstantMergedTensorDescriptor_deprecated() - { - static_assert(nDim <= nOriginalDim, "wrong!"); - - // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most - // OriginalTensorDesc::nDim number of dimensions - - // TODO: check OriginalDimMergeSeqs contains all original dimensions - - // TODO: check there is no duplication in OriginalDimMergeSeqs - } - - __host__ __device__ static constexpr auto GetOriginalTensorDescriptor() - { - return OriginalTensorDesc{}; - } - - __host__ __device__ static constexpr auto GetNumOfDimension() { return Number{}; } - - template - __host__ __device__ static constexpr auto GetContainedOriginalDimensions(Number) - { - return std::get(mOriginalDimMergeSeqs); - } - - template - __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number) - { - return (std::get(mOriginalDimMergeSeqs).GetSize() > 1); - } - - template - __host__ __device__ static constexpr auto GetLength(Number) - { - constexpr auto original_dims_partial = std::get(mOriginalDimMergeSeqs); - - return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize(); - } - - template - __host__ __device__ static constexpr auto GetStride(Number) - { - static_assert(!ContainMultipleOriginalDimensions(Number{}), - "wrong! 
stride of a merged dimension is undefined"); - - constexpr auto idim_original = std::get(mOriginalDimMergeSeqs).Back(); - - return OriginalTensorDesc::GetStride(Number{}); - } - - // this is a hack to return the stride of the last original dimension of a merged dimension - // TODO: refactor this once the concept of "dimension" is used - template - __host__ __device__ static constexpr auto GetLastOriginalDimensionStride(Number) - { - constexpr auto idim_last_original = std::get(mOriginalDimMergeSeqs).Back(); - - return OriginalTensorDesc::GetStride(Number{}); - } - - __host__ __device__ static constexpr auto GetLengths() - { - return Sequence{}; - } - - __host__ __device__ static constexpr auto GetElementSize() - { - return OriginalTensorDesc::GetElementSize(); - } - - template - struct lambda_1_GetOriginalMultiIndexFromMultiIndex - { - const Array& original_multi_id_partial; - Array& original_multi_id; - - __host__ __device__ constexpr lambda_1_GetOriginalMultiIndexFromMultiIndex( - const Array& original_multi_id_partial_, - Array& original_multi_id_) - : original_multi_id_partial(original_multi_id_partial_), - original_multi_id(original_multi_id_) - { - } - - template - __host__ __device__ constexpr void operator()(Number) const - { - constexpr index_t idim_original = OriginalDimsPartial::Get(Number{}); - - index_t itmp = original_multi_id_partial[I]; - - original_multi_id(idim_original) = itmp; - } - }; - - struct lambda_0_GetOriginalMultiIndexFromMultiIndex - { - const Array& multi_id; - Array& original_multi_id; - - __host__ __device__ constexpr lambda_0_GetOriginalMultiIndexFromMultiIndex( - const Array& multi_id_, Array& original_multi_id_) - : multi_id(multi_id_), original_multi_id(original_multi_id_) - { - } - - template - __host__ __device__ constexpr void operator()(Number) const - { - constexpr auto original_dims_partial = std::get(Type::mOriginalDimMergeSeqs); - - // get partial original-multi-id corresponding to this merged dimension - const auto original_multi_id_partial = - OriginalTensorDesc::Extract(original_dims_partial) - .GetMultiIndexFrom1dIndex(multi_id[IDim]); - - static_for<0, original_dims_partial.GetSize(), 1>{}( - lambda_1_GetOriginalMultiIndexFromMultiIndex( - original_multi_id_partial, original_multi_id)); - } - }; - - // return type is Array<...> - __host__ __device__ static constexpr auto - GetOriginalMultiIndexFromMultiIndex(Array multi_id) - { - Array original_multi_id; - - static_for<0, nDim, 1>{}( - lambda_0_GetOriginalMultiIndexFromMultiIndex(multi_id, original_multi_id)); - - return original_multi_id; - } - - template - __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence) - { - constexpr auto multi_id = sequence2array(Sequence{}); - - constexpr auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); - - return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); - } - - __host__ __device__ static constexpr index_t - GetOffsetFromMultiIndex(Array multi_id) - { - auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); - - return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); - } - - template - __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... 
is) - { - return GetOffsetFromMultiIndex(Array{is...}); - } - - __host__ __device__ static constexpr Array GetMultiIndexFrom1dIndex(index_t id) - { - constexpr auto packed_desc = make_ConstantTensorDescriptor_packed(GetLengths()); - - return packed_desc.GetMultiIndexFrom1dIndex(id); - } - - __host__ __device__ static constexpr auto Pack() - { - constexpr auto lengths = GetLengths(); - constexpr auto strides = calculate_tensor_strides_packed(lengths); - return ConstantTensorDescriptor_deprecated{}; - } -}; - -template -__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, - OriginalDimMergeSeqs...) -{ - return ConstantMergedTensorDescriptor_deprecated{}; -} - -template -__host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDesc) -{ - print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor()); -} - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp deleted file mode 100644 index 494ef1ddd8..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp +++ /dev/null @@ -1,348 +0,0 @@ -#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP -#define CK_TENSOR_COORDINATE_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp" - -namespace ck { - -// TensorDesc is ConstantTensorDescriptor_deprecated -template -struct NormalTensorCoordinate_deprecated -{ - using type = NormalTensorCoordinate_deprecated; - using tensor_desc_type = TensorDesc; - - static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); - - __host__ - __device__ constexpr NormalTensorCoordinate_deprecated(Array tensor_index) - : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)} - { - } - - template - __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs) - : NormalTensorCoordinate_deprecated(Array{xs...}) - { - } - - template - __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence) - : NormalTensorCoordinate_deprecated(Array{Xs...}) - { - } - - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } - - // T is Array or Sequence - template - __host__ __device__ type operator+=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); - - return *this; - } - - template - __host__ __device__ type operator-=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); - - return *this; - } - - template - __host__ __device__ constexpr type operator+(T step_sizes) const - { - type coord = *this; - coord += step_sizes; - return coord; - } - - template - __host__ __device__ constexpr type operator-(T step_sizes) const - { - type coord = *this; - coord -= step_sizes; - return coord; - } - - // reposition point of origin, and return compensated offset. - // This is a hack to reduce index calculation during looping over - // a tensor whose origin is this TensorCoordinate. 
It does so, by spitting - // out the run-time offset to the pointer (to the tensor data) held by this - // TensorCoordiante, so the caller can add the offset into the run-time pointer of - // the data, so only 1 run-time variable (update pointer) is needed, instead - // of 2 run-time variables (old pointer and this offset) - // TODO: after introducing the concept of "run-time tensor view", which contains the - // run-time pointer to the data, always keep track of the pointer, instead of both - // offset and the pointer. This also bring additional benefit that we don't need to - // worry the offset might underflow (because offset is unsigned integer) when updating it. - __host__ __device__ constexpr index_t RepositionOrigin() - { - index_t offset_diff = mOffset; - mOffset = 0; - return offset_diff; - } - -private: - index_t mOffset; -}; - -// TensorDesc is ConstantMergedTensorDescriptor_deprecated -template -struct MergedTensorCoordinate_deprecated -{ - using type = MergedTensorCoordinate_deprecated; - using tensor_desc_type = TensorDesc; - - static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); - static constexpr index_t nOriginalDim = - tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); - - __host__ - __device__ constexpr MergedTensorCoordinate_deprecated(Array tensor_index) - : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} - { - // partial offset on each dimension - static_for<0, nDim, 1>{}([&](auto idim) { - constexpr auto partial_original_dims = - tensor_desc_type::GetContainedOriginalDimensions(idim); - - constexpr auto partial_original_desc = - tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); - - mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mOriginalIndex, partial_original_dims)); - }); - - // complete offset - mOffset = - accumulate_on_array(mPartialOffsets, math::plus{}, static_cast(0)); - } - - template - __host__ __device__ constexpr MergedTensorCoordinate_deprecated(Xs... 
xs) - : MergedTensorCoordinate_deprecated(Array{xs...}) - { - } - - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } - - template - __host__ __device__ void - MoveOnDimension(IDim idim_, T step_size, integral_constant) - { - constexpr auto idim = idim_; - - // if step_size is known at compile time - static_if::value>{}( - [&](auto) { static_if{}([&](auto) { return; }); }); - - // update original index - static_if{}([&](auto) { - constexpr auto partial_original_dims = - tensor_desc_type::GetContainedOriginalDimensions(idim); - - constexpr index_t ndim_partial_original = partial_original_dims.GetSize(); - - constexpr auto partial_original_desc = - tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); - - const auto partial_original_step_sizes = - partial_original_desc.GetMultiIndexFrom1dIndex(step_size); - - // update partial original multi-id - auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims); - - static_if{}([&](auto) { - partial_original_id += partial_original_step_sizes; - - bool carry = false; - - // do carry check in reversed order, starting from lowest dimension - // don't check the highest dimension - static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { - constexpr index_t i = ndim_partial_original - 1 - IReverse; - - if(carry) - { - ++partial_original_id(i); - } - - carry = false; - - if(partial_original_id[i] >= partial_original_desc.GetLength(i)) - { - partial_original_id(i) -= partial_original_desc.GetLength(i); - carry = true; - } - }); - - // highest dimension - if(carry) - { - ++partial_original_id(0); - } - }).Else([&](auto) { - // shift up multi-id to avoid unsigned integer underflow during intermediate - // calculations. After the shift, should have new_multi_id[...] >= 1 - partial_original_id += - partial_original_desc.GetLengths() - partial_original_step_sizes; - - bool borrow = false; - - // do borrow check in reversed order, starting from lowest dimension - // don't check the highest dimension - static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { - constexpr index_t i = ndim_partial_original - 1 - IReverse; - - if(borrow) - { - --partial_original_id(i); - } - - borrow = false; - - if(partial_original_id[i] < partial_original_desc.GetLength(i)) - { - partial_original_id(i) += partial_original_desc.GetLength(i); - borrow = true; - } - }); - - // highest dimension - if(borrow) - { - --partial_original_id(0); - } - - // shift back down multi-id - // here, should have new_multi_id[...] 
>= GetLengths() - partial_original_id = partial_original_id - partial_original_desc.GetLengths(); - }); - - // update "mOriginalIndex" - static_for<0, ndim_partial_original, 1>{}([&](auto I) { - constexpr auto idim_original = partial_original_dims[I]; - - mOriginalIndex(idim_original) = partial_original_id[I]; - }); - - // calculate new partial offset on this merged dimension - const index_t old_partial_offset = mPartialOffsets[idim]; - - mPartialOffsets(idim) = - partial_original_desc.GetOffsetFromMultiIndex(partial_original_id); - - // update "mThreadSrcOffset", do "+" before "-" to avoid underflow - mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset; - }).Else([&](auto fwd) { - static_if{}([&](auto) { - mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim); - }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); }); - }); - } - - // T is Array or Sequence - template - __host__ __device__ type operator+=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - static_for<0, nDim, 1>{}([&](auto idim) { - // compiler should remove dead code path, because step_sizes is known at - // compile time - if(step_sizes[idim] != 0) - { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); - } - }); - - return *this; - } - - template - __host__ __device__ type operator-=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - static_for<0, nDim, 1>{}([&](auto idim) { - // compiler should remove dead code path, because step_sizes is known at - // compile time - if(step_sizes[idim] != 0) - { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); - } - }); - - return *this; - } - - template - __host__ __device__ constexpr type operator+(T step_sizes) const - { - type coord = *this; - coord += step_sizes; - return coord; - } - - template - __host__ __device__ constexpr type operator-(T step_sizes) const - { - type coord = *this; - coord -= step_sizes; - return coord; - } - - __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; } - -private: - // Allocate register memory for all merged dimensions and normal dimensions. - // However, only those merged dimensions, whose index will be involved in arithmetic - // after the construction of this TensorCoordinate (e.g. when user move a slicing - // window on the merged dimension), will use these register memory. - // Let's hope compiler will optimize away those register memory allocated for normal - // dimensions, and those merged dimensions, that would never be involved in index - // arithmetic after construction of TensorCoordinate. 
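// A minimal standalone sketch of the positive-direction carry propagation that
// MoveOnDimension performs above, assuming each component of idx and step is
// already within [0, len[i]); the function name and the use of std::array are
// illustrative only and not part of this header.
#include <array>
#include <cstddef>

template <std::size_t N>
void move_on_merged_dimension(std::array<int, N>& idx,
                              const std::array<int, N>& step,
                              const std::array<int, N>& len)
{
    for(std::size_t i = 0; i < N; ++i)
        idx[i] += step[i]; // may temporarily reach len[i] or more

    bool carry = false;
    for(std::size_t r = 0; r + 1 < N; ++r) // lowest dimension first, skip the highest
    {
        const std::size_t i = N - 1 - r;
        if(carry)
            ++idx[i];
        carry = false;
        if(idx[i] >= len[i])
        {
            idx[i] -= len[i];
            carry = true;
        }
    }
    if(carry)
        ++idx[0]; // the highest dimension absorbs the final carry and never wraps
}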
- // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" - // and simplify implementation of ConstantMergedTensorDescriptor_deprecated, so we don't need to - // count on compiler to optimize away those register memory for us - Array mOriginalIndex; - Array mPartialOffsets; - - // complete offset - index_t mOffset; -}; - -template -struct TensorCoordinate_deprecated -{ -private: - template - __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) - { - return NormalTensorCoordinate_deprecated>(); - } - - template - __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) - { - return MergedTensorCoordinate_deprecated< - ConstantMergedTensorDescriptor_deprecated>(); - } - -public: - using type = decltype(MakeDummyTensorCoordinate(TensorDesc{})); -}; - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_helper.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_helper.hpp deleted file mode 100644 index 2cacb329cb..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_helper.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef CK_TENSOR_COORDINATE_HELPER_HPP -#define CK_TENSOR_COORDINATE_HELPER_HPP - -#include "tensor_coordiante_hpp" - -namespace ck { - -template -__host__ __device__ constexpr auto -make_tensor_coordinate(TensorDesc, MultiIndex idx) -{ - return typename TensorCoordinate::type(idx); -} - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_blockwise_generic_tensor_slice_copy_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_blockwise_generic_tensor_slice_copy_deprecated.hpp deleted file mode 100644 index 806a38a0c7..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_blockwise_generic_tensor_slice_copy_deprecated.hpp +++ /dev/null @@ -1,613 +0,0 @@ -#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP -#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp" -#include "static_kernel_tensor_coordinate_deprecated.hpp" -#include "static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp" - -namespace ck { - -// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor -// memory layout (ordering of dimensions) can be different between src and dst. -// This functions assume each thread is reading and writing a normal (not merged) tensor, -// to simplify index calculations. To satisfy this assumption, the user need to make sure -// that, on a merged dimension that constains multiple original dimensions, the length of -// the last original dimension need to be evenly dividable by its sub-lengths. Also, the -// repeat-length on the merged dimension need to be 1. 
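// A compile-time sketch of the constraint just stated, with made-up numbers: the
// last original dimension contained in a merged dimension must be evenly divisible
// by the per-thread sub-length, and the repeat length on that merged dimension
// must be 1, so each thread effectively copies a normal (non-merged) tensor.
constexpr int last_original_length = 8; // length of the last original dim in the merge
constexpr int sub_length           = 2; // per-thread sub-length on the merged dim
constexpr int repeat_length        = 1; // SliceLength / (SubLength * ClusterLength)

static_assert(last_original_length % sub_length == 0,
              "per-thread window would straddle original dimensions");
static_assert(repeat_length == 1,
              "repeats on a merged dimension are not supported");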
These sanity checks are performed -// in constructor of BlockwiseGenericTensorSliceCopy_v1_deprecated -template -struct BlockwiseGenericTensorSliceCopy_v1_deprecated -{ - static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - static constexpr index_t nOriginalDimSrc = - SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); - static constexpr index_t nOriginalDimDst = - DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); - - // per-thread offset - index_t mThreadSrcOffset; - index_t mThreadDstOffset; - - // "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId", - // "mThreadDstPartialOffsets" are always calculated inside constructor, and would be - // updated if slicing-window is moved. However, they will not be used if you always move - // the slicing-window along a non-merged dimension. In that case, compiler should be - // able to remove these calculation. - // TODO: make sure compiler would actually remove them in that case - - // partial offset in each (merged) dimension - Array mThreadSrcPartialOffsets; - Array mThreadDstPartialOffsets; - - // multi-id of original tensor - Array mThreadSrcOriginalMultiId; - Array mThreadDstOriginalMultiId; - - __device__ - BlockwiseGenericTensorSliceCopy_v1_deprecated(Array src_block_data_id_begin, - Array dst_block_data_id_begin) - { - // check NDim consistency - static_assert( - nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && - nDim == ThreadClusterLengths::GetSize() && - nDim == ThreadClusterArrangeOrder::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), - "wrong"); - - // check thread arrange order and read/write access order are valid - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong!"); - - // thread cluster - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( - ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); - - // BlockSize - static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize"); - - // divide work - constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{}; - - static_for<0, nDim, 1>{}([&](auto IDim) { - static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0, - "wrong! cannot evenly divide sliced tensor into cluster"); - }); - - constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims; - - // additional check for merged dimension - static_for<0, nDim, 1>{}([&](auto IDim_) { - // src - static_if{}([&](auto) { - constexpr auto IDim = decltype(IDim_){}; - - // on a merged dimension that constains multiple original dimensions, - // the length of the last original dimension need to evenly dividable by its - // sub-length, - // so each thread is effectively reading a normal (not merged) tensor - constexpr auto idim_last_original_src = - SrcDesc::GetContainedOriginalDimensions(IDim).Back(); - static_assert( - SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) % - SubLengths::Get(IDim) == - 0, - "wrong!"); - - // merged dimension should have repeat_lengths = 1 - static_assert(repeat_lengths[IDim] == 1, - "wrong! 
repeat_lengths shoud be 1 on merged dimension"); - }); - - // dst - static_if{}([&](auto) { - constexpr auto IDim = decltype(IDim_){}; - - // on a merged dimension that constains multiple original dimensions, - // the length of the last original dimension need to evenly dividable by its - // sub-length, - // so each thread is effectively reading a normal (not merged) tensor - constexpr auto idim_last_original_dst = - DstDesc::GetContainedOriginalDimensions(IDim).Back(); - static_assert( - DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) % - SubLengths::Get(IDim) == - 0, - "wrong!"); - - // merged dimension should have repeat_lengths = 1 - static_assert(repeat_lengths[IDim] == 1, - "wrong! repeat_lengths shoud be 1 on merged dimension"); - }); - }); - - // calculate mThreadSrcOffset, mThreadDstOffset - const auto thread_cluster_id = - thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - - const auto data_cluster_id = - reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); - - const auto thread_data_id_begin = data_cluster_id * SubLengths{}; - - // original multi-id - mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex( - src_block_data_id_begin + thread_data_id_begin); - - mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex( - dst_block_data_id_begin + thread_data_id_begin); - - // partial offset on each dimension - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto src_partial_original_dims = - SrcDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto src_partial_original_desc = - SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - - mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims)); - }); - - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto dst_partial_original_dims = - DstDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto dst_partial_original_desc = - DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims); - - mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims)); - }); - - // complete offset - mThreadSrcOffset = accumulate_on_array( - mThreadSrcPartialOffsets, math::plus{}, static_cast(0)); - - mThreadDstOffset = accumulate_on_array( - mThreadDstPartialOffsets, math::plus{}, static_cast(0)); - } - - __device__ static constexpr auto GetRegisterBufferDescriptor() - { - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); - } - - __device__ static constexpr index_t GetThreadBufferSize() - { - return GetRegisterBufferDescriptor().GetElementSpace(); - } - - template - __device__ void RunLoadThreadBuffer(const TData* __restrict__ p_src, - TData* __restrict__ p_buffer) const - { - constexpr auto thread_sub_tensor_lengths = SubLengths{}; - - constexpr auto data_per_cluster_per_dims = - thread_sub_tensor_lengths * ThreadClusterLengths{}; - - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - static_ford{}([&](auto repeat_id) { - constexpr auto src_thread_data_id_begin = repeat_id * 
data_per_cluster_per_dims; - - constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - constexpr index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); - - constexpr index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); -#else - ford{}([&](auto repeat_id) { - const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims; - - const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); - - const index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); -#endif - - // By position the origin of the per-thread window at the point, where multi-index - // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy - // is assuming each thread is copy a noraml (not merged) tensor. - // To satisfy this assumption, the user need to make sure that, on a merged dimension - // that constains multiple original dimensions, the length of the last original - // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on - // the merged dimension need to be 1. These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1_deprecated - ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), - make_zero_array()) - .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); - }); - } - - template - __device__ void RunStoreThreadBuffer(const TData* __restrict__ p_buffer, - TData* __restrict__ p_dst) const - { - constexpr auto thread_sub_tensor_lengths = SubLengths{}; - - constexpr auto data_per_cluster_per_dims = - thread_sub_tensor_lengths * ThreadClusterLengths{}; - - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - static_ford{}([&](auto repeat_id) { - constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; - - constexpr index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); - - constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); -#else - ford{}([&](auto repeat_id) { - const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; - - const index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); -#endif - - // By position the origin of the per-thread window at the point, where multi-index - // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy - // is assuming each thread is copy a noraml (not merged) tensor. - // To satisfy this assumption, the user need to make sure that, on a merged dimension - // that constains multiple original dimensions, the length of the last original - // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on - // the merged dimension need to be 1. 
These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1_deprecated - ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( - make_zero_array(), make_zero_array()) - .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); - }); - } - - template - __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const - { - TData p_buffer[GetThreadBufferSize()]; - - RunLoadThreadBuffer(p_src, p_buffer); - RunStoreThreadBuffer(p_buffer, p_dst); - } - - // When moving the slicing windows along a merged dimension, if the strides of the - // contained (by the merged dimension) original dimensions are not in descending order, - // then there is no guarantee that the new offset will be larger than the old offset - // for movement in positive direction (vice versue for movement in negative direction). - // As a result, there is the possiblity that the offset calculation may result in - // unsigned integer underflow (due to "-" operation). However, this hazard should not - // happen, as long as the users make sure the slicing window would not be moved out of - // the boundary of the tensor being sliced. This functions doesn't do runtime sanity - // check on out-of-bound slicing window, for performance reason - template - __device__ void MoveSlicingWindowOnSourceTensor( - Number, Number, integral_constant direction) - { - constexpr auto IDim = Number{}; - - static_if{}([&](auto) { - // logic for a merged dimension, also works for non-merged dimension, but its logic may - // be unncessarily complicated for compiler to remove calculations that are useless for - // a non-merged dimension - - // extract partial original dimensions - constexpr auto src_partial_original_dims = - SrcDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto src_partial_original_desc = - SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - - // calculate new partial original multi-id - auto old_src_partial_original_id = - extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims); - - auto new_src_partial_original_id = - src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex( - old_src_partial_original_id, StepSize, direction); - - // update "mThreadSrcOriginalMultiId" - static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) { - constexpr auto IDimOriginal = src_partial_original_dims[I]; - - mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I]; - }); - - // calculate new partial offset on this merged dimension - const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim]; - - const index_t new_src_partial_offset = - src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id); - - // update "mThreadSrcPartialOffsets" - mThreadSrcPartialOffsets(IDim) = new_src_partial_offset; - - // update "mThreadSrcOffset", do "+" before "-" to avoid underflow - mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset; - }).Else([&](auto) { - // Logic for non-merged dimension. If you are never going to move the slicing window on - // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets", - // which are being calculated here, will never be used later. In this case, compiler - // should be able to remove these calculations. - // TODO: make sure compiler would actually remove them in this case. 
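// A minimal sketch of the partial-offset bookkeeping used above, with made-up
// values: the thread offset is the sum of per-dimension partial offsets, so a
// move on one (merged) dimension replaces only that dimension's partial, and
// doing the "+" before the "-" keeps the unsigned intermediate from wrapping.
#include <cassert>
#include <cstdint>

int main()
{
    uint32_t partial[2] = {12, 7}; // per-dimension partial offsets
    uint32_t offset     = partial[0] + partial[1];

    const uint32_t new_partial0 = 4; // dimension 0 moved to a smaller partial offset
    offset     = (offset + new_partial0) - partial[0];
    partial[0] = new_partial0;

    assert(offset == partial[0] + partial[1]);
    return 0;
}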
- - // It is the user's responsiblity to make sure the slicing window will not be moved out - // of the boundary of the tensor being sliced. Otherwise, there might be hazard like - // unsigned integer underflow. That is NO runtime sanity check to prevent the hazard - - constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front(); - - static_if{}([&](auto fwd) { - mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim); - - mThreadSrcOriginalMultiId(IDimOriginal) += StepSize; - - mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim); - }).Else([&](auto fwd) { - mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - - mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize; - - mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - }); - }); - } - - template - __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) - { - static_for<0, nDim, 1>{}([&](auto idim) { - if(step_sizes[idim] != 0) - { - MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction); - } - }); - } -}; - -// This version use TensorCoordiante -// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor -// memory layout (ordering of dimensions) can be different between src and dst. -template -struct BlockwiseGenericTensorSliceCopy_v2_deprecated -{ - static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - using Index = MultiIndex; - - __device__ constexpr BlockwiseGenericTensorSliceCopy_v2_deprecated( - const Index& src_block_slice_origin, const Index& dst_block_slice_origin) - { - static_assert( - nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && - nDim == ThreadClusterLengths::GetSize() && - nDim == ThreadClusterArrangeOrder::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), - "wrong! nDim not consistent"); - - static_assert(is_same{}, - "wrong! threads should be mapped to cover entire slicing window"); - - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( - ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); - - static_assert(BlockSize == thread_cluster_desc.GetElementSize(), - "wrong! 
BlockSize not consistent with ThreadClusterLengths"); - - const auto thread_cluster_id = - thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - - const auto data_cluster_id = - reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); - - const auto thread_data_id_begin = data_cluster_id * SubLengths{}; - - mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); - mThreadwiseLoad.SetDstSliceOrigin(make_zero_array()); - - mThreadwiseStore.SetSrcSliceOrigin(make_zero_array()); - mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); - } - - __device__ static constexpr index_t GetThreadBufferSize() - { - return ThreadBufferDesc::GetElementSpace(); - } - - template - __device__ void - RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer, - integral_constant, - integral_constant) const - { - constexpr auto block_src_address_space = - integral_constant{}; - constexpr auto thread_buffer_address_space = - integral_constant{}; - - mThreadwiseLoad.Run( - p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); - } - - template - __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer) const - { - constexpr auto generic_address_space = - integral_constant{}; - - RunLoadThreadBuffer( - p_block_src, p_thread_buffer, generic_address_space, generic_address_space); - } - - template - __device__ void - RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst, - integral_constant, - integral_constant) const - { - constexpr auto thread_buffer_address_space = - integral_constant{}; - constexpr auto block_dst_address_space = - integral_constant{}; - - mThreadwiseStore.Run( - p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); - } - - template - __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst, generic_address_space, generic_address_space); - } - - template - __device__ void - Run(const BlockSrcData* p_block_src, - BlockDstData* p_block_dst, - integral_constant block_src_address_space, - integral_constant block_dst_address_space) const - { - BlockSrcData p_thread_buffer[GetThreadBufferSize()]; - - constexpr auto generic_address_space = - integral_constant{}; - - RunLoadThreadBuffer( - p_block_src, p_thread_buffer, block_src_address_space, generic_address_space); - - // if there is type conversion, it's done during store - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space); - } - - template - __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - Run(p_block_src, p_block_dst, generic_address_space, generic_address_space); - } - - template - __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); - } - - template - __device__ void - MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); - } - -private: - using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); - - using ThreadwiseLoad 
= ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; - - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; - - ThreadwiseLoad mThreadwiseLoad; - ThreadwiseStore mThreadwiseStore; -}; - -} // namespace ck - -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_gridwise_gemm_xdlops.hpp b/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_gridwise_gemm_xdlops.hpp deleted file mode 100644 index 44652321f7..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_gridwise_gemm_xdlops.hpp +++ /dev/null @@ -1,650 +0,0 @@ -#ifndef CK_GRIDWISE_GEMM_XDLOPS_HPP -#define CK_GRIDWISE_GEMM_XDLOPS_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_tensor_descriptor.hpp" -#include "static_kernel_tensor_descriptor_helper.hpp" -#include "static_kernel_ConstantMatrixDescriptor.hpp" -#include "static_kernel_blockwise_generic_tensor_slice_copy.hpp" -#include "static_kernel_threadwise_generic_tensor_slice_copy.hpp" -#include "static_kernel_blockwise_gemm_xdlops.hpp" - -namespace ck { - -template -struct GridwiseGemmTransposedANormalBNormalCXdlops_v1 -{ - __device__ void Run(const Float* const __restrict__ p_a_global, - const Float* const __restrict__ p_b_global, - Float* const __restrict__ p_c_global) const - { - - constexpr auto True = integral_constant{}; - - constexpr auto a_k_m_global_desc = AGlobalDesc{}; - constexpr auto b_k_n_global_desc = BGlobalDesc{}; - constexpr auto c_m_n_global_desc = CGlobalDesc{}; - - constexpr auto K = b_k_n_global_desc.GetLengths()[0]; - constexpr auto N = b_k_n_global_desc.GetLengths()[1]; - constexpr auto M = a_k_m_global_desc.GetLengths()[1]; - - // divide block work by [M, N] - static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0, - "wrong! cannot divide work evenly among block"); - - constexpr index_t MBlockWork = M / MPerBlock; - constexpr index_t NBlockWork = N / NPerBlock; - - static_assert(MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0, - "wrong! 
M/NPerBlock % M/NPerWave != 0"); - - constexpr index_t MWaves = MPerBlock / MPerWave; - constexpr index_t NWaves = NPerBlock / NPerWave; - - constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); - - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - - const index_t m_block_data_on_global = block_work_id[0] * MPerBlock; - const index_t n_block_data_on_global = block_work_id[1] * NPerBlock; - - // LDS mem - constexpr index_t max_align = math::lcm(BBlockCopyDstDataPerWrite_N, - ABlockCopyDstDataPerWrite_M, - GemmDataPerReadM, - GemmDataPerReadN); - - // LDS - // be careful of LDS alignment - constexpr auto a_k_m_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, Number{}); - - auto a_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4({0, m_block_data_on_global}, - {0, 0}); - - constexpr auto b_k_n_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, Number{}); - - auto b_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4({0, n_block_data_on_global}, - {0, 0}); - - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[KPerBlock, MPerBlock] is in LDS - // b_mtx[EPerBlocl, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in - // register - constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(a_k_m_block_desc); - constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(b_k_n_block_desc); - - const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops< - BlockSize, - decltype(a_k_m_block_mtx_desc), - decltype(b_k_n_block_mtx_desc), - Float, - MPerWave, - NPerWave, - MWaves, - NWaves, - GemmDataPerReadM, - GemmDataPerReadN>{}; - - constexpr index_t a_block_space = - math::integer_least_multiple(a_k_m_block_desc.GetElementSpace(), max_align); - - constexpr index_t b_block_space = - math::integer_least_multiple(b_k_n_block_desc.GetElementSpace(), max_align); - - __shared__ Float p_a_block_double[2 * a_block_space]; - __shared__ Float p_b_block_double[2 * b_block_space]; - - // get zero-initialized output register of vector type - auto c_thread_vec = blockwise_gemm.CreateOutputVecZero(); - - // LDS double buffer: preload data into LDS - { - a_blockwise_copy.Run(p_a_global, p_a_block_double); - b_blockwise_copy.Run(p_b_global, p_b_block_double); - } - - using b_blockwise_copy_src_step = Sequence; - using a_blockwise_copy_src_step = Sequence; - - // LDS double buffer: main body - for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K; - k_block_data_begin += 2 * KPerBlock) - { -#pragma unroll - for(index_t iloop = 0; iloop < 2; ++iloop) - { - const bool even_loop = (iloop % 2 == 0); - - Float* p_a_block_now = - even_loop ? p_a_block_double : p_a_block_double + a_block_space; - Float* p_b_block_now = - even_loop ? p_b_block_double : p_b_block_double + b_block_space; - - Float* p_a_block_next = - even_loop ? p_a_block_double + a_block_space : p_a_block_double; - Float* p_b_block_next = - even_loop ? 
p_b_block_double + b_block_space : p_b_block_double; - - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_now, p_b_block_now, c_thread_vec); - - // LDS double buffer: store next data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next); - } - } - - // LDS double buffer: tail - { - constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0); - - if(has_two_iteration_left) // if has 2 iteration left - { - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS double buffer: load last data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on 2nd-last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - - // LDS double buffer: store last data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, - p_a_block_double + a_block_space); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, - p_b_block_double + b_block_space); - - __syncthreads(); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_double + a_block_space, - p_b_block_double + b_block_space, - c_thread_vec); - } - else // if has 1 iteration left - { - __syncthreads(); - - // LDS double buffer: GEMM on last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - } - } - - // copy output: register to global memory - { - ///\todo inconsistent layout of xdlops and tensor - // xdlops layout - // M1 = num_groups; - // M0 = group_size; - // N1 = num_blks_per_wave; - // N0 = num_threads_per_blks; - constexpr auto CLayout = blockwise_gemm.GetOutputLayout(); - constexpr index_t M0 = CLayout.M1(); - constexpr index_t M1 = CLayout.N1(); - constexpr index_t M2 = CLayout.M0(); - - constexpr auto c_m0_m1_m2_n_global_desc = transform_tensor_descriptor( - c_m_n_global_desc, - make_tuple(UnMerge>{}, PassThrough{}), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{})); - - // src descriptor - constexpr auto c_m0_m1_m2_n_thread_desc = - make_native_tensor_descriptor_packed(Sequence{}); - - using CThreadCopySliceLengths = Sequence; - - constexpr index_t BlkSize = blockwise_gemm.GetBlkSize(); - constexpr index_t NumBlks = blockwise_gemm.GetNumBlks(); - - for(index_t i = 0; i < NumBlks; ++i) - { - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = blockwise_gemm.GetBeginOfThreadMatrixC(i); - - const index_t m_thread_data_on_global = - m_block_data_on_global 
+ c_thread_mtx_on_block.row; - - const index_t n_thread_data_on_global = - n_block_data_on_global + c_thread_mtx_on_block.col; - - ThreadwiseGenericTensorSliceCopy_v4r2::type, - 3, - 1, - 1, - AddressSpace::Vgpr, - AddressSpace::Global, - CGlobalMemoryDataOperation>( - {0, 0, 0, 0}, - {m_thread_data_on_global / (M2 * M1), - m_thread_data_on_global % (M2 * M1) / M2, - m_thread_data_on_global % M2, - n_thread_data_on_global}) - .Run(c_thread_vec.n + i * BlkSize, p_c_global); - } - } - } -}; - -template -struct GridwiseBatchedGemmTransposedANormalBNormalCXdlops_v1 -{ - __device__ void Run(const Float* const __restrict__ p_a_global, - const Float* const __restrict__ p_b_global, - Float* const __restrict__ p_c_global) const - { - - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - constexpr auto True = integral_constant{}; - - constexpr auto a_g_k_m_global_desc = AGlobalDesc{}; - constexpr auto b_g_k_n_global_desc = BGlobalDesc{}; - constexpr auto c_g_m_n_global_desc = CGlobalDesc{}; - - constexpr auto G = b_g_k_n_global_desc.GetLengths()[0]; - - constexpr auto K = b_g_k_n_global_desc.GetLengths()[1]; - constexpr auto N = b_g_k_n_global_desc.GetLengths()[2]; - constexpr auto M = a_g_k_m_global_desc.GetLengths()[2]; - - // divide block work by [M, N] - static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0, - "wrong! cannot divide work evenly among block"); - - constexpr index_t MBlockWork = M / MPerBlock; - constexpr index_t NBlockWork = N / NPerBlock; - - static_assert(MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0, - "wrong! M/NPerBlock % M/NPerWave != 0"); - - constexpr index_t MWaves = MPerBlock / MPerWave; - constexpr index_t NWaves = NPerBlock / NPerWave; - - constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); - - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - - const index_t group_id = block_work_id[0]; - const index_t m_block_data_on_global = block_work_id[1] * MPerBlock; - const index_t n_block_data_on_global = block_work_id[2] * NPerBlock; - - // LDS mem - constexpr index_t max_align = math::lcm(BBlockCopyDstDataPerWrite_N, - ABlockCopyDstDataPerWrite_M, - GemmDataPerReadM, - GemmDataPerReadN); - - // LDS - // be careful of LDS alignment - constexpr auto a_g_k_m_block_desc = make_native_tensor_descriptor_aligned( - Sequence<1, KPerBlock, MPerBlock>{}, Number{}); - - auto a_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4( - {group_id, 0, m_block_data_on_global}, {0, 0, 0}); - - constexpr auto b_g_k_n_block_desc = make_native_tensor_descriptor_aligned( - Sequence<1, KPerBlock, NPerBlock>{}, Number{}); - - auto b_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4( - {group_id, 0, n_block_data_on_global}, {0, 0, 0}); - - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[KPerBlock, MPerBlock] is in LDS - // b_mtx[EPerBlocl, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in - // register - constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor_packed( - a_g_k_m_block_desc.GetLength(I1), a_g_k_m_block_desc.GetLength(I2)); - constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor_packed( - b_g_k_n_block_desc.GetLength(I1), b_g_k_n_block_desc.GetLength(I2)); - - const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops< - BlockSize, - decltype(a_k_m_block_mtx_desc), - decltype(b_k_n_block_mtx_desc), - Float, - MPerWave, - NPerWave, - MWaves, - NWaves, - 
GemmDataPerReadM, - GemmDataPerReadN>{}; - - constexpr index_t a_block_space = - math::integer_least_multiple(a_g_k_m_block_desc.GetElementSpace(), max_align); - - constexpr index_t b_block_space = - math::integer_least_multiple(b_g_k_n_block_desc.GetElementSpace(), max_align); - - __shared__ Float p_a_block_double[2 * a_block_space]; - __shared__ Float p_b_block_double[2 * b_block_space]; - - // get zero-initialized output register of vector type - auto c_thread_vec = blockwise_gemm.CreateOutputVecZero(); - - // LDS double buffer: preload data into LDS - { - a_blockwise_copy.Run(p_a_global, p_a_block_double); - b_blockwise_copy.Run(p_b_global, p_b_block_double); - } - - using b_blockwise_copy_src_step = Sequence<0, KPerBlock, 0>; - using a_blockwise_copy_src_step = Sequence<0, KPerBlock, 0>; - - // LDS double buffer: main body - for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K; - k_block_data_begin += 2 * KPerBlock) - { -#pragma unroll - for(index_t iloop = 0; iloop < 2; ++iloop) - { - const bool even_loop = (iloop % 2 == 0); - - Float* p_a_block_now = - even_loop ? p_a_block_double : p_a_block_double + a_block_space; - Float* p_b_block_now = - even_loop ? p_b_block_double : p_b_block_double + b_block_space; - - Float* p_a_block_next = - even_loop ? p_a_block_double + a_block_space : p_a_block_double; - Float* p_b_block_next = - even_loop ? p_b_block_double + b_block_space : p_b_block_double; - - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_now, p_b_block_now, c_thread_vec); - - // LDS double buffer: store next data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next); - } - } - - // LDS double buffer: tail - { - constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0); - - if(has_two_iteration_left) // if has 2 iteration left - { - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS double buffer: load last data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on 2nd-last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - - // LDS double buffer: store last data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, - p_a_block_double + a_block_space); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, - p_b_block_double + b_block_space); - - __syncthreads(); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_double + a_block_space, - p_b_block_double + b_block_space, - c_thread_vec); - } - else // if has 1 
iteration left - { - __syncthreads(); - - // LDS double buffer: GEMM on last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - } - } - - // copy output: register to global memory - { - ///\todo inconsistent layout of xdlops and tensor - // xdlops layout - // M1 = num_groups; - // M0 = group_size; - // N1 = num_blks_per_wave; - // N0 = num_threads_per_blks; - constexpr auto CLayout = blockwise_gemm.GetOutputLayout(); - constexpr index_t M0 = CLayout.M1(); - constexpr index_t M1 = CLayout.N1(); - constexpr index_t M2 = CLayout.M0(); - - constexpr auto c_g_m0_m1_m2_n_global_desc = transform_tensor_descriptor( - c_g_m_n_global_desc, - make_tuple(PassThrough{}, UnMerge>{}, PassThrough{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - - // src descriptor - constexpr auto c_g_m0_m1_m2_n_thread_desc = - make_native_tensor_descriptor_packed(Sequence<1, M0, 1, M2, 1>{}); - - using CThreadCopySliceLengths = Sequence<1, M0, 1, M2, 1>; - - constexpr index_t BlkSize = CLayout.GetBlkSize(); - constexpr index_t NumBlks = CLayout.GetNumBlks(); - - for(index_t i = 0; i < NumBlks; ++i) - { - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = blockwise_gemm.GetBeginOfThreadMatrixC(i); - - const index_t m_thread_data_on_global = - m_block_data_on_global + c_thread_mtx_on_block.row; - - const index_t n_thread_data_on_global = - n_block_data_on_global + c_thread_mtx_on_block.col; - - ThreadwiseGenericTensorSliceCopy_v4r2::type, - 4, - 1, - 1, - AddressSpace::Vgpr, - AddressSpace::Global, - CGlobalMemoryDataOperation>( - {0, 0, 0, 0, 0}, - {group_id, - m_thread_data_on_global / (M2 * M1), - m_thread_data_on_global % (M2 * M1) / M2, - m_thread_data_on_global % M2, - n_thread_data_on_global}) - .Run(c_thread_vec.n + i * BlkSize, p_c_global); - } - } - } -}; - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp deleted file mode 100644 index cf9a8bcbc7..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp +++ /dev/null @@ -1,443 +0,0 @@ -#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP -#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp" -#include "static_kernel_tensor_coordinate_deprecated.hpp" - -namespace ck { - -// This threadwise copy allow vector access of src and dst. -// It allows the vector size to be different on src and dst. -// The dimensions of vector access should be the same on src and dst. -// The dimension access order should be the same on src and dst. 
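// A sketch of how the differing access widths are reconciled below, with made-up
// widths: src and dst are tiled by a "long vector" of lcm(SrcDataPerAccess,
// DstDataPerAccess) elements, loaded in src-sized pieces, converted, and stored
// in dst-sized pieces.
#include <numeric>

constexpr int src_data_per_access = 4; // e.g. 4-wide vector loads
constexpr int dst_data_per_access = 2; // e.g. 2-wide vector stores
constexpr int long_vector_size =
    std::lcm(src_data_per_access, dst_data_per_access); // = 4

constexpr int src_accesses_per_long_vector = long_vector_size / src_data_per_access; // 1
constexpr int dst_accesses_per_long_vector = long_vector_size / dst_data_per_access; // 2

static_assert(long_vector_size % src_data_per_access == 0 &&
                  long_vector_size % dst_data_per_access == 0,
              "the long vector must tile both access widths exactly");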
-// It is designed for cases, where one of src and dst is register, and -// the other is device memory or LDS -template -struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( - Array src_slice_origin, Array dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == DimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); - - static_assert( - SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto vector_access_dim = Number{}; - - static_if{}([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }).Else([&](auto fwd) { - static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }).Else([&](auto fwd) { - static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated() - : ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(Array src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(Array dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const - { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto vector_access_dim = Number{}; - - constexpr auto src_data_per_access = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto long_vector_size = Number{}; - - constexpr auto long_vector_access_lengths = SliceLengths::Modify( - vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - - ford{}( - [&](auto long_vector_access_id) { - // data id w.r.t slicing-window - auto long_vector_data_begin_id = long_vector_access_id; - long_vector_data_begin_id(vector_access_dim) = - long_vector_size * long_vector_access_id[vector_access_dim]; - - // buffer to hold a long-vector - SrcData p_src_long_vector[long_vector_size]; - DstData p_dst_long_vector[long_vector_size]; - - // load data from src to the long-vector buffer - for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * src_data_per_access; - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( - mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - const index_t buffer_offset = i * src_data_per_access; - - *reinterpret_cast(&p_src_long_vector[buffer_offset]) = - 
*reinterpret_cast(&p_src[src_offset]); - } - - // type conversion - for(index_t i = 0; i < long_vector_size; ++i) - { - p_dst_long_vector[i] = type_convert{}(p_src_long_vector[i]); - } - - // store data from the long-vector buffer to dst - for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * dst_data_per_access; - - const index_t buffer_offset = i * dst_data_per_access; - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( - mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); - } - }); - } - -private: - Array mSrcSliceOrigin; - Array mDstSliceOrigin; -}; - -// This version use TensorCoordinate_deprecated -// This threadwise copy allow vector access of src and dst. -// It allows the dimensions of vector access to be different on src and dst. -// It also allows the vector size to be different on src and dst. -// It also allows order of access to be different on src and dst. -// It use register as buffer to hold all data moving from src to dst. -// It is designed for copying small amount of data, and src and dst are -// device memory or LDS. -// When copying large amout of data, let's hope compiler will reduce register -// used for the buffer. -template -struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - using Index = MultiIndex; - - using SrcCoordinate = typename TensorCoordinate_deprecated::type; - using DstCoordinate = typename TensorCoordinate_deprecated::type; - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated( - const Index& src_slice_origin, const Index& dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && - nDim == DstDimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong! map is not valid"); - - static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && - SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto src_vector_access_dim = Number{}; - constexpr auto dst_vector_access_dim = Number{}; - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! 
vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated() - : ThreadwiseGenericTensorSliceCopy_v2r1_deprecated(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - struct IsolateMergedDimLengths - { - template - __device__ constexpr index_t operator()(IDim idim) const - { - return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1; - } - }; - - template - __device__ void Run(const SrcData* p_src, - DstData* p_dst, - integral_constant, - integral_constant) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - SrcData p_src_buffer_[buffer_desc.GetElementSpace()]; - SrcData* p_src_buffer = p_src_buffer_; - - // copy data from src into buffer - { - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - - // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t - // normal dimensions is known at compile time. - // Below is a hack to isolate merged dimension id from normal dimension id, so the - // corresponding offset can be calculated seperately at run-time and compile-time. - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // merged dimensions, and has value = 1 on normal dimensions; - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // normal dimensions, and has value = 1 on merged dimensions; - constexpr auto src_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto src_normal_dim_access_lengths = - src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; - - ford{}( - [&](auto src_merged_dim_access_id) { - auto src_merged_dim_data_id = src_merged_dim_access_id; - src_merged_dim_data_id(src_vector_access_dim) = - src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; - - // offset w.r.t. merged dimension need be computed at run-time, - const index_t src_merged_offset = - (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); - - ford{}( - [&](auto src_normal_dim_access_id) { - auto src_normal_dim_data_id = src_normal_dim_access_id; - src_normal_dim_data_id(src_vector_access_dim) = - src_normal_dim_access_id[src_vector_access_dim] * - src_data_per_access; - - // offset w.r.t. normal dimension is known at compile-time - const index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); - - SrcData p_src_vector_data[SrcDataPerAccess]; - - transfer_data( - p_src, src_normal_offset + src_merged_offset, p_src_vector_data, 0); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - - p_src_buffer[buffer_offset] = p_src_vector_data[i]; - } - }); - }); - } - - // type conversion - // TODO: would compiler do a good job reusing register for buffer? 
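// [Editor's note] Illustrative sketch, not part of the deleted file: it models
// the "isolate merged dimensions" trick described above. The access lengths
// are split into a merged part and a normal part (value 1 on the other
// dimensions), the two parts are enumerated in nested loops, and the total
// offset is the sum of the offsets contributed by each part. This is a
// hypothetical simplification: both parts use the same packed row-major
// descriptor here, whereas in the deleted code the merged part goes through
// the merged-tensor coordinate at run time and the normal part is a
// compile-time native offset.
#include <array>
#include <cstdio>

constexpr int nDim = 3;

// packed row-major offset of a multi-index for the given lengths
int offset(const std::array<int, nDim>& lengths, const std::array<int, nDim>& id)
{
    int off = 0;
    for(int d = 0; d < nDim; ++d)
        off = off * lengths[d] + id[d];
    return off;
}

int main()
{
    std::array<int, nDim> lengths = {2, 3, 4};
    std::array<bool, nDim> merged = {true, false, false}; // dim 0 plays the "merged" role

    std::array<int, nDim> merged_lengths{}, normal_lengths{};
    for(int d = 0; d < nDim; ++d)
    {
        merged_lengths[d] = merged[d] ? lengths[d] : 1;
        normal_lengths[d] = merged[d] ? 1 : lengths[d];
    }

    // nested iteration over the two isolated length sets covers every
    // multi-index exactly once, and the two partial offsets add up to the
    // offset of the combined multi-index
    for(int m0 = 0; m0 < merged_lengths[0]; ++m0)
        for(int n1 = 0; n1 < normal_lengths[1]; ++n1)
            for(int n2 = 0; n2 < normal_lengths[2]; ++n2)
            {
                std::array<int, nDim> merged_id = {m0, 0, 0};
                std::array<int, nDim> normal_id = {0, n1, n2};
                std::array<int, nDim> full_id   = {m0, n1, n2};

                int sum = offset(lengths, merged_id) + offset(lengths, normal_id);
                std::printf("%d == %d\n", sum, offset(lengths, full_id));
            }
    return 0;
}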
- DstData p_dst_buffer_[buffer_desc.GetElementSpace()]; - DstData* p_dst_buffer = p_dst_buffer_; - - ford{}([&](auto idx) { - p_dst_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)] = - type_convert{}(p_src_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)]); - }); - - // copy data from buffer into dst - { - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - - constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto dst_normal_dim_access_lengths = - dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; - - ford{}( - [&](auto dst_merged_dim_access_id) { - auto dst_merged_dim_data_id = dst_merged_dim_access_id; - dst_merged_dim_data_id(dst_vector_access_dim) = - dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - // offset w.r.t. merged dimension need be computed at run-time, - const index_t dst_merged_offset = - (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); - - ford{}( - [&](auto dst_normal_dim_access_id) { - auto dst_normal_dim_data_id = dst_normal_dim_access_id; - dst_normal_dim_data_id(dst_vector_access_dim) = - dst_normal_dim_access_id[dst_vector_access_dim] * - dst_data_per_access; - - DstData p_dst_vector_data[DstDataPerAccess]; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); - - p_dst_vector_data[i] = p_dst_buffer[buffer_offset]; - } - - // offset w.r.t. 
normal dimension is known at compile-time - const index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); - - transfer_data( - p_dst_vector_data, 0, p_dst, dst_normal_offset + dst_merged_offset); - }); - }); - } - } - - template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - Run(p_src, p_dst, generic_address_space, generic_address_space); - } - - // T can be Sequence or Array - template - __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mSrcSliceOrigin += step_sizes; - }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); - } - - template - __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mDstSliceOrigin += step_sizes; - }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); - } - -private: - SrcCoordinate mSrcSliceOrigin; - DstCoordinate mDstSliceOrigin; -}; - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp deleted file mode 100644 index 1c7fd21e3c..0000000000 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp +++ /dev/null @@ -1,329 +0,0 @@ -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_fp16_bfp16_fwd_nchw_kcyx_nkhw_lds_double_buffer.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_fp16_bfp16_wrw_nchw_kcyx_nkhw_lds_double_buffer.hpp" -#include "float_types.h" - -extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) -{ - using namespace ck; - - // read params: problem decription - constexpr index_t N = CK_PARAM_PROBLEM_N; - constexpr index_t K = CK_PARAM_PROBLEM_K; - constexpr index_t C = CK_PARAM_PROBLEM_C; - constexpr index_t Hi = CK_PARAM_PROBLEM_HI; - constexpr index_t Wi = CK_PARAM_PROBLEM_WI; - constexpr index_t Ho = CK_PARAM_PROBLEM_HO; - constexpr index_t Wo = CK_PARAM_PROBLEM_WO; - constexpr index_t Y = CK_PARAM_PROBLEM_Y; - constexpr index_t X = CK_PARAM_PROBLEM_X; - - constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H; - constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W; - - constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H; - constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W; - - // read params: tunable params - constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE; - - constexpr index_t GemmMPerBlock = CK_PARAM_TUNABLE_GEMM_M_PER_BLOCK; - constexpr index_t GemmNPerBlock = CK_PARAM_TUNABLE_GEMM_N_PER_BLOCK; - constexpr index_t GemmKPerBlock = CK_PARAM_TUNABLE_GEMM_K_PER_BLOCK; - constexpr index_t GemmKBlocks = CK_PARAM_TUNABLE_GEMM_K_BLOCKS; - - // read params: dependent params - constexpr index_t 
GridSize = CK_PARAM_DEPENDENT_GRID_SIZE; - - constexpr index_t LeftPadH = CK_PARAM_PROBLEM_LEFT_PAD_H; - constexpr index_t LeftPadW = CK_PARAM_PROBLEM_LEFT_PAD_W; - - constexpr index_t RightPadH = CK_PARAM_PROBLEM_RIGHT_PAD_H; - constexpr index_t RightPadW = CK_PARAM_PROBLEM_RIGHT_PAD_W; - - using LeftPads = Sequence; - using RightPads = Sequence; - -// calculate dependent params amd heuristic params -#if CK_PARAM_PROBLEM_DIRECTION == 2 - // In the WrW direction the filter is the output, while the output image is the input being - // convolved with the (original) input image. This requires that the tensordescriptors be - // swapped - // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the - // input descriptor, the n and k dimension of the output descriptor - // This change is necessary so that reduction dimensions are consistent with the requirement - // of the wrw convolution when used in a fwd context - constexpr auto tmp_in_nchw_desc = - make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto tmp_wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto tmp_out_nkhw_desc = - make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto in_nchw_desc = - reorder_tensor_descriptor_given_upper2lower(tmp_in_nchw_desc, Sequence<1, 0, 2, 3>{}); - // wei and out are swapped in the solver - constexpr auto wei_kcyx_desc = - reorder_tensor_descriptor_given_upper2lower(tmp_out_nkhw_desc, Sequence<1, 0, 2, 3>{}); - constexpr auto out_nkhw_desc = - reorder_tensor_descriptor_given_upper2lower(tmp_wei_kcyx_desc, Sequence<1, 0, 2, 3>{}); - constexpr auto dir = ImplicitGemmDirection::BackwardWeight; - - // swap stride and dilation - using ConvDilations = Sequence; - using ConvStrides = Sequence; -#else - static_assert(GemmKBlocks == 1, "do not support GemmKBlocks > 1 for forward!"); - // calculate dependent params amd heuristic params - constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto out_nkhw_desc = make_native_tensor_descriptor_packed(Sequence{}); - - constexpr auto dir = ImplicitGemmDirection::ForwardData; - using ConvStrides = Sequence; - using ConvDilations = Sequence; -#endif // CK_PARAM_PROBLEM_DIRECTION == 2 - - constexpr index_t GemmBBlockCopyClusterLengths_GemmK = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K; - constexpr index_t GemmBBlockCopyClusterLengths_GemmN = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N; - - constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmK = - GemmKPerBlock / GemmBBlockCopyClusterLengths_GemmK; - constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmN = - GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; - - constexpr index_t GemmABlockCopyClusterLengths_GemmK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K; - constexpr index_t GemmABlockCopyClusterLengths_GemmM = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M; - - constexpr index_t GemmABlockCopyThreadSliceLengths_GemmK = - GemmKPerBlock / GemmABlockCopyClusterLengths_GemmK; - constexpr index_t GemmABlockCopyThreadSliceLengths_GemmM = - GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; - -#if MIOPEN_USE_FP32 - using GemmABlockCopyThreadSliceLengths_GemmG_GemmK_GemmM = - Sequence<1, GemmABlockCopyThreadSliceLengths_GemmK, GemmABlockCopyThreadSliceLengths_GemmM>; - using GemmABlockCopyThreadClusterLengths_GemmG_GemmK_GemmM = - Sequence<1, 
GemmABlockCopyClusterLengths_GemmK, GemmABlockCopyClusterLengths_GemmM>; - - using GemmABlockCopyThreadClusterArrangeOrder = Sequence<0, 2, 1>; // [E0, K, E1] - using GemmABlockCopySrcAccessOrder = Sequence<0, 2, 1>; // [E0, K, E1] - using GemmABlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E0, E1, K] - - constexpr index_t GemmABlockCopySrcDataPerRead_GemmM = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M; - - using GemmBBlockCopyThreadSliceLengths_GemmG_GemmK_GemmN = - Sequence<1, GemmBBlockCopyThreadSliceLengths_GemmK, GemmBBlockCopyThreadSliceLengths_GemmN>; - using GemmBBlockCopyThreadClusterLengths_GemmG_GemmK_GemmN = - Sequence<1, GemmBBlockCopyClusterLengths_GemmK, GemmBBlockCopyClusterLengths_GemmN>; - - using GemmBBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E0, E1, B] - using GemmBBlockCopySrcAccessOrder = Sequence<0, 1, 2>; // [E0, E1, B] - using GemmBBlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E0, E1, B] - - constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N; - - constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_K; - -#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16 - constexpr index_t GemmKPACK = CK_PARAM_GEMM_KPACK_LENGTH; - - using GemmABlockCopySubLengths_GemmG_GemmK_GemmM_GemmKPACK = - Sequence<1, - GemmABlockCopyThreadSliceLengths_GemmK, - GemmABlockCopyThreadSliceLengths_GemmM, - GemmKPACK>; - using GemmABlockCopyClusterLengths_GemmG_GemmK_GemmM_GemmKPACK = - Sequence<1, GemmABlockCopyClusterLengths_GemmK, GemmABlockCopyClusterLengths_GemmM, 1>; - - using GemmABlockCopyThreadClusterArrangeOrder = Sequence<0, 2, 1, 3>; // [G, M, K, GemmKPACK] - using GemmABlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [G, M, K, GemmKPACK] - using GemmABlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [G, K, M, GemmKPACK] - - constexpr index_t GemmABlockCopyDstDataPerWrite_GemmKPACK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_KPACK; - - using GemmBBlockCopySubLengths_GemmG_GemmK_GemmN_GemmKPACK = - Sequence<1, - GemmBBlockCopyThreadSliceLengths_GemmK, - GemmBBlockCopyThreadSliceLengths_GemmN, - GemmKPACK>; - using GemmBBlockCopyClusterLengths_GemmG_GemmK_GemmN_GemmKPACK = - Sequence<1, GemmBBlockCopyClusterLengths_GemmK, GemmBBlockCopyClusterLengths_GemmN, 1>; - - using GemmBBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [G, K, GemmKPACK, B] - using GemmBBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [G, K, GemmKPACK, B] - using GemmBBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [G, K, B, GemmKPACK] - - constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmKPACK = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_KPACK; - -#if CK_PARAM_PROBLEM_DIRECTION == 2 - constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_K; -#else - constexpr index_t GemmABlockCopySrcDataPerRead_GemmKPACK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_KPACK; -#endif // CK_PARAM_PROBLEM_DIRECTION - -#endif // MIOPEN_USE_FP16 || MIOPEN_USE_BFP16 - - constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N; - - constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE; - constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE; - constexpr index_t GemmThreadGemmDataPerReadM = 1; - constexpr index_t GemmThreadGemmDataPerReadN = 1; - -#if MIOPEN_USE_FP32 - constexpr auto 
gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - FLOAT, - FLOAT_ACCUM, - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - GemmMPerBlock, - GemmNPerBlock, - GemmKPerBlock, - GemmKBlocks, - GemmMPerWave, - GemmNPerWave, - GemmThreadGemmDataPerReadM, - GemmThreadGemmDataPerReadN, - GemmABlockCopyThreadSliceLengths_GemmG_GemmK_GemmM, - GemmABlockCopyThreadClusterLengths_GemmG_GemmK_GemmM, - GemmABlockCopyThreadClusterArrangeOrder, - GemmABlockCopySrcAccessOrder, - GemmABlockCopyDstAccessOrder, - GemmABlockCopySrcDataPerRead_GemmK, - GemmABlockCopySrcDataPerRead_GemmM, - GemmBBlockCopyThreadSliceLengths_GemmG_GemmK_GemmN, - GemmBBlockCopyThreadClusterLengths_GemmG_GemmK_GemmN, - GemmBBlockCopyThreadClusterArrangeOrder, - GemmBBlockCopySrcAccessOrder, - GemmBBlockCopyDstAccessOrder, - GemmBBlockCopySrcDataPerRead_GemmN, - GemmBBlockCopyDstDataPerWrite_GemmN, - dir>{}; - gridwise_conv.Run(p_in_global, p_wei_global, p_out_global); - -#elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION == 2 - - // Backward weight in fp16/bfp16 uses atomic add to do reduction along K dimension - // It requires output blob to be of float as no atomic add exists for fp16/ushort - constexpr auto gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_wrw_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) - FLOAT_ACCUM, // Acc data type = float (see float_types.h) - float, // Output data type = float (not fp16/ushort) as no atomic add ISA exists for - // fp16/ushort. - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - GemmMPerBlock, - GemmNPerBlock, - GemmKPerBlock, - GemmKBlocks, - GemmKPACK, - GemmMPerWave, - GemmNPerWave, - GemmThreadGemmDataPerReadM, - GemmThreadGemmDataPerReadN, - GemmABlockCopySubLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyClusterLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyThreadClusterArrangeOrder, - GemmABlockCopySrcAccessOrder, - GemmABlockCopyDstAccessOrder, - GemmABlockCopySrcDataPerRead_GemmK, - GemmABlockCopyDstDataPerWrite_GemmKPACK, - GemmBBlockCopySubLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyClusterLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyThreadClusterArrangeOrder, - GemmBBlockCopySrcAccessOrder, - GemmBBlockCopyDstAccessOrder, - GemmBBlockCopySrcDataPerRead_GemmN, - GemmBBlockCopyDstDataPerWrite_GemmKPACK, - dir>{}; - - // Output blob is cast to float as no atomic add exists for fp16/ushort - gridwise_conv.Run(p_in_global, p_wei_global, reinterpret_cast(p_out_global)); -#elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION != 2 - // Forward data doesn't use any atomic add so output blob remains of the same type - // as input blob - - constexpr auto wkgrp_schd_order = -#if MIOPEN_USE_FP16 - NBlock1MBlock0; -#else - MBlock1NBlock0; -#endif // MIOPEN_USE_FP16 - - constexpr auto gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_fwd_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) - FLOAT_ACCUM, // Acc data type = float (see float_types.h) - FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - 
ConvStrides, - ConvDilations, - LeftPads, - RightPads, - GemmMPerBlock, - GemmNPerBlock, - GemmKPerBlock, - GemmKBlocks, - GemmKPACK, - GemmMPerWave, - GemmNPerWave, - GemmThreadGemmDataPerReadM, - GemmThreadGemmDataPerReadN, - GemmABlockCopySubLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyClusterLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyThreadClusterArrangeOrder, - GemmABlockCopySrcAccessOrder, - GemmABlockCopyDstAccessOrder, - GemmABlockCopySrcDataPerRead_GemmKPACK, - GemmABlockCopyDstDataPerWrite_GemmKPACK, - GemmBBlockCopySubLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyClusterLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyThreadClusterArrangeOrder, - GemmBBlockCopySrcAccessOrder, - GemmBBlockCopyDstAccessOrder, - GemmBBlockCopySrcDataPerRead_GemmN, - GemmBBlockCopyDstDataPerWrite_GemmKPACK, - dir, - wkgrp_schd_order>{}; - gridwise_conv.Run(p_in_global, p_wei_global, p_out_global); -#else - static_assert(false, "wrong! Only fp32, fp16 and bfp16 are supported."); -#endif // MIOPEN_USE_FP32 -}
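[Editor's note] For reference, a minimal host-side C++ analogue of the LDS double-buffering schedule used by the gridwise GEMM main loop earlier in this diff: while the GEMM consumes the "current" half of a two-slot buffer, the next K-slice is prefetched and committed to the other half. This editor-added sketch assumes K % (2 * KPerBlock) == 0, collapses the global-to-register and register-to-LDS staging into a single prefetch step, and uses hypothetical names throughout; there is no LDS and no __syncthreads here.

#include <cstdio>
#include <vector>

int main()
{
    constexpr int KPerBlock = 4;
    constexpr int K         = 24; // assumed divisible by 2 * KPerBlock
    std::vector<int> global(K);
    for(int k = 0; k < K; ++k) global[k] = k;

    int  block[2][KPerBlock]; // stand-in for the two LDS halves
    long acc    = 0;          // stand-in for the per-thread accumulator
    int  k_next = 0;

    // preload: first K-slice into half 0
    for(int i = 0; i < KPerBlock; ++i) block[0][i] = global[k_next + i];
    k_next += KPerBlock;

    int cur = 0;
    while(k_next < K)
    {
        // prefetch the next slice into the *other* half. In the real kernel
        // this is split into a global->register load issued before the GEMM
        // and a register->LDS store after it, so compute and memory overlap.
        for(int i = 0; i < KPerBlock; ++i) block[cur ^ 1][i] = global[k_next + i];
        k_next += KPerBlock;

        // "GEMM" on the current half (modeled as a reduction)
        for(int i = 0; i < KPerBlock; ++i) acc += block[cur][i];

        cur ^= 1; // ping-pong between the two halves
    }

    // tail: consume the last resident slice
    for(int i = 0; i < KPerBlock; ++i) acc += block[cur][i];

    std::printf("acc = %ld (expected %d)\n", acc, K * (K - 1) / 2);
    return 0;
}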