Skip to content

Commit

Permalink
[OCL][MI100][MI200] Fix iGemm ASM GTC XDLOPS failures with OCL backen…
Browse files Browse the repository at this point in the history
…d (Staging 95b58f7) (#1317) and Implement abstraction for multi-buffer workspace (#1326)  (#1327)

* add a class to represent workspace buffer traits
* add workspace buffer alignment for bwd and wrw
* comment out W/A 1317
* remove WORKAROUND_ISSUE_1317 macro
* fix ctest bug for transpose+asm igemm case: weights c=c/groups
* Update src/conv/invokers/impl_gemm_dynamic.cpp
* Revert "[tests] Disable test_regression_opencl_float_mi100 if WORKAROUND_ISSUE_1317 is set. (#1352)"

Co-authored-by: Artem Tamazov <artem.tamazov@gmail.com>
Co-authored-by: Jun Liu <Liu.Jun@amd.com>
  • Loading branch information
3 people authored Dec 29, 2021
1 parent 7f590d2 commit 7416ce6
Show file tree
Hide file tree
Showing 8 changed files with 257 additions and 128 deletions.
23 changes: 23 additions & 0 deletions src/buffer_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,4 +148,27 @@ BuffInfo::BuffInfo(MemLayout_t layout, int nk, int c, int h, int w, int g, int _
}
}

MultiBufferWorkspaceTraits::MultiBufferWorkspaceTraits(std::initializer_list<size_t> v_size_,
size_t alignment_)
: v_size(v_size_), alignment(alignment_)
{
size_t each_offset = 0;
v_offset.push_back(each_offset);
for(auto each_size : v_size)
{
size_t padding = (alignment - (each_size % alignment)) % alignment;
each_offset += each_size + padding;
v_offset.push_back(each_offset);
}
}

size_t MultiBufferWorkspaceTraits::GetSize() const { return v_offset.back(); }

size_t MultiBufferWorkspaceTraits::GetOffset(size_t index) const
{
if(index >= v_offset.size())
MIOPEN_THROW("index given overflows");
return v_offset[index];
}

} // namespace miopen
36 changes: 24 additions & 12 deletions src/conv/invokers/impl_gemm_dynamic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,8 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
int trans_weight_idx = -1;
int trans_output_idx = -1;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{
TransposeSolutionDefault2Nhwc trans_input(ctx, ctx.in_data_type, n, c, hi, wi);
Expand All @@ -567,9 +569,6 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();

trans_weight_offset = trans_input_offset + trans_input_size;
trans_output_offset = trans_weight_offset + trans_weight_size;

int idx = 0;
if(!trans_input_skippable)
trans_input_idx = idx++;
Expand All @@ -579,9 +578,16 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
trans_output_idx = idx++;
}

// 4 bytes alignment to do atomic add
const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * k * ho * wo : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * k * ho * wo : 0;

MultiBufferWorkspaceTraits wt(
{trans_input_size, trans_weight_size, trans_output_size, cast_size}, buf_alignment);

trans_input_offset = wt.GetOffset(0);
trans_weight_offset = wt.GetOffset(1);
trans_output_offset = wt.GetOffset(2);

const size_t cast_offset = wt.GetOffset(3);

const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

Expand Down Expand Up @@ -849,6 +855,8 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
int trans_weight_idx = -1;
int trans_output_idx = -1;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{
TransposeSolutionNhwc2Default trans_input(ctx, ctx.out_data_type, n, c, hi, wi);
Expand All @@ -875,9 +883,6 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();

trans_weight_offset = trans_input_offset + trans_input_size;
trans_output_offset = trans_weight_offset + trans_weight_size;

int idx = 0;
if(!trans_input_skippable)
trans_input_idx = idx++;
Expand All @@ -887,9 +892,16 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
trans_output_idx = idx++;
}

// 4 bytes alignment to do atomic add
const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * c * hi * wi : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * c * hi * wi : 0;

MultiBufferWorkspaceTraits wt(
{trans_input_size, trans_weight_size, trans_output_size, cast_size}, buf_alignment);

trans_input_offset = wt.GetOffset(0);
trans_weight_offset = wt.GetOffset(1);
trans_output_offset = wt.GetOffset(2);

const size_t cast_offset = wt.GetOffset(3);

const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

Expand Down
12 changes: 12 additions & 0 deletions src/include/miopen/buffer_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

#include <string>
#include <cassert>
#include <vector>

namespace miopen {

Expand Down Expand Up @@ -307,6 +308,17 @@ struct WinogradBufferInfo
}
};

struct MultiBufferWorkspaceTraits
{
MultiBufferWorkspaceTraits(std::initializer_list<size_t> v_size_, size_t alignment_);
size_t GetSize() const;
size_t GetOffset(size_t index) const;

std::vector<size_t> v_size;
std::vector<size_t> v_offset;
size_t alignment;
};

} // namespace miopen

#endif // GUARD_MIOPEN_BUFFER_INFO_HPP_
2 changes: 0 additions & 2 deletions src/include/miopen/conv/asm_implicit_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
#include <vector>
#include <limits>

#define WORKAROUND_ISSUE_1317 (MIOPEN_BACKEND_OPENCL)

namespace miopen {

namespace solver {
Expand Down
55 changes: 30 additions & 25 deletions src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -867,11 +867,6 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ConvolutionContext& ctx

bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const
{
#if WORKAROUND_ISSUE_1317
if(ctx.IsLayoutDefault())
if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC{}))
return false;
#endif
if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC{}))
return false;

Expand Down Expand Up @@ -904,17 +899,25 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(const ConvolutionC
size_t
ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionContext& ctx) const
{
const auto& hi = ctx.out_height;
const auto& wi = ctx.out_width;
const auto& n = ctx.batch_sz;
const auto& k = ctx.n_inputs;
const auto& c = ctx.n_outputs;
const auto& ho = ctx.in_height;
const auto& wo = ctx.in_width;
const auto& y = ctx.kernel_size_h;
const auto& x = ctx.kernel_size_w;
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();
const auto& hi = ctx.out_height;
const auto& wi = ctx.out_width;
const auto& n = ctx.batch_sz;
const auto& k = ctx.n_inputs;
const auto& c = ctx.n_outputs;
const auto& ho = ctx.in_height;
const auto& wo = ctx.in_width;
const auto& y = ctx.kernel_size_h;
const auto& x = ctx.kernel_size_w;
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();

size_t size_trans_input = 0;
size_t size_trans_weight = 0;
size_t size_trans_output = 0;
size_t size_tensor_cast = 0;

constexpr size_t buf_alignment = 256;

size_t workspace_size = 0;
if(is_nchw)
{
Expand All @@ -927,20 +930,22 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
x); // group * k_per_group as batch for weight
TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);
if(!trans_input.IsSkippable())
workspace_size += trans_input.GetSize();
size_trans_input = trans_input.GetSize();
if(!trans_weight.IsSkippable())
workspace_size += trans_weight.GetSize();
size_trans_weight = trans_weight.GetSize();
if(!trans_output.IsSkippable())
workspace_size += trans_output.GetSize();

// 4 bytes alignment to do atomic add
workspace_size = ((workspace_size + 3) >> 2) << 2;
size_trans_output = trans_output.GetSize();
}

if(!ctx.IsFp32())
workspace_size += miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * c * hi * wi;
size_tensor_cast =
miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * c * hi * wi;

MultiBufferWorkspaceTraits wt(
{size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast}, buf_alignment);
workspace_size = wt.GetSize();

return workspace_size;
}
Expand Down
33 changes: 19 additions & 14 deletions src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,14 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();
size_t workspace_size = 0;

size_t size_trans_input = 0;
size_t size_trans_weight = 0;
size_t size_trans_output = 0;
size_t size_tensor_cast = 0;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{

Expand All @@ -772,31 +780,28 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
TransposeSolutionNhwc2Default trans_output(ctx, ctx.out_data_type, n, k, ho, wo);

if(!trans_input.IsSkippable())
workspace_size += trans_input.GetSize();
size_trans_input = trans_input.GetSize();
if(!trans_weight.IsSkippable())
workspace_size += trans_weight.GetSize();
size_trans_weight = trans_weight.GetSize();
if(!trans_output.IsSkippable())
workspace_size += trans_output.GetSize();

// 4 bytes alignment to do atomic add
workspace_size = ((workspace_size + 3) >> 2) << 2;
size_trans_output = trans_output.GetSize();
}

if(!ctx.IsFp32())
workspace_size += miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * k * ho * wo;
size_tensor_cast =
miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * k * ho * wo;

MultiBufferWorkspaceTraits wt(
{size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast}, buf_alignment);
workspace_size = wt.GetSize();

return workspace_size;
}

bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const
{
#if WORKAROUND_ISSUE_1317
if(ctx.IsLayoutDefault())
if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC{}))
return false;
#endif
if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC{}))
return false;

Expand Down
47 changes: 29 additions & 18 deletions src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ static inline std::size_t GetTypeSize(const std::string& s)
{
if(s == "fp32")
return miopen::GetTypeSize(miopenFloat);
if (s == "fp16")
if(s == "fp16")
return miopen::GetTypeSize(miopenHalf);
else
return miopen::GetTypeSize(miopenBFloat16);
Expand Down Expand Up @@ -795,11 +795,6 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ConvolutionContext& ctx

bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const
{
#if WORKAROUND_ISSUE_1317
if(ctx.IsLayoutDefault())
if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC{}))
return false;
#endif
if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC{}))
return false;

Expand Down Expand Up @@ -891,6 +886,14 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
const auto& x = ctx.kernel_size_w;
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();

size_t size_trans_input = 0;
size_t size_trans_weight = 0;
size_t size_trans_output = 0;
size_t size_tensor_cast = 0;

constexpr size_t buf_alignment = 256;

size_t workspace_size = 0;
if(is_nchw)
{
Expand All @@ -903,20 +906,22 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
x); // group * k_per_group as batch for weight
TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);
if(!trans_input.IsSkippable())
workspace_size += trans_input.GetSize();
size_trans_input = trans_input.GetSize();
if(!trans_weight.IsSkippable())
workspace_size += trans_weight.GetSize();
size_trans_weight = trans_weight.GetSize();
if(!trans_output.IsSkippable())
workspace_size += trans_output.GetSize();
size_trans_output = trans_output.GetSize();

// 4 bytes alignment to do atomic add
workspace_size = ((workspace_size + 3) >> 2) << 2;
}

if(!ctx.IsFp32())
workspace_size += miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
size_tensor_cast = miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* (k / group) * c * y * x;
* (k / group) * c * y * x;

MultiBufferWorkspaceTraits wt({size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast}, buf_alignment);
workspace_size = wt.GetSize();

return workspace_size;
}

Expand Down Expand Up @@ -1027,6 +1032,9 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
int trans_input_idx = -1;
int trans_weight_idx = -1;
int trans_output_idx = -1;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{
TransposeSolutionDefault2Nhwc trans_input(ctx, ctx.out_data_type, n, c, hi, wi);
Expand Down Expand Up @@ -1065,9 +1073,6 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();

trans_weight_offset = trans_input_offset + trans_input_size;
trans_output_offset = trans_weight_offset + trans_weight_size;

int idx = 0;
if(!trans_input_skippable)
trans_input_idx = idx++;
Expand All @@ -1079,11 +1084,17 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(

MIOPEN_LOG_I2(SolverDbId(*this) << ": " << config.ToString() << msg.str());

// 4 bytes alignment to do atomic add
const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0;
const size_t cast_size = need_cast ?
miopen::GetTypeSize(miopenFloat) * k * (c / group) * y * x : 0;

MultiBufferWorkspaceTraits wt({trans_input_size, trans_weight_size, trans_output_size, cast_size}, buf_alignment);

trans_input_offset = wt.GetOffset(0);
trans_weight_offset = wt.GetOffset(1);
trans_output_offset = wt.GetOffset(2);

const size_t cast_offset = wt.GetOffset(3);

const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

const TensorDescriptor cast_desc(miopenFloat, ctx.conv_problem.GetWeights().GetLengths(), ctx.conv_problem.GetWeights().GetStrides());
Expand Down
Loading

0 comments on commit 7416ce6

Please sign in to comment.