Merge branch 'develop' into sl/testpackage_nogpu
xinlipn authored Apr 25, 2024
2 parents a7750a3 + f10b809 commit 2db8ccd
Showing 33 changed files with 228 additions and 212 deletions.
2 changes: 1 addition & 1 deletion driver/argmax_driver.hpp
@@ -54,7 +54,7 @@ int32_t mloArgmaxForwardRunHost(miopenTensorDescriptor_t inputDesc,

int32_t reduce_size = static_cast<int32_t>(input_dims[dim]);
auto output_numel =
- std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies<int64_t>());
+ std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies<int64_t>());

auto inner_size = std::accumulate(
input_dims.begin() + dim + 1, input_dims.end(), 1ULL, std::multiplies<uint64_t>());
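Note: the `1L` to `1LL` change above, like the `0` to `0ULL` and `1` to `1ULL` changes in the files below, fixes one recurring pitfall: `std::accumulate` deduces its accumulator type from the initial value, not from the iterators or the binary operation. A minimal standalone sketch (not MIOpen code) of how a `long` seed truncates 64-bit products on LLP64 platforms such as 64-bit Windows, where `long` is 32 bits:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    std::vector<int64_t> dims = {1024, 1024, 1024, 1024}; // product = 2^40

    // The accumulator type is the type of the seed. With 1L that is `long`:
    // each int64_t product returned by std::multiplies<int64_t> is converted
    // back to `long`, which truncates to 32 bits on LLP64.
    auto bad = std::accumulate(dims.begin(), dims.end(), 1L, std::multiplies<int64_t>());

    // With 1LL the accumulator is `long long` (at least 64 bits everywhere).
    auto good = std::accumulate(dims.begin(), dims.end(), 1LL, std::multiplies<int64_t>());

    std::cout << bad << '\n'   // 0 on LLP64; 1099511627776 on LP64
              << good << '\n'; // 1099511627776 everywhere
}
```

The same reasoning applies to the `0` seeds fixed below: with an `int` seed the running sum is kept in `int` and can overflow once the summed lengths exceed `INT_MAX`.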
25 changes: 17 additions & 8 deletions driver/conv_driver.hpp
@@ -205,7 +205,7 @@ class GpumemTensor
return;
}

- for(int i = 0; i < sz; ++i)
+ for(size_t i = 0; i < sz; ++i)
{
/// \anchor move_rand
/// Generate random value, even if buffer is unused. This provides the same
@@ -1559,7 +1559,7 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
if(!biasFileName.empty())
read = readBufferFromFile<float>(b_int8.data(), b_sz, biasFileName.c_str());
if(!read)
- for(int i = 0; i < b_sz; i++)
+ for(size_t i = 0; i < b_sz; ++i)
b_int8[i] = static_cast<float>(i % 8) + prng::gen_canonical<float>();
}
std::ignore = b.AllocOnDeviceAndInit(q, ctx, b_sz, b_int8);
@@ -1602,15 +1602,20 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()

if(!is_gpualloc)
{
- for(int i = 0; i < b_sz; i++)
+ for(size_t i = 0; i < b_sz; ++i)
{
if(!b_read)
{
- b.GetVector()[i] = static_cast<Tgpu>(i % 8) //
+ /// (i % 8) can't be converted to F8 type as there is no suitable
+ /// conversion, but we have conversions from int and from uint8_t.
+ /// int is not good as it would produce negative results
+ /// after truncation of size_t, while we want positive values.
+ /// uint8_t is fine because (i % 8) fits into 3 bits.
+ b.GetVector()[i] = static_cast<Tgpu>(static_cast<uint8_t>(i) % 8) //
      + (is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) //
                : prng::gen_canonical<Tgpu>());
}
- db.GetVector()[i] = static_cast<Tgpu>(i % 8) //
+ db.GetVector()[i] = static_cast<Tgpu>(static_cast<uint8_t>(i) % 8) //
      + (is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) //
                : prng::gen_canonical<Tgpu>());
}
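Note: the comment added above explains the rewrite. Below is a contrived sketch of the overload-resolution problem it sidesteps; the `f8` type here is hypothetical, a stand-in for MIOpen's float8 type (whose exact conversion set differs), not the real thing:

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in: constructible only from int and uint8_t.
struct f8
{
    explicit f8(int) {}
    explicit f8(std::uint8_t) {}
};

int main()
{
    std::size_t i = 12;

    // f8 direct = static_cast<f8>(i % 8); // error: ambiguous, because the
    // size_t -> int and size_t -> uint8_t conversions are equally ranked.

    // Truncating i to uint8_t first keeps its low 8 bits, so the value of
    // (uint8_t)i % 8 equals i % 8 and is always non-negative; the % result
    // promotes to int, which then selects f8(int) unambiguously.
    f8 ok = static_cast<f8>(static_cast<std::uint8_t>(i) % 8);
    (void)ok;
}
```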
@@ -2415,7 +2420,7 @@ int ConvDriver<Tgpu, Tref>::RunForwardGPUReference()
{
auto out_tmp = tensor<Tgpu>(miopen::deref(outputTensor));
out.CopyFromDeviceToHost(GetStream(), out_tmp);
- for(int i = 0; i < out_tmp.data.size(); i++)
+ for(size_t i = 0; i < out_tmp.data.size(); ++i)
{
outhost.data[i] = static_cast<Tref>(out_tmp.data[i]);
}
@@ -3326,7 +3331,7 @@ int ConvDriver<Tgpu, Tref>::RunBackwardWeightsGPUReference()
{
auto dwei_tmp = tensor<Tgpu>(miopen::deref(weightTensor));
dwei.CopyFromDeviceToHost(GetStream(), dwei_tmp);
- for(int i = 0; i < dwei_tmp.data.size(); i++)
+ for(size_t i = 0; i < dwei_tmp.data.size(); ++i)
{
dwei_host.data[i] = static_cast<Tref>(dwei_tmp.data[i]);
}
@@ -3377,7 +3382,7 @@ int ConvDriver<Tgpu, Tref>::RunBackwardDataGPUReference()
{
auto din_tmp = tensor<Tgpu>(miopen::deref(inputTensor));
din.CopyFromDeviceToHost(GetStream(), din_tmp);
- for(int i = 0; i < din_tmp.data.size(); i++)
+ for(size_t i = 0; i < din_tmp.data.size(); ++i)
{
din_host.data[i] = static_cast<Tref>(din_tmp.data[i]);
}
@@ -3433,6 +3438,10 @@ std::string ConvDriver<Tgpu, Tref>::GetVerificationCacheFileName(
{
return "int8";
}
+ if(std::is_same<decltype(type), int32_t>::value)
+ {
+ return "int32";
+ }
else if(std::is_same<decltype(type), float16>::value)
{
return "float16";
2 changes: 1 addition & 1 deletion driver/ctc_driver.hpp
@@ -251,7 +251,7 @@ template <typename Tgpu, typename Tref>
int CTCDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
{
size_t probs_sz = batch_size * (num_class + 1) * max_time_step;
- size_t labels_sz = std::accumulate(labelLengths.begin(), labelLengths.end(), 0);
+ size_t labels_sz = std::accumulate(labelLengths.begin(), labelLengths.end(), 0ULL);
size_t workSpaceSize;
size_t workSpaceSizeCPU;

2 changes: 1 addition & 1 deletion driver/rnn_driver.hpp
@@ -618,7 +618,7 @@ int RNNDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
int nseq = inflags.GetValueInt("seq_len");
std::vector<int> in_n = GetInputTensorLengthsFromCmdLine();
std::size_t inputBatchLenSum;
- inputBatchLenSum = std::accumulate(in_n.begin(), in_n.begin() + nseq, 0);
+ inputBatchLenSum = std::accumulate(in_n.begin(), in_n.begin() + nseq, 0ULL);

int hid_h = inflags.GetValueInt("hid_h");
int layer = inflags.GetValueInt("num_layer");
4 changes: 2 additions & 2 deletions driver/rnn_seq_driver.hpp
@@ -766,7 +766,7 @@ inline size_t Get3DNoVECTensorSize(miopenTensorDescriptor_t& tensor)
assert(miopen::deref(tensor).IsPacked() &&
"GetTensorSize should not be used on an unpacked tensor.");
const auto len = GetTensorLengths(tensor);
- size_t sz = std::accumulate(len.begin(), len.end(), 1, std::multiplies<size_t>());
+ size_t sz = std::accumulate(len.begin(), len.end(), 1ULL, std::multiplies<size_t>());
return sz;
}

@@ -827,7 +827,7 @@ int RNNSeqDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
const std::vector<int> out_lens = GetOutputTensorLengthsFromCmdLine();

const size_t vectors_cnt_host =
- std::accumulate(sorted_seq_lens.begin(), sorted_seq_lens.end(), 0);
+ std::accumulate(sorted_seq_lens.begin(), sorted_seq_lens.end(), 0ULL);
const size_t vectors_cnt_gpu =
io_layout == miopenRNNDataSeqMajorNotPadded ? vectors_cnt_host : in_lens[0] * in_lens[1];

2 changes: 1 addition & 1 deletion driver/sum_driver.hpp
@@ -58,7 +58,7 @@ int32_t mloSumForwardRunHost(miopenTensorDescriptor_t inputDesc,

auto reduce_size = input_dims[dim];
auto output_numel =
- std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies<int64_t>());
+ std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies<int64_t>());

auto inner_size = 1ULL;
for(int32_t i = dim + 1; i < input_dims.size(); i++)
20 changes: 14 additions & 6 deletions driver/tensor_driver.hpp
@@ -181,6 +181,14 @@ inline int SetTensorNd(miopenTensorDescriptor_t t,
return miopenSetTensorDescriptor(t, data_type, len.size(), len.data(), strides.data());
}

+ inline int SetTensorNd(miopenTensorDescriptor_t t,
+ std::vector<std::size_t>& len,
+ std::vector<std::size_t>& strides,
+ miopenDataType_t data_type = miopenFloat)
+ {
+ return miopenSetTensorDescriptorV2(t, data_type, len.size(), len.data(), strides.data());
+ }
+
inline int SetTensorNd(miopenTensorDescriptor_t t,
std::vector<int>& len,
const std::string& layout,
@@ -208,10 +216,10 @@ inline int SetTensorNd(miopenTensorDescriptor_t t,
return SetTensorNd(t, len, data_type);
}

- std::vector<int> strides;
- miopen::tensor_layout_to_strides(len, len_layout, layout, strides);
-
- return SetTensorNd(t, len, strides, data_type);
+ std::vector<std::size_t> strides2;
+ std::vector<std::size_t> len2(len.cbegin(), len.cend());
+ miopen::tensor_layout_to_strides(len2, len_layout, layout, strides2);
+ return SetTensorNd(t, len2, strides2, data_type);
}

// This function ignores tensor strides completely and its result should not be interpreted as
@@ -222,8 +230,8 @@ inline size_t GetTensorSize(const miopenTensorDescriptor_t& tensor)
{
assert(miopen::deref(tensor).IsPacked() &&
"GetTensorSize should not be used on an unpacked tensor.");
- const auto len = GetTensorLengths(tensor);
- const auto vectorLength = GetTensorVectorLength(tensor);
+ const auto len = GetTensorLengths(tensor);
+ const size_t vectorLength = GetTensorVectorLength(tensor);
size_t sz = std::accumulate(len.begin(), len.end(), vectorLength, std::multiplies<size_t>());

return sz;
10 changes: 10 additions & 0 deletions include/miopen/miopen.h
@@ -749,6 +749,16 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t
const int* dimsA,
const int* stridesA);

+ #ifdef MIOPEN_BETA_API
+ /*! @copydoc miopenSetTensorDescriptor()
+  */
+ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptorV2(miopenTensorDescriptor_t tensorDesc,
+ miopenDataType_t dataType,
+ int nbDims,
+ const size_t* dimsA,
+ const size_t* stridesA);
+ #endif
+
#ifdef MIOPEN_BETA_API
/*! @brief Set the tensor cast type
*
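Note: `miopenSetTensorDescriptorV2` mirrors `miopenSetTensorDescriptor` but takes `size_t` dimensions and strides, so large extents no longer have to pass through `int`. A usage sketch based on the declaration above, assuming the beta API is enabled by defining `MIOPEN_BETA_API` before the include (error handling elided):

```cpp
#define MIOPEN_BETA_API 1 // opt in: the V2 declaration is guarded by #ifdef MIOPEN_BETA_API
#include <cstddef>
#include <miopen/miopen.h>

int main()
{
    miopenTensorDescriptor_t desc;
    miopenCreateTensorDescriptor(&desc);

    // NCHW lengths 2x3x4x5 with packed strides, expressed as size_t.
    const std::size_t dims[4]    = {2, 3, 4, 5};
    const std::size_t strides[4] = {60, 20, 5, 1};

    miopenStatus_t rc =
        miopenSetTensorDescriptorV2(desc, miopenFloat, 4, dims, strides);

    miopenDestroyTensorDescriptor(desc);
    return rc == miopenStatusSuccess ? 0 : 1;
}
```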
6 changes: 6 additions & 0 deletions src/include/miopen/conv/problem_description.hpp
@@ -348,6 +348,12 @@ struct ProblemDescription : ProblemDescriptionBase
return in.AllDimsFitIntoInt() && weights.AllDimsFitIntoInt() && out.AllDimsFitIntoInt();
}

+ bool AllTensorsLengthsFitIntoInt() const
+ {
+ return in.AllLengthsFitIntoInt() && weights.AllLengthsFitIntoInt() &&
+ out.AllLengthsFitIntoInt();
+ }
+
void HeuristicUpdateLayouts();

void MakeNetworkConfig(std::string& conf_key) const;
2 changes: 1 addition & 1 deletion src/include/miopen/oclkernel.hpp
@@ -149,7 +149,7 @@ class OCLKernel
{
assert(!gdims.empty() && gdims.size() <= 3);
assert(!ldims.empty() && ldims.size() <= 3);
- if(std::accumulate(ldims.begin(), ldims.end(), 1, std::multiplies<size_t>{}) >
+ if(std::accumulate(ldims.begin(), ldims.end(), 1ULL, std::multiplies<size_t>{}) >
256) // FIXME: get ldims limit from runtime
{
std::fill(ldims.begin(), ldims.end(), 0);
4 changes: 2 additions & 2 deletions src/include/miopen/rnn_util.hpp
@@ -193,7 +193,7 @@ struct RNNTensorPaddingConverter
size_t total_batch = std::accumulate(
desc_array.data,
desc_array.data + desc_array.size(),
- 0,
+ 0ULL,
[](size_t x, miopenTensorDescriptor_t y) { return x + deref(y).GetLengths()[0]; });

return GetTempPackedBuffersSpace(rnn_desc, total_batch, desc_array[0].GetLengths()[1]);
@@ -255,7 +255,7 @@ struct RNNTensorBaseLayoutConverter
size_t total_batch = std::accumulate(
desc_array.data,
desc_array.data + desc_array.size(),
- 0,
+ 0ULL,
[](size_t x, miopenTensorDescriptor_t y) { return x + deref(y).GetLengths()[0]; });

return GetTempPackedBuffersSpace(rnn_desc, total_batch, desc_array[0].GetLengths()[1]);
12 changes: 12 additions & 0 deletions src/include/miopen/tensor.hpp
@@ -171,10 +171,19 @@ struct MIOPEN_EXPORT TensorDescriptor : miopenTensorDescriptor

// Use only for external API
static TensorDescriptor MakeDescriptor(miopenDataType_t t, const int* plens, int size);
+ static TensorDescriptor MakeDescriptor(miopenDataType_t t, const std::size_t* plens, int size);
static TensorDescriptor
MakeDescriptor(miopenDataType_t t, miopenTensorLayout_t layout, const int* plens, int size);
+ static TensorDescriptor MakeDescriptor(miopenDataType_t t,
+ miopenTensorLayout_t layout,
+ const std::size_t* plens,
+ int size);
static TensorDescriptor
MakeDescriptor(miopenDataType_t t, const int* plens, const int* pstrides, int size);
+ static TensorDescriptor MakeDescriptor(miopenDataType_t t,
+ const std::size_t* plens,
+ const std::size_t* pstrides,
+ int size);

bool IsVectorized() const;

@@ -205,7 +214,10 @@ struct MIOPEN_EXPORT TensorDescriptor : miopenTensorDescriptor
}

bool IsPacked() const;
+ /// Checks all lengths and strides.
bool AllDimsFitIntoInt() const;
+ /// Checks only lengths.
+ bool AllLengthsFitIntoInt() const;

bool operator==(const TensorDescriptor& rhs) const;
bool operator!=(const TensorDescriptor& rhs) const;
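Note: per the doc comments above, `AllDimsFitIntoInt` vets lengths and strides while the new `AllLengthsFitIntoInt` vets lengths only. The implementations are not part of this diff; the following is a plausible standalone sketch of the distinction, not the actual MIOpen code:

```cpp
#include <algorithm>
#include <climits>
#include <cstddef>
#include <vector>

// Helper: is every value representable as int?
static bool fits_into_int(const std::vector<std::size_t>& v)
{
    return std::all_of(v.begin(), v.end(), [](std::size_t x) {
        return x <= static_cast<std::size_t>(INT_MAX);
    });
}

// Checks only lengths: a weaker precondition for solvers that index by
// length but never form stride products in int arithmetic.
bool AllLengthsFitIntoInt(const std::vector<std::size_t>& lens)
{
    return fits_into_int(lens);
}

// Checks all lengths and strides.
bool AllDimsFitIntoInt(const std::vector<std::size_t>& lens,
                       const std::vector<std::size_t>& strides)
{
    return fits_into_int(lens) && fits_into_int(strides);
}
```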
11 changes: 7 additions & 4 deletions src/include/miopen/tensor_layout.hpp
@@ -62,18 +62,21 @@ void tensor_layout_to_strides(const std::vector<T>& len,
}
return std::accumulate(layout.begin() + pos + 1,
layout.end(),
- 1,
+ static_cast<T>(1),
[&dim_to_len](T accumulator, char l) {
return accumulator * dim_to_len[l];
});
});
}

+ /// \brief Version for vectorized layouts.
+ ///
+ /// \todo Generalize with non-vectorized version, 90% of code is the same.
template <typename T>
void tensor_layout_to_strides(const std::vector<T>& len,
const std::string& len_layout,
const std::string& layout,
- const int vector,
+ const std::size_t vector_size,
std::vector<T>& strides)
{
const std::string base_layout = layout.substr(0, len.size());
@@ -91,7 +94,7 @@ void tensor_layout_to_strides(const std::vector<T>& len,
len_layout.begin(),
len_layout.end(),
std::back_inserter(strides),
- [&base_layout, &vector, &dim_to_len](char cur_layout_char) {
+ [&base_layout, &vector_size, &dim_to_len](char cur_layout_char) {
auto pos = base_layout.find(cur_layout_char);
if(pos == std::string::npos)
{
@@ -100,7 +103,7 @@ void tensor_layout_to_strides(const std::vector<T>& len,
return std::accumulate(
base_layout.begin() + pos + 1,
base_layout.end(),
- vector,
+ vector_size,
[&dim_to_len](T accumulator, char l) { return accumulator * dim_to_len[l]; });
});
}
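Note: `tensor_layout_to_strides` assigns each dimension a stride equal to the product of the lengths of all dimensions that follow it in the memory layout; the vectorized overload above simply seeds that product with `vector_size` instead of 1. A standalone sketch of the non-vectorized computation (the free-function form and name are ours, and it assumes every character of `len_layout` occurs in `layout`):

```cpp
#include <cstddef>
#include <iostream>
#include <map>
#include <numeric>
#include <string>
#include <vector>

std::vector<std::size_t> layout_to_strides(const std::vector<std::size_t>& len,
                                           const std::string& len_layout, // order of `len`
                                           const std::string& layout)     // memory order
{
    // Map each dimension name to its length.
    std::map<char, std::size_t> dim_to_len;
    for(std::size_t i = 0; i < len.size(); ++i)
        dim_to_len[len_layout[i]] = len[i];

    std::vector<std::size_t> strides;
    for(char c : len_layout)
    {
        const auto pos = layout.find(c);
        // Stride of `c` = product of lengths of all dims after it in memory.
        strides.push_back(std::accumulate(
            layout.begin() + pos + 1, layout.end(), std::size_t{1},
            [&](std::size_t acc, char l) { return acc * dim_to_len[l]; }));
    }
    return strides;
}

int main()
{
    for(auto s : layout_to_strides({2, 3, 4, 5}, "NCHW", "NHWC"))
        std::cout << s << ' ';
}
```

For a 2x3x4x5 NCHW tensor stored in NHWC order this prints `60 1 15 3`; stored in NCHW order it would print `60 20 5 1`.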
40 changes: 0 additions & 40 deletions src/mlo_dir_conv.cpp
@@ -49,15 +49,6 @@
#include <miopen/sqlite_db.hpp>
#endif

- // Only select the first applicable igemm solver due to long compilation time
- // (JIRA SWDEV-227826)
- /// \todo enable all applicable solvers of igemm after fixing slow compilation
- #define WORKAROUND_SWDEV_227826 0
-
- #if WORKAROUND_SWDEV_227826
- MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS)
- #endif
-

miopen::PerformanceDb miopen::GetDb(const miopen::ExecutionContext& ctx)
{
return {DbKinds::PerfDb, ctx.GetPerfDbPath(), ctx.GetUserPerfDbPath()};
@@ -260,30 +251,15 @@ std::vector<std::pair<std::string, size_t>>
FindAllImplicitGemmWorkspaceSizes(const miopen::ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem)
{
- #if WORKAROUND_SWDEV_227826
- if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS)))
- return GetImplicitGemmSolvers().GetWorkspaceSizes(ctx, problem);
- else
- return GetImplicitGemmSolvers().GetWorkspaceSizes(ctx, problem, 1);
- #else
return GetImplicitGemmSolvers().GetWorkspaceSizes(ctx, problem);
- #endif
}

std::vector<miopen::solver::ConvSolution>
FindAllImplicitGemmSolutions(const miopen::ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem,
const miopen::AnyInvokeParams& invoke_ctx)
{
- #if WORKAROUND_SWDEV_227826
- if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS)))
- return GetImplicitGemmSolvers().SearchForAllSolutions(ctx, problem, GetDb(ctx), invoke_ctx);
- else
- return GetImplicitGemmSolvers().SearchForAllSolutions(
- ctx, problem, GetDb(ctx), invoke_ctx, 1);
- #else
return GetImplicitGemmSolvers().SearchForAllSolutions(ctx, problem, GetDb(ctx), invoke_ctx);
- #endif
}

std::vector<miopen::solver::ConvSolution>
@@ -313,31 +289,15 @@ std::vector<std::pair<std::string, size_t>>
FindImplicitGemmWrWWorkspaceSizes(const miopen::ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem)
{
- #if WORKAROUND_SWDEV_227826
- if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS)))
- return GetImplicitGemmWrWSolvers().GetWorkspaceSizes(ctx, problem);
- else
- return GetImplicitGemmWrWSolvers().GetWorkspaceSizes(ctx, problem, 1);
- #else
return GetImplicitGemmWrWSolvers().GetWorkspaceSizes(ctx, problem);
- #endif
}

std::vector<miopen::solver::ConvSolution>
FindImplicitGemmWrWAllSolutions(const miopen::ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem,
const miopen::AnyInvokeParams& invoke_ctx)
{
- #if WORKAROUND_SWDEV_227826
- if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS)))
- return GetImplicitGemmWrWSolvers().SearchForAllSolutions(
- ctx, problem, GetDb(ctx), invoke_ctx);
- else
- return GetImplicitGemmWrWSolvers().SearchForAllSolutions(
- ctx, problem, GetDb(ctx), invoke_ctx, 1);
- #else
return GetImplicitGemmWrWSolvers().SearchForAllSolutions(ctx, problem, GetDb(ctx), invoke_ctx);
- #endif
}

std::vector<miopen::solver::ConvSolution>
(The remaining 19 changed files are not shown in this view.)
