diff --git a/driver/argmax_driver.hpp b/driver/argmax_driver.hpp index d0ca256433..f3a8aed1ac 100644 --- a/driver/argmax_driver.hpp +++ b/driver/argmax_driver.hpp @@ -54,7 +54,7 @@ int32_t mloArgmaxForwardRunHost(miopenTensorDescriptor_t inputDesc, int32_t reduce_size = static_cast(input_dims[dim]); auto output_numel = - std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); + std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies()); auto inner_size = std::accumulate( input_dims.begin() + dim + 1, input_dims.end(), 1ULL, std::multiplies()); diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 5e5ca32047..6e9e5fb0b6 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -205,7 +205,7 @@ class GpumemTensor return; } - for(int i = 0; i < sz; ++i) + for(size_t i = 0; i < sz; ++i) { /// \anchor move_rand /// Generate random value, even if buffer is unused. This provides the same @@ -1559,7 +1559,7 @@ int ConvDriver::AllocateBuffersAndCopy() if(!biasFileName.empty()) read = readBufferFromFile(b_int8.data(), b_sz, biasFileName.c_str()); if(!read) - for(int i = 0; i < b_sz; i++) + for(size_t i = 0; i < b_sz; ++i) b_int8[i] = static_cast(i % 8) + prng::gen_canonical(); } std::ignore = b.AllocOnDeviceAndInit(q, ctx, b_sz, b_int8); @@ -1602,15 +1602,20 @@ int ConvDriver::AllocateBuffersAndCopy() if(!is_gpualloc) { - for(int i = 0; i < b_sz; i++) + for(size_t i = 0; i < b_sz; ++i) { if(!b_read) { - b.GetVector()[i] = static_cast(i % 8) // + /// (i % 8) can't be converted to F8 type as there is no suitable + /// conversion, but we have conversions from int and from uint8_t. + /// int is not good as it would produce negative results + /// after truncation of size_t, while we want positive values. + /// uint8_t is fine because (i % 8) fits into 3 bits. + b.GetVector()[i] = static_cast(static_cast(i) % 8) // + (is_fp8 ? 
prng::gen_A_to_B(Data_min, Data_max) // : prng::gen_canonical()); } - db.GetVector()[i] = static_cast(i % 8) // + db.GetVector()[i] = static_cast(static_cast(i) % 8) // + (is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) // : prng::gen_canonical()); } @@ -2415,7 +2420,7 @@ int ConvDriver::RunForwardGPUReference() { auto out_tmp = tensor(miopen::deref(outputTensor)); out.CopyFromDeviceToHost(GetStream(), out_tmp); - for(int i = 0; i < out_tmp.data.size(); i++) + for(size_t i = 0; i < out_tmp.data.size(); ++i) { outhost.data[i] = static_cast(out_tmp.data[i]); } @@ -3326,7 +3331,7 @@ int ConvDriver::RunBackwardWeightsGPUReference() { auto dwei_tmp = tensor(miopen::deref(weightTensor)); dwei.CopyFromDeviceToHost(GetStream(), dwei_tmp); - for(int i = 0; i < dwei_tmp.data.size(); i++) + for(size_t i = 0; i < dwei_tmp.data.size(); ++i) { dwei_host.data[i] = static_cast(dwei_tmp.data[i]); } @@ -3377,7 +3382,7 @@ int ConvDriver::RunBackwardDataGPUReference() { auto din_tmp = tensor(miopen::deref(inputTensor)); din.CopyFromDeviceToHost(GetStream(), din_tmp); - for(int i = 0; i < din_tmp.data.size(); i++) + for(size_t i = 0; i < din_tmp.data.size(); ++i) { din_host.data[i] = static_cast(din_tmp.data[i]); } @@ -3433,6 +3438,10 @@ std::string ConvDriver::GetVerificationCacheFileName( { return "int8"; } + if(std::is_same::value) + { + return "int32"; + } else if(std::is_same::value) { return "float16"; diff --git a/driver/ctc_driver.hpp b/driver/ctc_driver.hpp index fe9c27bdb2..c7fa9f02e6 100644 --- a/driver/ctc_driver.hpp +++ b/driver/ctc_driver.hpp @@ -251,7 +251,7 @@ template int CTCDriver::AllocateBuffersAndCopy() { size_t probs_sz = batch_size * (num_class + 1) * max_time_step; - size_t labels_sz = std::accumulate(labelLengths.begin(), labelLengths.end(), 0); + size_t labels_sz = std::accumulate(labelLengths.begin(), labelLengths.end(), 0ULL); size_t workSpaceSize; size_t workSpaceSizeCPU; diff --git a/driver/rnn_driver.hpp b/driver/rnn_driver.hpp index f93b719336..03da917e0a 
100644 --- a/driver/rnn_driver.hpp +++ b/driver/rnn_driver.hpp @@ -618,7 +618,7 @@ int RNNDriver::AllocateBuffersAndCopy() int nseq = inflags.GetValueInt("seq_len"); std::vector in_n = GetInputTensorLengthsFromCmdLine(); std::size_t inputBatchLenSum; - inputBatchLenSum = std::accumulate(in_n.begin(), in_n.begin() + nseq, 0); + inputBatchLenSum = std::accumulate(in_n.begin(), in_n.begin() + nseq, 0ULL); int hid_h = inflags.GetValueInt("hid_h"); int layer = inflags.GetValueInt("num_layer"); diff --git a/driver/rnn_seq_driver.hpp b/driver/rnn_seq_driver.hpp index 2d5636e062..8d5b720960 100644 --- a/driver/rnn_seq_driver.hpp +++ b/driver/rnn_seq_driver.hpp @@ -766,7 +766,7 @@ inline size_t Get3DNoVECTensorSize(miopenTensorDescriptor_t& tensor) assert(miopen::deref(tensor).IsPacked() && "GetTensorSize should not be used on an unpacked tensor."); const auto len = GetTensorLengths(tensor); - size_t sz = std::accumulate(len.begin(), len.end(), 1, std::multiplies()); + size_t sz = std::accumulate(len.begin(), len.end(), 1ULL, std::multiplies()); return sz; } @@ -827,7 +827,7 @@ int RNNSeqDriver::AllocateBuffersAndCopy() const std::vector out_lens = GetOutputTensorLengthsFromCmdLine(); const size_t vectors_cnt_host = - std::accumulate(sorted_seq_lens.begin(), sorted_seq_lens.end(), 0); + std::accumulate(sorted_seq_lens.begin(), sorted_seq_lens.end(), 0ULL); const size_t vectors_cnt_gpu = io_layout == miopenRNNDataSeqMajorNotPadded ? 
vectors_cnt_host : in_lens[0] * in_lens[1]; diff --git a/driver/sum_driver.hpp b/driver/sum_driver.hpp index b348d3a22f..03589e29e6 100644 --- a/driver/sum_driver.hpp +++ b/driver/sum_driver.hpp @@ -58,7 +58,7 @@ int32_t mloSumForwardRunHost(miopenTensorDescriptor_t inputDesc, auto reduce_size = input_dims[dim]; auto output_numel = - std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); + std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies()); auto inner_size = 1ULL; for(int32_t i = dim + 1; i < input_dims.size(); i++) diff --git a/driver/tensor_driver.hpp b/driver/tensor_driver.hpp index 71b7e9813b..f6868fab98 100644 --- a/driver/tensor_driver.hpp +++ b/driver/tensor_driver.hpp @@ -181,6 +181,14 @@ inline int SetTensorNd(miopenTensorDescriptor_t t, return miopenSetTensorDescriptor(t, data_type, len.size(), len.data(), strides.data()); } +inline int SetTensorNd(miopenTensorDescriptor_t t, + std::vector& len, + std::vector& strides, + miopenDataType_t data_type = miopenFloat) +{ + return miopenSetTensorDescriptorV2(t, data_type, len.size(), len.data(), strides.data()); +} + inline int SetTensorNd(miopenTensorDescriptor_t t, std::vector& len, const std::string& layout, @@ -208,10 +216,10 @@ inline int SetTensorNd(miopenTensorDescriptor_t t, return SetTensorNd(t, len, data_type); } - std::vector strides; - miopen::tensor_layout_to_strides(len, len_layout, layout, strides); - - return SetTensorNd(t, len, strides, data_type); + std::vector strides2; + std::vector len2(len.cbegin(), len.cend()); + miopen::tensor_layout_to_strides(len2, len_layout, layout, strides2); + return SetTensorNd(t, len2, strides2, data_type); } // This function ignores tensor strides completely and its result should not be interpreted as @@ -222,8 +230,8 @@ inline size_t GetTensorSize(const miopenTensorDescriptor_t& tensor) { assert(miopen::deref(tensor).IsPacked() && "GetTensorSize should not be used on an unpacked tensor."); - const auto len = 
GetTensorLengths(tensor); - const auto vectorLength = GetTensorVectorLength(tensor); + const auto len = GetTensorLengths(tensor); + const size_t vectorLength = GetTensorVectorLength(tensor); size_t sz = std::accumulate(len.begin(), len.end(), vectorLength, std::multiplies()); return sz; diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a06dc8c2a4..89a3060c9e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -749,6 +749,16 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t const int* dimsA, const int* stridesA); +#ifdef MIOPEN_BETA_API +/*! @copydoc miopenSetTensorDescriptor() + */ +MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptorV2(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t dataType, + int nbDims, + const size_t* dimsA, + const size_t* stridesA); +#endif + #ifdef MIOPEN_BETA_API /*! @brief Set the tensor cast type * diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp index 5ff81e3833..8bba1ba3c5 100644 --- a/src/include/miopen/conv/problem_description.hpp +++ b/src/include/miopen/conv/problem_description.hpp @@ -348,6 +348,12 @@ struct ProblemDescription : ProblemDescriptionBase return in.AllDimsFitIntoInt() && weights.AllDimsFitIntoInt() && out.AllDimsFitIntoInt(); } + bool AllTensorsLengthsFitIntoInt() const + { + return in.AllLengthsFitIntoInt() && weights.AllLengthsFitIntoInt() && + out.AllLengthsFitIntoInt(); + } + void HeuristicUpdateLayouts(); void MakeNetworkConfig(std::string& conf_key) const; diff --git a/src/include/miopen/oclkernel.hpp b/src/include/miopen/oclkernel.hpp index 8c00c64640..6b42aa2b3b 100644 --- a/src/include/miopen/oclkernel.hpp +++ b/src/include/miopen/oclkernel.hpp @@ -149,7 +149,7 @@ class OCLKernel { assert(!gdims.empty() && gdims.size() <= 3); assert(!ldims.empty() && ldims.size() <= 3); - if(std::accumulate(ldims.begin(), ldims.end(), 1, std::multiplies{}) > + 
if(std::accumulate(ldims.begin(), ldims.end(), 1ULL, std::multiplies{}) > 256) // FIXME: get ldims limit from runtime { std::fill(ldims.begin(), ldims.end(), 0); diff --git a/src/include/miopen/rnn_util.hpp b/src/include/miopen/rnn_util.hpp index d3a95b598f..92876b8a9a 100644 --- a/src/include/miopen/rnn_util.hpp +++ b/src/include/miopen/rnn_util.hpp @@ -193,7 +193,7 @@ struct RNNTensorPaddingConverter size_t total_batch = std::accumulate( desc_array.data, desc_array.data + desc_array.size(), - 0, + 0ULL, [](size_t x, miopenTensorDescriptor_t y) { return x + deref(y).GetLengths()[0]; }); return GetTempPackedBuffersSpace(rnn_desc, total_batch, desc_array[0].GetLengths()[1]); @@ -255,7 +255,7 @@ struct RNNTensorBaseLayoutConverter size_t total_batch = std::accumulate( desc_array.data, desc_array.data + desc_array.size(), - 0, + 0ULL, [](size_t x, miopenTensorDescriptor_t y) { return x + deref(y).GetLengths()[0]; }); return GetTempPackedBuffersSpace(rnn_desc, total_batch, desc_array[0].GetLengths()[1]); diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index 658cbcd583..437668cfcb 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -171,10 +171,19 @@ struct MIOPEN_EXPORT TensorDescriptor : miopenTensorDescriptor // Use only for external API static TensorDescriptor MakeDescriptor(miopenDataType_t t, const int* plens, int size); + static TensorDescriptor MakeDescriptor(miopenDataType_t t, const std::size_t* plens, int size); static TensorDescriptor MakeDescriptor(miopenDataType_t t, miopenTensorLayout_t layout, const int* plens, int size); + static TensorDescriptor MakeDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout, + const std::size_t* plens, + int size); static TensorDescriptor MakeDescriptor(miopenDataType_t t, const int* plens, const int* pstrides, int size); + static TensorDescriptor MakeDescriptor(miopenDataType_t t, + const std::size_t* plens, + const std::size_t* pstrides, + int size); bool 
IsVectorized() const; @@ -205,7 +214,10 @@ struct MIOPEN_EXPORT TensorDescriptor : miopenTensorDescriptor } bool IsPacked() const; + /// Checks all lengths and strides. bool AllDimsFitIntoInt() const; + /// Checks only lengths. + bool AllLengthsFitIntoInt() const; bool operator==(const TensorDescriptor& rhs) const; bool operator!=(const TensorDescriptor& rhs) const; diff --git a/src/include/miopen/tensor_layout.hpp b/src/include/miopen/tensor_layout.hpp index 93c09e171a..f5659d7dd3 100644 --- a/src/include/miopen/tensor_layout.hpp +++ b/src/include/miopen/tensor_layout.hpp @@ -62,18 +62,21 @@ void tensor_layout_to_strides(const std::vector& len, } return std::accumulate(layout.begin() + pos + 1, layout.end(), - 1, + static_cast(1), [&dim_to_len](T accumulator, char l) { return accumulator * dim_to_len[l]; }); }); } +/// \brief Version for vectorized layouts. +/// +/// \todo Generalize with non-vectorized version, 90% of code is the same. template void tensor_layout_to_strides(const std::vector& len, const std::string& len_layout, const std::string& layout, - const int vector, + const std::size_t vector_size, std::vector& strides) { const std::string base_layout = layout.substr(0, len.size()); @@ -91,7 +94,7 @@ void tensor_layout_to_strides(const std::vector& len, len_layout.begin(), len_layout.end(), std::back_inserter(strides), - [&base_layout, &vector, &dim_to_len](char cur_layout_char) { + [&base_layout, &vector_size, &dim_to_len](char cur_layout_char) { auto pos = base_layout.find(cur_layout_char); if(pos == std::string::npos) { @@ -100,7 +103,7 @@ void tensor_layout_to_strides(const std::vector& len, return std::accumulate( base_layout.begin() + pos + 1, base_layout.end(), - vector, + vector_size, [&dim_to_len](T accumulator, char l) { return accumulator * dim_to_len[l]; }); }); } diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index c498414d0f..55b450e7fc 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -49,15 +49,6 @@ #include #endif 
-// Only select the first applicable igemm solver due to long compilation time -// (JIRA SWDEV-227826) -/// \todo enable all applicable solvers of igemm after fixing slow compilation -#define WORKAROUND_SWDEV_227826 0 - -#if WORKAROUND_SWDEV_227826 -MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS) -#endif - miopen::PerformanceDb miopen::GetDb(const miopen::ExecutionContext& ctx) { return {DbKinds::PerfDb, ctx.GetPerfDbPath(), ctx.GetUserPerfDbPath()}; @@ -260,14 +251,7 @@ std::vector> FindAllImplicitGemmWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem) { -#if WORKAROUND_SWDEV_227826 - if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS))) - return GetImplicitGemmSolvers().GetWorkspaceSizes(ctx, problem); - else - return GetImplicitGemmSolvers().GetWorkspaceSizes(ctx, problem, 1); -#else return GetImplicitGemmSolvers().GetWorkspaceSizes(ctx, problem); -#endif } std::vector @@ -275,15 +259,7 @@ FindAllImplicitGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { -#if WORKAROUND_SWDEV_227826 - if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS))) - return GetImplicitGemmSolvers().SearchForAllSolutions(ctx, problem, GetDb(ctx), invoke_ctx); - else - return GetImplicitGemmSolvers().SearchForAllSolutions( - ctx, problem, GetDb(ctx), invoke_ctx, 1); -#else return GetImplicitGemmSolvers().SearchForAllSolutions(ctx, problem, GetDb(ctx), invoke_ctx); -#endif } std::vector @@ -313,14 +289,7 @@ std::vector> FindImplicitGemmWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem) { -#if WORKAROUND_SWDEV_227826 - if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS))) - return GetImplicitGemmWrWSolvers().GetWorkspaceSizes(ctx, problem); - else - return GetImplicitGemmWrWSolvers().GetWorkspaceSizes(ctx, problem, 
1); -#else return GetImplicitGemmWrWSolvers().GetWorkspaceSizes(ctx, problem); -#endif } std::vector @@ -328,16 +297,7 @@ FindImplicitGemmWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::conv::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { -#if WORKAROUND_SWDEV_227826 - if(miopen::IsEnabled(ENV(MIOPEN_DEBUG_IMPLICIT_GEMM_FIND_ALL_SOLUTIONS))) - return GetImplicitGemmWrWSolvers().SearchForAllSolutions( - ctx, problem, GetDb(ctx), invoke_ctx); - else - return GetImplicitGemmWrWSolvers().SearchForAllSolutions( - ctx, problem, GetDb(ctx), invoke_ctx, 1); -#else return GetImplicitGemmWrWSolvers().SearchForAllSolutions(ctx, problem, GetDb(ctx), invoke_ctx); -#endif } std::vector diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp index 4842ad297b..6fd8a172cf 100644 --- a/src/ocl/tensorocl.cpp +++ b/src/ocl/tensorocl.cpp @@ -1432,9 +1432,8 @@ void SetTensor(const Handle& handle, #ifndef NDEBUG if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) { - MIOPEN_LOG_I2(__func__ << std::endl - << "real descriptor: " << yDesc << std::endl - << "flat descriptor: " << yDesc_flat << std::endl); + MIOPEN_LOG_I2("real descriptor: " << yDesc); + MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); } #endif @@ -1586,9 +1585,8 @@ void ScaleTensor(const Handle& handle, #ifndef NDEBUG if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) { - MIOPEN_LOG_I2(__func__ << std::endl - << "real descriptor: " << yDesc << std::endl - << "flat descriptor: " << yDesc_flat << std::endl); + MIOPEN_LOG_I2("real descriptor: " << yDesc); + MIOPEN_LOG_I2("flat descriptor: " << yDesc_flat); } #endif @@ -1765,11 +1763,10 @@ void CopyTensor(const Handle& handle, #ifndef NDEBUG if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) { - MIOPEN_LOG_I2(__func__ << std::endl - << "src real descriptor: " << srcDesc << std::endl - << "src flat descriptor: " << srcDesc_flat << std::endl - << "dst real descriptor: " << dstDesc << std::endl - << "dst flat descriptor: " << 
dstDesc_flat << std::endl); + MIOPEN_LOG_I2("src real descriptor: " << srcDesc); + MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); + MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); + MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); } #endif @@ -1975,11 +1972,10 @@ void CastTensor(const Handle& handle, #ifndef NDEBUG if(srcDesc.GetNumDims() != srcDesc_flat.GetNumDims()) { - MIOPEN_LOG_I2(__func__ << std::endl - << "src real descriptor: " << srcDesc << std::endl - << "src flat descriptor: " << srcDesc_flat << std::endl - << "dst real descriptor: " << dstDesc << std::endl - << "dst flat descriptor: " << dstDesc_flat << std::endl); + MIOPEN_LOG_I2("src real descriptor: " << srcDesc); + MIOPEN_LOG_I2("src flat descriptor: " << srcDesc_flat); + MIOPEN_LOG_I2("dst real descriptor: " << dstDesc); + MIOPEN_LOG_I2("dst flat descriptor: " << dstDesc_flat); } #endif @@ -2259,16 +2255,14 @@ void TransformTensor(const Handle& handle, #ifndef NDEBUG if(xDesc.GetNumDims() != xDesc_flat.GetNumDims()) { - MIOPEN_LOG_I2(__func__ << std::endl - << "real descriptor: " << xDesc << std::endl - << "flat descriptor: " << xDesc_flat << std::endl); + MIOPEN_LOG_I2("x real descriptor: " << xDesc); + MIOPEN_LOG_I2("x flat descriptor: " << xDesc_flat); } if(yDesc.GetNumDims() != yDesc_flat.GetNumDims()) { - MIOPEN_LOG_I2(__func__ << std::endl - << "real descriptor: " << yDesc << std::endl - << "flat descriptor: " << yDesc_flat << std::endl); + MIOPEN_LOG_I2("y real descriptor: " << yDesc); + MIOPEN_LOG_I2("y flat descriptor: " << yDesc_flat); } #endif diff --git a/src/rnn.cpp b/src/rnn.cpp index 6caf17a742..c23b6a0bc7 100644 --- a/src/rnn.cpp +++ b/src/rnn.cpp @@ -527,7 +527,7 @@ size_t RNNDescriptor::GetWorkspaceSize(Handle& handle, std::size_t total_sequence_len = 0; total_sequence_len = std::accumulate( - xDesc.data, xDesc.data + seqLength, 0, [](size_t x, miopenTensorDescriptor_t y) { + xDesc.data, xDesc.data + seqLength, 0ULL, [](size_t x, miopenTensorDescriptor_t y) { 
return x + deref(y).GetLengths()[0]; }); @@ -589,7 +589,7 @@ size_t RNNDescriptor::GetReserveSize(Handle& /* handle */, } std::size_t inputBatchLenSum = 0; inputBatchLenSum = std::accumulate( - xDesc.data, xDesc.data + seqLength, 0, [](size_t x, miopenTensorDescriptor_t y) { + xDesc.data, xDesc.data + seqLength, 0ULL, [](size_t x, miopenTensorDescriptor_t y) { return x + deref(y).GetLengths()[0]; }); return GetReserveSize(inputBatchLenSum); @@ -646,7 +646,7 @@ size_t RNNDescriptor::GetRNNInputSuperTensorSize(Handle& /* handle */, if(paddingMode == miopenRNNIONotPadded) { inputBatchLenSum = std::accumulate( - xDesc.data, xDesc.data + seqLength, 0, [](size_t x, miopenTensorDescriptor_t y) { + xDesc.data, xDesc.data + seqLength, 0ULL, [](size_t x, miopenTensorDescriptor_t y) { return x + deref(y).GetLengths()[0]; }); } diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index abd286509d..5d8ed5d03d 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -49,7 +49,7 @@ bool ConvDirectNaiveConvBwd::IsApplicable(const ExecutionContext& ctx, if(!problem.IsDirectionBackwardData()) return false; - if(!problem.AllTensorsDimsFitIntoInt()) + if(!problem.AllTensorsLengthsFitIntoInt()) return false; if(!problem.IsLayoutDefault() && !problem.IsLayoutNHWC()) return false; diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 8e38537be4..c9ca46ffeb 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -56,7 +56,7 @@ bool ConvDirectNaiveConvFwd::IsApplicable(const ExecutionContext& ctx, if(!problem.IsDirectionForward()) return false; - if(!problem.AllTensorsDimsFitIntoInt()) + if(!problem.AllTensorsLengthsFitIntoInt()) return false; if(problem.IsTensorsCasted()) diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 
95e7e75f7b..c937ec63cc 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -56,7 +56,7 @@ bool ConvDirectNaiveConvWrw::IsApplicable(const ExecutionContext& ctx, if(!problem.IsDirectionBackwardWrW()) return false; - if(!problem.AllTensorsDimsFitIntoInt()) + if(!problem.AllTensorsLengthsFitIntoInt()) return false; if(problem.IsTensorsCasted()) { diff --git a/src/tensor.cpp b/src/tensor.cpp index 484b2cda76..1d2ed405af 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -272,6 +272,12 @@ TensorDescriptor TensorDescriptor::MakeDescriptor(miopenDataType_t t, const int* return MakeDescriptor(t, GetDefaultLayout(), plens, size); } +TensorDescriptor +TensorDescriptor::MakeDescriptor(miopenDataType_t t, const std::size_t* plens, int size) +{ + return MakeDescriptor(t, GetDefaultLayout(), plens, size); +} + TensorDescriptor TensorDescriptor::MakeDescriptor(miopenDataType_t t, miopenTensorLayout_t layout, const int* plens, @@ -283,6 +289,17 @@ TensorDescriptor TensorDescriptor::MakeDescriptor(miopenDataType_t t, return {t, layout, std::vector(plens, plens + size)}; } +TensorDescriptor TensorDescriptor::MakeDescriptor(miopenDataType_t t, + miopenTensorLayout_t layout, + const std::size_t* plens, + int size) +{ + if(plens == nullptr || size <= 0) + MIOPEN_THROW(miopenStatusInvalidValue); + + return {t, layout, std::vector(plens, plens + size)}; +} + TensorDescriptor TensorDescriptor::MakeDescriptor(miopenDataType_t t, const int* plens, const int* pstrides, @@ -294,6 +311,19 @@ TensorDescriptor TensorDescriptor::MakeDescriptor(miopenDataType_t t, return {t, std::vector(plens, plens + size), std::vector(pstrides, pstrides + size)}; } +TensorDescriptor TensorDescriptor::MakeDescriptor(miopenDataType_t t, + const std::size_t* plens, + const std::size_t* pstrides, + int size) +{ + if(plens == nullptr || pstrides == nullptr || size <= 0) + MIOPEN_THROW(miopenStatusInvalidValue); + + return {t, + std::vector(plens, plens + size), + 
std::vector(pstrides, pstrides + size)}; +} + void TensorDescriptor::CalculateStrides() { if(lens.empty()) @@ -425,7 +455,7 @@ std::size_t TensorDescriptor::GetNumBytes() const bool TensorDescriptor::IsPacked() const { return this->packed; } -bool TensorDescriptor::AllDimsFitIntoInt() const +bool TensorDescriptor::AllLengthsFitIntoInt() const { if(std::any_of(lens.cbegin(), lens.cend(), [](std::size_t x) { return x > std::numeric_limits::max(); @@ -433,6 +463,13 @@ bool TensorDescriptor::AllDimsFitIntoInt() const { return false; } + return true; +} + +bool TensorDescriptor::AllDimsFitIntoInt() const +{ + if(!AllLengthsFitIntoInt()) + return false; if(std::any_of(strides.cbegin(), strides.cend(), [](std::size_t x) { return x > std::numeric_limits::max(); })) diff --git a/src/tensor_api.cpp b/src/tensor_api.cpp index 1b0209ecd2..494994b1d6 100644 --- a/src/tensor_api.cpp +++ b/src/tensor_api.cpp @@ -199,6 +199,33 @@ extern "C" miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t ten }); } +extern "C" miopenStatus_t miopenSetTensorDescriptorV2(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t dataType, + int nbDims, + const size_t* dimsA, + const size_t* stridesA) +{ + if(miopen::IsLoggingFunctionCalls()) + { + const miopen::logger::CArray dim(dimsA, nbDims); + const miopen::logger::CArray stride(stridesA, nbDims); + MIOPEN_LOG_FUNCTION(tensorDesc, dataType, nbDims, dim.values, stride.values); + } + + return miopen::try_([&] { + if(stridesA == nullptr) + { + miopen::deref(tensorDesc) = + miopen::TensorDescriptor::MakeDescriptor(dataType, dimsA, nbDims); + } + else + { + miopen::deref(tensorDesc) = + miopen::TensorDescriptor::MakeDescriptor(dataType, dimsA, stridesA, nbDims); + } + }); +} + extern "C" miopenStatus_t miopenSetTensorCastType(miopenTensorDescriptor_t tensorDesc, miopenDataType_t cast_type) { diff --git a/test/conv3d.hpp b/test/conv3d.hpp index 2b49c0f10e..33542587f4 100644 --- a/test/conv3d.hpp +++ b/test/conv3d.hpp @@ -56,5 +56,11 @@ 
struct conv3d_driver : conv_driver this->add(this->in_layout, "in_layout", this->generate_data({"NCDHW"})); this->add(this->fil_layout, "fil_layout", this->generate_data({"NCDHW"})); this->add(this->out_layout, "out_layout", this->generate_data({"NCDHW"})); + this->add(this->deterministic, "deterministic", this->generate_data({false})); + this->add(this->tensor_vect, "tensor_vect", this->generate_data({0})); + this->add(this->vector_length, "vector_length", this->generate_data({1})); + // Only valid for int8 input and weights + this->add(this->output_type, "output_type", this->generate_data({"int32"})); + this->add(this->int8_vectorize, "int8_vectorize", this->generate_data({false})); } }; diff --git a/test/conv3d_find2.cpp b/test/conv3d_find2.cpp index 0a36c8958a..198b6890cc 100644 --- a/test/conv3d_find2.cpp +++ b/test/conv3d_find2.cpp @@ -56,6 +56,12 @@ struct conv3d_find2_driver : conv_driver this->add(this->in_layout, "in_layout", this->generate_data({"NCDHW"})); this->add(this->fil_layout, "fil_layout", this->generate_data({"NCDHW"})); this->add(this->out_layout, "out_layout", this->generate_data({"NCDHW"})); + this->add(this->deterministic, "deterministic", this->generate_data({false})); + this->add(this->tensor_vect, "tensor_vect", this->generate_data({0})); + this->add(this->vector_length, "vector_length", this->generate_data({1})); + // Only valid for int8 input and weights + this->add(this->output_type, "output_type", this->generate_data({"int32"})); + this->add(this->int8_vectorize, "int8_vectorize", this->generate_data({false})); } }; diff --git a/test/conv_common.hpp b/test/conv_common.hpp index acb29dc29f..9bcba030fc 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -2152,9 +2152,12 @@ struct conv_driver : test_driver bool is_bfloat16 = (input.desc.GetType() == miopenBFloat16 && weights.desc.GetType() == miopenBFloat16); - // bfloat16 is not supported for conv3d if(is_bfloat16 && !(filter.spatialDim == 2)) + { + show_command(); + 
std::cout << "Skipped: bfloat16 is supported for 2D conv only" << std::endl; return; + } if(((filter.mode == miopenTranspose) && ((filter.group_count == 1 && in_c_len == wei_k_len) || @@ -2171,6 +2174,8 @@ struct conv_driver : test_driver { if(miopen::any_of(filter.GetConvStrides(), [](auto v) { return v == 0; })) { + show_command(); + std::cout << "Skipped: stride[i] == 0" << std::endl; return; } @@ -2198,13 +2203,19 @@ struct conv_driver : test_driver if(miopen::any_of(out_spatial_len, [](auto v) { return v <= 0; })) { + show_command(); + std::cout << "Skipped: out_spatial_len[i] <= 0" << std::endl; return; } } else if(filter.paddingMode == miopenPaddingValid) { if(miopen::any_of(filter.GetConvStrides(), [](auto v) { return v == 0; })) + { + show_command(); + std::cout << "Skipped: stride[i] == 0" << std::endl; return; + } std::vector out_spatial_len(spatial_dim); @@ -2220,6 +2231,8 @@ struct conv_driver : test_driver if(miopen::any_of(out_spatial_len, [](auto v) { return v <= 0; })) { + show_command(); + std::cout << "Skipped: out_spatial_len[i] <= 0" << std::endl; return; } } @@ -2293,16 +2306,6 @@ struct conv_driver : test_driver } #endif - // bwd53 kernel (large images supported) doesnt support stride !=1 and dilation and - // pad. 
- if(filter.GetSpatialDimension() == 2 && in_spatial_len[1] >= 2048 && - ((filter.GetConvStrides()[0] != 1) || (filter.GetConvStrides()[1] != 1) || - (filter.GetConvDilations()[0] != 1) || (filter.GetConvDilations()[1] != 1) || - (filter.GetConvPads()[1] != 0) || (filter.GetConvPads()[0] != 0))) - { - return; - } - input.generate(gen_positive_value); output.generate(gen_positive_value); weights.generate(gen_sign_value); @@ -2429,6 +2432,12 @@ struct conv_driver : test_driver search, int8_vectorize}); } + else + { + show_command(); + std::cout << "FAILED: bad output_type: '" << output_type << '\'' + << std::endl; + } } else { diff --git a/test/cpu_argmax.hpp b/test/cpu_argmax.hpp index c70487019e..08f40c9c01 100644 --- a/test/cpu_argmax.hpp +++ b/test/cpu_argmax.hpp @@ -36,7 +36,7 @@ void cpu_argmax_forward(tensor input, tensor& ref_output, int32_t dim) auto reduce_size = input_dims[dim]; auto output_numel = - std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); + std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies()); auto inner_size = 1ULL; for(int32_t i = dim + 1; i < input_dims.size(); i++) diff --git a/test/cpu_sum.hpp b/test/cpu_sum.hpp index 8898a5654b..73d68eb9b8 100644 --- a/test/cpu_sum.hpp +++ b/test/cpu_sum.hpp @@ -39,7 +39,7 @@ void cpu_sum_forward(tensor input, auto reduce_size = input_dims[dim]; auto output_numel = - std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); + std::accumulate(output_dims.begin(), output_dims.end(), 1LL, std::multiplies()); auto inner_size = 1ULL; for(int32_t i = dim + 1; i < input_dims.size(); i++) diff --git a/test/ctc.cpp b/test/ctc.cpp index 1c759220f2..4dfd452d50 100644 --- a/test/ctc.cpp +++ b/test/ctc.cpp @@ -776,7 +776,7 @@ struct ctc_driver : test_driver losses = tensor{lossesDims}; std::fill(losses.begin(), losses.end(), T(0)); - size_t labels_sz = std::accumulate(labelLengths.begin(), labelLengths.end(), 0); + size_t labels_sz = 
std::accumulate(labelLengths.begin(), labelLengths.end(), 0ULL); auto labels = std::vector(labels_sz); int blank_lb = ctcLossDesc.blank_label_id; diff --git a/test/include_inliner.cpp b/test/include_inliner.cpp index b4f1e4d5aa..62e6510d3f 100644 --- a/test/include_inliner.cpp +++ b/test/include_inliner.cpp @@ -37,24 +37,24 @@ namespace miopen { namespace tests { -static int Child(std::string_view cmd, const fs::path& path) -{ - return miopen::Process{cmd}("-source " + path); -} - class InlinerTest { + const TmpDir test_srcs{"test_include_inliner"}; + + int Child(const fs::path& exe, const fs::path& source) const + { + return test_srcs.Execute(exe.string(), "-source " + source); + } + public: void Run(const fs::path& exe_path) const { - const TmpDir test_srcs{"test_include_inliner"}; - const auto addkernels = - miopen::make_executable_name(exe_path.parent_path() / "addkernels").string(); + const auto addkernels = make_executable_name(exe_path.parent_path() / "addkernels"); const auto header_filename = "header.h"; - const auto asm_src = test_srcs.path / "valid.s"; - const auto valid_src = test_srcs.path / "valid.cl"; - const auto invalid_src = test_srcs.path / "invalid.cl"; - const auto header_src = test_srcs.path / header_filename; + const auto asm_src = test_srcs / "valid.s"; + const auto valid_src = test_srcs / "valid.cl"; + const auto invalid_src = test_srcs / "invalid.cl"; + const auto header_src = test_srcs / header_filename; // clang-format-off std::ofstream(valid_src.c_str()) << "#include <" << header_filename << ">\n" diff --git a/test/lstm_common.hpp b/test/lstm_common.hpp index d2b7d1a077..85d17dc138 100644 --- a/test/lstm_common.hpp +++ b/test/lstm_common.hpp @@ -465,7 +465,7 @@ struct verify_forward_infer_lstm : verify_forward_lstm { reserveSpaceSize /= sizeof(T); reserveSpaceSize -= - nLayers * std::accumulate(batch_seq.begin(), batch_seq.begin() + seqLength, 0) * + nLayers * std::accumulate(batch_seq.begin(), batch_seq.begin() + seqLength, 0ULL) * 
hiddenSize * bi; reserveSpaceSize *= 2; reserveSpaceSize *= sizeof(T); @@ -771,7 +771,7 @@ struct verify_forward_train_lstm : verify_forward_lstm outputCPPDescs, outputDescs, batch_seq, out_h, miopen::deref(rnnDesc).dataType); size_t inputBatchLenSum = - std::accumulate(batch_seq.begin(), batch_seq.begin() + seqLength, 0); + std::accumulate(batch_seq.begin(), batch_seq.begin() + seqLength, 0ULL); size_t reserveSpaceSize; reserveSpaceSize = 2 * 6 * miopen::deref(rnnDesc).nLayers * inputBatchLenSum * out_h; @@ -1075,7 +1075,8 @@ verify_backward_data_lstm::cpu() const std::vector dhx(initHidden.size()); std::vector dcx(initHidden.size()); - size_t inputBatchLenSum = std::accumulate(batch_seq.begin(), batch_seq.begin() + seqLength, 0); + size_t inputBatchLenSum = + std::accumulate(batch_seq.begin(), batch_seq.begin() + seqLength, 0ULL); size_t reserveSpaceSize; reserveSpaceSize = 2ULL * 6 * miopen::deref(rnnDesc).nLayers * inputBatchLenSum * hiddenSize * ((dirMode != 0) ? 2 : 1); @@ -1778,7 +1779,7 @@ struct lstm_basic_driver : test_driver std::vector rsvgpu(reserveSpaceSize, T(0)); size_t inputBatchLenSum = - std::accumulate(batchSeq.begin(), batchSeq.begin() + seqLength, 0); + std::accumulate(batchSeq.begin(), batchSeq.begin() + seqLength, 0ULL); reserveSpaceSize = 2ULL * 6 * numLayers * inputBatchLenSum * hiddenSize * ((dirMode != 0) ? 
2 : 1); if(useDropout != 0) diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 33bcb7164f..231b635a63 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -283,7 +283,7 @@ struct verify_backward_pooling std::size_t mx_idx_dim = mx_idx; mx_idx_dim /= std::accumulate(in_dim.begin() + i + 3, in_dim.end(), - 1, + 1ULL, std::multiplies()); mx_idx_dim %= in_dim[i + 2]; idx[i + 2] = mx_idx_dim; diff --git a/test/rnn_util.hpp b/test/rnn_util.hpp index 680c52582f..eeeebcadf9 100644 --- a/test/rnn_util.hpp +++ b/test/rnn_util.hpp @@ -115,7 +115,7 @@ inline void createTensorDescArray(std::vector& td, inline std::tuple GetTempPackedBuffersSize(std::vector batchs, int in_vec, int out_vec) { - size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0); + size_t total_batch = std::accumulate(batchs.begin(), batchs.end(), 0ULL); size_t in_buff_size = total_batch * in_vec; size_t out_buff_size = total_batch * out_vec; @@ -131,9 +131,12 @@ inline size_t getSuperTensorSize(const std::vector& bs, bool isInput, bool isPadded) { - return static_cast(isPadded ? seqLength * maxPaddingVal - : std::accumulate(bs.begin(), bs.end(), 0)) * - static_cast(isInput ? inputSize : hiddenSize * (isBidirect ? 2 : 1)); + return (isPadded // + ? static_cast(seqLength) * maxPaddingVal + : std::accumulate(bs.begin(), bs.end(), 0ULL)) // + * (isInput // + ? static_cast(inputSize) + : static_cast(hiddenSize) * (isBidirect ? 2 : 1)); } template diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp index 3895dd9583..0e7ff7548f 100644 --- a/test/tensor_holder.hpp +++ b/test/tensor_holder.hpp @@ -183,7 +183,12 @@ struct tensor tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) { - assert(desc.GetType() == miopen_type{}); + assert(desc.GetType() == miopen_type{} + /// In the driver, T is input tensor type, but output tensor holders + /// are instantiated with T as well. 
This leads to false assertion + /// failures when T is INT8 because output type is different. + /// \todo Get rid of this hack when the driver is improved: + || (miopen_type{} == miopenInt8 && desc.GetType() == miopenInt32)); data.resize(desc.GetElementSpace()); } diff --git a/test/tensor_layout.hpp b/test/tensor_layout.hpp deleted file mode 100644 index 65d014d936..0000000000 --- a/test/tensor_layout.hpp +++ /dev/null @@ -1,80 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ -#ifndef GUARD_TENSOR_LAYOUT_HPP -#define GUARD_TENSOR_LAYOUT_HPP - -#include -#include -#include -#include -#include -#include - -template -void tensor_layout_to_strides(const std::vector& len, - const std::string& len_layout, - const std::string& layout, - std::vector& strides) -{ - // Bind the layout and the dimension lengths together into a map. - std::map dim_to_len; - std::transform(len.begin(), - len.end(), - len_layout.begin(), - std::inserter(dim_to_len, dim_to_len.end()), - [](T l, char dim) { return std::make_pair(dim, l); }); - - // Now construct the strides according to layout by multiply the - // dimension lengths together. - std::transform(len_layout.begin(), - len_layout.end(), - std::back_inserter(strides), - [&layout, &dim_to_len](char cur_layout_char) { - auto pos = layout.find(cur_layout_char); - if(pos == std::string::npos) - { - MIOPEN_THROW(std::string("mismatched layout string, unexpect char: ") - .append(1, cur_layout_char)); - } - return std::accumulate(layout.begin() + pos + 1, - layout.end(), - 1, - [&dim_to_len](T accumulator, char l) { - return accumulator * dim_to_len[l]; - }); - }); -} - -inline std::string tensor_layout_get_default(int size) -{ - if(size == 4) - return "NCHW"; - if(size == 5) - return "NCDHW"; - return ""; -} - -#endif