Skip to content

Commit

Permalink
improve error message of cudnn operators (apache#11886)
Browse files — browse the repository at this point in the history
  • Loading branch information
haojin2 authored and eric-haibin-lin committed Aug 8, 2018
1 parent ed41670 commit e0ee31a
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 17 deletions.
37 changes: 30 additions & 7 deletions src/operator/nn/cudnn/cudnn_convolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,7 @@ class CuDNNConvolutionOp {
const int kMaxAlgos = 10;
int nalgo = kMaxAlgos;
int i = 0;
size_t min_memory_needs = 0;
// Forward Algorithm Find/Get, v6 and earlier
if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) {
// In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is
Expand Down Expand Up @@ -720,10 +721,16 @@ class CuDNNConvolutionOp {
while (i < nalgo
&& (fwd_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == conv::kLimited
&& fwd_algo[i].memory > workspace_byte)))
&& fwd_algo[i].memory > workspace_byte))) {
++i;
min_memory_needs =
(i == 0) ? fwd_algo[i].memory : std::min(min_memory_needs, fwd_algo[i].memory);
}
if (i == nalgo) {
LOG(FATAL) << "Failed to find a forward convolution algorithm.";
LOG(FATAL) << nalgo << " forward algorithms with minimum memory requirement "
<< min_memory_needs << " bytes have been tried. Workspace size is set to "
<< workspace_byte << " bytes, please consider reducing the batch/model size, "
<< "or increasing workspace size.";
} else {
forward_algo_.Set(fwd_algo[i].algo, false);
}
Expand Down Expand Up @@ -754,10 +761,17 @@ class CuDNNConvolutionOp {
while (i < nalgo
&& (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == conv::kLimited
&& bwd_filter_algo[i].memory > workspace_byte)))
&& bwd_filter_algo[i].memory > workspace_byte))) {
++i;
min_memory_needs = (i == 0) ?
bwd_filter_algo[i].memory :
std::min(min_memory_needs, bwd_filter_algo[i].memory);
}
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward filter convolution algorithm.";
LOG(FATAL) << nalgo << " backward filter algorithms with minimum memory requirement "
<< min_memory_needs << " bytes have been tried. Workspace size is set to "
<< workspace_byte << " bytes, please consider reducing the batch/model size, "
<< "or increasing workspace size.";
} else {
back_algo_w_.Set(bwd_filter_algo[i].algo, false);
}
Expand Down Expand Up @@ -788,10 +802,17 @@ class CuDNNConvolutionOp {
while (i < nalgo
&& (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == conv::kLimited
&& bwd_data_algo[i].memory > workspace_byte)))
&& bwd_data_algo[i].memory > workspace_byte))) {
++i;
min_memory_needs = (i == 0) ?
bwd_data_algo[i].memory :
std::min(min_memory_needs, bwd_data_algo[i].memory);
}
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward data convolution algorithm.";
LOG(FATAL) << nalgo << " backward data algorithms with minimum memory requirement "
<< min_memory_needs << " bytes have been tried. Workspace size is set to "
<< workspace_byte << " bytes, please consider reducing the batch/model size, "
<< "or increasing workspace size.";
} else {
back_algo_.Set(bwd_data_algo[i].algo, false);
}
Expand Down Expand Up @@ -846,7 +867,9 @@ class CuDNNConvolutionOp {
}
}
auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find ";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm.";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm. "
<< " with workspace size of " << workspace_byte << " bytes,"
<< " please consider reducing batch/model size or increasing the workspace size";
}

void GetTempSize(const OpContext& ctx) {
Expand Down
47 changes: 37 additions & 10 deletions src/operator/nn/cudnn/cudnn_deconvolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,7 @@ class CuDNNDeconvolutionOp {
const int kMaxAlgos = 10;
int nalgo = kMaxAlgos;
int i = 0;
size_t min_memory_needs = 0;
// Forward Algorithm Find/Get, v6 and earlier
if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) {
// In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is
Expand Down Expand Up @@ -653,11 +654,19 @@ class CuDNNDeconvolutionOp {
while (i < nalgo
&& (fwd_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == deconv::kLimited
&& fwd_algo[i].memory > workspace_byte)))
&& fwd_algo[i].memory > workspace_byte))) {
++i;
min_memory_needs = (i == 0) ?
fwd_algo[i].memory :
std::min(min_memory_needs, fwd_algo[i].memory);
}
if (i == nalgo) {
LOG(FATAL) << "Failed to find a 'forward' convolution algorithm " <<
"(for use in deconvolution operator backprop-to-data).";
LOG(FATAL) << nalgo << " forward algorithms"
<< " (for use in deconvolution operator backprop-to-data)"
<< " with minimum memory requirement " << min_memory_needs
<< " bytes have been tried. Workspace size is set to " << workspace_byte
<< " bytes, please consider reducing the batch/model size,"
<< " or increasing workspace size.";
} else {
forward_algo_.Set(fwd_algo[i].algo, false);
}
Expand Down Expand Up @@ -688,11 +697,19 @@ class CuDNNDeconvolutionOp {
while (i < nalgo
&& (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == deconv::kLimited
&& bwd_filter_algo[i].memory > workspace_byte)))
&& bwd_filter_algo[i].memory > workspace_byte))) {
++i;
min_memory_needs = (i == 0) ?
bwd_filter_algo[i].memory :
std::min(min_memory_needs, bwd_filter_algo[i].memory);
}
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward filter convolution algorithm " <<
"(for use in deconvolution operator backprop-to-filter).";
LOG(FATAL) << nalgo << " backward filter algorithms"
<< " (for use in deconvolution operator backprop-to-filter)"
<< " with minimum memory requirement " << min_memory_needs
<< " bytes have been tried. Workspace size is set to " << workspace_byte
<< " bytes, please consider reducing the batch/model size,"
<< " or increasing workspace size.";
} else {
back_algo_w_.Set(bwd_filter_algo[i].algo, false);
}
Expand Down Expand Up @@ -723,11 +740,19 @@ class CuDNNDeconvolutionOp {
while (i < nalgo
&& (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == deconv::kLimited
&& bwd_data_algo[i].memory > workspace_byte)))
&& bwd_data_algo[i].memory > workspace_byte))) {
++i;
min_memory_needs = (i == 0) ?
bwd_data_algo[i].memory :
std::min(min_memory_needs, bwd_data_algo[i].memory);
}
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward data convolution algorithm." <<
"(for use in deconvolution operator forward inference).";
LOG(FATAL) << nalgo << " backward data algorithms"
<< " (for use in deconvolution operator forward inference) with"
<< " minimum memory requirement " << min_memory_needs
<< " bytes have been tried. Workspace size is set to " << workspace_byte
<< " bytes, please consider reducing the batch/model size,"
<< " or increasing workspace size.";
} else {
back_algo_.Set(bwd_data_algo[i].algo, false);
}
Expand Down Expand Up @@ -788,7 +813,9 @@ class CuDNNDeconvolutionOp {
}
}
auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find ";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm.";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm"
<< " with workspace size of " << workspace_byte << " bytes,"
<< " please consider reducing batch/model size or increasing the workspace size";
}

void GetTempSize(const OpContext& ctx) {
Expand Down

0 comments on commit e0ee31a

Please sign in to comment.