Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
improve error message of cudnn operators
Browse files Browse the repository at this point in the history
  • Loading branch information
Hao Jin committed Jul 28, 2018
1 parent bd3fc88 commit d317fbf
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 22 deletions.
39 changes: 30 additions & 9 deletions src/operator/nn/cudnn/cudnn_convolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,7 @@ class CuDNNConvolutionOp {
const int kMaxAlgos = 10;
int nalgo = kMaxAlgos;
int i = 0;
size_t min_memory_needs = 0;
// Forward Algorithm Find/Get, v6 and earlier
if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) {
// In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is
Expand Down Expand Up @@ -715,10 +716,16 @@ class CuDNNConvolutionOp {
while (i < nalgo
&& (fwd_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == conv::kLimited
&& fwd_algo[i].memory > workspace_byte)))
&& fwd_algo[i].memory > workspace_byte))) {
++i;
min_memory_needs =
(i == 0) ? fwd_algo[i].memory : std::min(min_memory_needs, fwd_algo[i].memory);
}
if (i == nalgo) {
LOG(FATAL) << "Failed to find a forward convolution algorithm.";
LOG(FATAL) << nalgo << " forward algorithms with minimum memory requirement "
<< min_memory_needs << " bytes have been tried. Workspace size is set to "
<< workspace_byte << " bytes, please consider reducing the batch/model size, "
<< "or increasing workspace size.";
} else {
forward_algo_.Set(fwd_algo[i].algo, false);
}
Expand Down Expand Up @@ -749,10 +756,16 @@ class CuDNNConvolutionOp {
while (i < nalgo
&& (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == conv::kLimited
&& bwd_filter_algo[i].memory > workspace_byte)))
&& bwd_filter_algo[i].memory > workspace_byte))) {
++i;
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward filter convolution algorithm.";
min_memory_needs = (i == 0) ?
bwd_filter_algo[i].memory :
std::min(min_memory_needs, bwd_filter_algo[i].memory);
} if (i == nalgo) {
LOG(FATAL) << nalgo << " backward filter algorithms with minimum memory requirement "
<< min_memory_needs << " bytes have been tried. Workspace size is set to "
<< workspace_byte << " bytes, please consider reducing the batch/model size, "
<< "or increasing workspace size.";
} else {
back_algo_w_.Set(bwd_filter_algo[i].algo, false);
}
Expand Down Expand Up @@ -783,10 +796,16 @@ class CuDNNConvolutionOp {
while (i < nalgo
&& (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == conv::kLimited
&& bwd_data_algo[i].memory > workspace_byte)))
&& bwd_data_algo[i].memory > workspace_byte))) {
++i;
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward data convolution algorithm.";
min_memory_needs = (i == 0) ?
bwd_data_algo[i].memory :
std::min(min_memory_needs, bwd_data_algo[i].memory);
} if (i == nalgo) {
LOG(FATAL) << nalgo << " backward data algorithms with minimum memory requirement "
<< min_memory_needs << " bytes have been tried. Workspace size is set to "
<< workspace_byte << " bytes, please consider reducing the batch/model size, "
<< "or increasing workspace size.";
} else {
back_algo_.Set(bwd_data_algo[i].algo, false);
}
Expand Down Expand Up @@ -833,7 +852,9 @@ class CuDNNConvolutionOp {
}
}
auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find ";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm.";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " convolution algorithm. "
<< " with workspace size of " << workspace_byte << " bytes,"
<< " please consider reducing batch/model size or increasing the workspace size";
}

void GetTempSize(const OpContext& ctx) {
Expand Down
49 changes: 36 additions & 13 deletions src/operator/nn/cudnn/cudnn_deconvolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,7 @@ class CuDNNDeconvolutionOp {
const int kMaxAlgos = 10;
int nalgo = kMaxAlgos;
int i = 0;
size_t min_memory_needs = 0;
// Forward Algorithm Find/Get, v6 and earlier
if (CUDNN_MAJOR == 6 && param_.layout.value() == mshadow::kNHWC) {
// In cuDNNv6, for kNHWC, only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM is
Expand Down Expand Up @@ -648,11 +649,17 @@ class CuDNNDeconvolutionOp {
while (i < nalgo
&& (fwd_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == deconv::kLimited
&& fwd_algo[i].memory > workspace_byte)))
&& fwd_algo[i].memory > workspace_byte))) {
++i;
if (i == nalgo) {
LOG(FATAL) << "Failed to find a 'forward' convolution algorithm " <<
"(for use in deconvolution operator backprop-to-data).";
min_memory_needs = (i == 0) ?
fwd_algo[i].memory :
std::min(min_memory_needs, fwd_algo[i].memory);
} if (i == nalgo) {
LOG(FATAL) << nalgo << " forward algorithms (for use in deconvolution operator backprop-to-data)"
<< " with minimum memory requirement " << min_memory_needs
<< " bytes have been tried. Workspace size is set to " << workspace_byte
<< " bytes, please consider reducing the batch/model size,"
<< " or increasing workspace size.";
} else {
forward_algo_.Set(fwd_algo[i].algo, false);
}
Expand Down Expand Up @@ -683,11 +690,18 @@ class CuDNNDeconvolutionOp {
while (i < nalgo
&& (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == deconv::kLimited
&& bwd_filter_algo[i].memory > workspace_byte)))
&& bwd_filter_algo[i].memory > workspace_byte))) {
++i;
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward filter convolution algorithm " <<
"(for use in deconvolution operator backprop-to-filter).";
min_memory_needs = (i == 0) ?
bwd_filter_algo[i].memory :
std::min(min_memory_needs, bwd_filter_algo[i].memory);
} if (i == nalgo) {
LOG(FATAL) << nalgo << " backward filter algorithms"
<< " (for use in deconvolution operator backprop-to-filter)"
<< " with minimum memory requirement " << min_memory_needs
<< " bytes have been tried. Workspace size is set to " << workspace_byte
<< " bytes, please consider reducing the batch/model size,"
<< " or increasing workspace size.";
} else {
back_algo_w_.Set(bwd_filter_algo[i].algo, false);
}
Expand Down Expand Up @@ -718,11 +732,18 @@ class CuDNNDeconvolutionOp {
while (i < nalgo
&& (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS
|| (param_.cudnn_tune.value() == deconv::kLimited
&& bwd_data_algo[i].memory > workspace_byte)))
&& bwd_data_algo[i].memory > workspace_byte))) {
++i;
if (i == nalgo) {
LOG(FATAL) << "Failed to find a backward data convolution algorithm." <<
"(for use in deconvolution operator forward inference).";
min_memory_needs = (i == 0) ?
bwd_data_algo[i].memory :
std::min(min_memory_needs, bwd_data_algo[i].memory);
} if (i == nalgo) {
LOG(FATAL) << nalgo << " backward data algorithms"
<< " (for use in deconvolution operator forward inference) with"
<< " minimum memory requirement " << min_memory_needs
<< " bytes have been tried. Workspace size is set to " << workspace_byte
<< " bytes, please consider reducing the batch/model size,"
<< " or increasing workspace size.";
} else {
back_algo_.Set(bwd_data_algo[i].algo, false);
}
Expand Down Expand Up @@ -774,7 +795,9 @@ class CuDNNDeconvolutionOp {
}
}
auto mode = param_.cudnn_tune.value() == conv::kOff ? " get " : " find ";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm.";
LOG(FATAL) << "Failed to" << mode << "any " << kernel_name << " deconvolution algorithm"
<< " with workspace size of " << workspace_byte << " bytes,"
<< " please consider reducing batch/model size or increasing the workspace size";
}

void GetTempSize(const OpContext& ctx) {
Expand Down

0 comments on commit d317fbf

Please sign in to comment.