Better normalization options for SoftmaxWithLoss layer.
cdoersch committed Nov 11, 2015
1 parent 0ec116e commit 9963079
Showing 4 changed files with 87 additions and 32 deletions.
11 changes: 8 additions & 3 deletions include/caffe/loss_layers.hpp
@@ -747,6 +747,12 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

/// Read the normalization mode parameter and compute the normalizer based
/// on the blob size. If normalization_mode is VALID, the count of valid
/// outputs will be read from valid_count, unless it is -1 in which case
/// all outputs are assumed to be valid.
virtual Dtype get_normalizer(
LossParameter_NormalizationMode normalization_mode, int valid_count);

/// The internal SoftmaxLayer used to map predictions to a distribution.
shared_ptr<Layer<Dtype> > softmax_layer_;
@@ -760,9 +766,8 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
bool has_ignore_label_;
/// The label indicating that an instance should be ignored.
int ignore_label_;
/// Whether to normalize the loss by the total number of values present
/// (otherwise just by the batch size).
bool normalize_;
/// How to normalize the output loss.
LossParameter_NormalizationMode normalization_;

int softmax_axis_, outer_num_, inner_num_;
};
52 changes: 40 additions & 12 deletions src/caffe/layers/softmax_loss_layer.cpp
@@ -25,7 +25,14 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
if (has_ignore_label_) {
ignore_label_ = this->layer_param_.loss_param().ignore_label();
}
normalize_ = this->layer_param_.loss_param().normalize();
if (!this->layer_param_.loss_param().has_normalization() &&
this->layer_param_.loss_param().has_normalize()) {
normalization_ = this->layer_param_.loss_param().normalize() ?
LossParameter_NormalizationMode_VALID :
LossParameter_NormalizationMode_BATCH_SIZE;
} else {
normalization_ = this->layer_param_.loss_param().normalization();
}
}

template <typename Dtype>
@@ -48,6 +55,34 @@ void SoftmaxWithLossLayer<Dtype>::Reshape(
}
}

template <typename Dtype>
Dtype SoftmaxWithLossLayer<Dtype>::get_normalizer(
LossParameter_NormalizationMode normalization_mode, int valid_count) {
Dtype normalizer;
switch (normalization_mode) {
case LossParameter_NormalizationMode_FULL:
normalizer = Dtype(outer_num_ * inner_num_);
break;
case LossParameter_NormalizationMode_VALID:
if (valid_count == -1) {
normalizer = Dtype(outer_num_ * inner_num_);
} else {
normalizer = Dtype(valid_count);
}
break;
case LossParameter_NormalizationMode_BATCH_SIZE:
normalizer = Dtype(outer_num_);
break;
case LossParameter_NormalizationMode_NONE:
normalizer = Dtype(1);
break;
default:
LOG(FATAL) << "Unknown normalization mode: "
<< LossParameter_NormalizationMode_Name(normalization_mode);
}
return normalizer;
}

template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -71,11 +106,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
++count;
}
}
if (normalize_) {
top[0]->mutable_cpu_data()[0] = loss / count;
} else {
top[0]->mutable_cpu_data()[0] = loss / outer_num_;
}
top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
if (top.size() == 2) {
top[1]->ShareData(prob_);
}
@@ -109,12 +140,9 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
}
}
// Scale gradient
const Dtype loss_weight = top[0]->cpu_diff()[0];
if (normalize_) {
caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
} else {
caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
}
Dtype loss_weight = top[0]->cpu_diff()[0] /
get_normalizer(normalization_, count);
caffe_scal(prob_.count(), loss_weight, bottom_diff);
}
}

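To make the new modes concrete, here is a small standalone sketch (illustrative only, not part of the commit) that mirrors the switch in get_normalizer() for a hypothetical blob with outer_num_ = 2 (batch size) and inner_num_ = 3 (spatial locations), where exactly one output carries the ignore_label, so the valid count is 5:

#include <cstdio>

// Re-implementation of the four cases in SoftmaxWithLossLayer::get_normalizer(),
// kept outside Caffe so the numbers can be checked by hand.
enum Mode { FULL, VALID, BATCH_SIZE, NONE };

double normalizer(Mode mode, int outer_num, int inner_num, int valid_count) {
  switch (mode) {
    case FULL:
      return double(outer_num * inner_num);
    case VALID:
      // -1 means "no count available": treat every output as valid.
      return valid_count == -1 ? double(outer_num * inner_num)
                               : double(valid_count);
    case BATCH_SIZE:
      return double(outer_num);
    case NONE:
      return 1.0;
  }
  return 1.0;  // unreachable
}

int main() {
  const int outer_num = 2, inner_num = 3, valid_count = 5;
  // Prints: FULL=6 VALID=5 BATCH_SIZE=2 NONE=1
  std::printf("FULL=%g VALID=%g BATCH_SIZE=%g NONE=%g\n",
              normalizer(FULL, outer_num, inner_num, valid_count),
              normalizer(VALID, outer_num, inner_num, valid_count),
              normalizer(BATCH_SIZE, outer_num, inner_num, valid_count),
              normalizer(NONE, outer_num, inner_num, valid_count));
  return 0;
}

Forward then reports loss / normalizer and Backward scales the gradient by top_diff / normalizer, so FULL and VALID give the same result whenever no outputs are ignored.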
32 changes: 18 additions & 14 deletions src/caffe/layers/softmax_loss_layer.cu
@@ -49,14 +49,15 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
Dtype loss;
caffe_gpu_asum(nthreads, loss_data, &loss);
if (normalize_) {
Dtype count;
caffe_gpu_asum(nthreads, counts, &count);
loss /= count;
} else {
loss /= outer_num_;
Dtype valid_count = -1;
// Only launch another CUDA kernel if we actually need the count of valid
// outputs.
if (normalization_ == LossParameter_NormalizationMode_VALID &&
has_ignore_label_) {
caffe_gpu_asum(nthreads, counts, &valid_count);
}
top[0]->mutable_cpu_data()[0] = loss;
top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_,
valid_count);
if (top.size() == 2) {
top[1]->ShareData(prob_);
}
@@ -108,14 +109,17 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
const Dtype loss_weight = top[0]->cpu_diff()[0];
if (normalize_) {
Dtype count;
caffe_gpu_asum(nthreads, counts, &count);
caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
} else {
caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);

Dtype valid_count = -1;
// Only launch another CUDA kernel if we actually need the count of valid
// outputs.
if (normalization_ == LossParameter_NormalizationMode_VALID &&
has_ignore_label_) {
caffe_gpu_asum(nthreads, counts, &valid_count);
}
const Dtype loss_weight = top[0]->cpu_diff()[0] /
get_normalizer(normalization_, valid_count);
caffe_gpu_scal(prob_.count(), loss_weight, bottom_diff);
}
}

24 changes: 21 additions & 3 deletions src/caffe/proto/caffe.proto
@@ -420,9 +420,27 @@ message TransformationParameter {
message LossParameter {
// If specified, ignore instances with the given label.
optional int32 ignore_label = 1;
// If true, normalize each batch across all instances (including spatial
// dimensions, but not ignored instances); else, divide by batch size only.
optional bool normalize = 2 [default = true];
// How to normalize the loss for loss layers that aggregate across batches,
// spatial dimensions, or other dimensions. Currently only implemented in
// SoftmaxWithLoss layer.
enum NormalizationMode {
// Divide by the number of examples in the batch times spatial dimensions.
// Outputs that receive the ignore label will NOT be ignored in computing
// the normalization factor.
FULL = 0;
// Divide by the total number of output locations that do not take the
// ignore_label. If ignore_label is not set, this behaves like FULL.
VALID = 1;
// Divide by the batch size.
BATCH_SIZE = 2;
// Do not normalize the loss.
NONE = 3;
}
optional NormalizationMode normalization = 3 [default = VALID];
// Deprecated. Ignored if normalization is specified. If normalization
// is not specified, then setting this to false will be equivalent to
// normalization = BATCH_SIZE to be consistent with previous behavior.
optional bool normalize = 2;
}

// Messages that store parameters used by individual layer types follow, in
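As a usage sketch (illustrative only; the helper name is made up, and it assumes the C++ classes generated from this caffe.proto, i.e. caffe.pb.h), the new field can be set through the protobuf API while leaving the deprecated flag alone:

#include "caffe/proto/caffe.pb.h"

// Build a SoftmaxWithLoss layer definition that divides the loss by the
// batch size only, via the new normalization field.
caffe::LayerParameter MakeSoftmaxLossParam() {
  caffe::LayerParameter param;
  param.set_name("loss");
  param.set_type("SoftmaxWithLoss");
  param.mutable_loss_param()->set_normalization(
      caffe::LossParameter_NormalizationMode_BATCH_SIZE);
  // The deprecated normalize flag is only consulted when has_normalization()
  // is false, matching the LayerSetUp logic above; here it stays unset.
  return param;
}

In a prototxt the equivalent would be a loss_param { normalization: BATCH_SIZE } block inside the layer definition.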
