From a807f6de32213cfec9462c8df2ca1fad4f9bcbad Mon Sep 17 00:00:00 2001
From: Sheng Zha
Date: Mon, 27 Jul 2020 22:06:50 -0700
Subject: [PATCH] [NumPy] loss for np array (#17196)

* loss for np/nd array

* fix flaky
---
 python/mxnet/gluon/loss.py                     | 251 +++++++++++-------
 src/operator/nn/ctc_loss.cc                    |   1 +
 src/operator/nn/ctc_loss.cu                    |   1 +
 .../tensor/broadcast_reduce_norm_value.cc      |   1 +
 tests/python/gpu/test_gluon_gpu.py             |   1 +
 tests/python/unittest/test_loss.py             |  28 +-
 tests/python/unittest/test_numpy_loss.py       | 235 ++++++++++++++++
 tests/python/unittest/test_numpy_op.py         | 136 +++++-----
 8 files changed, 470 insertions(+), 184 deletions(-)
 create mode 100644 tests/python/unittest/test_numpy_loss.py

diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py
index 852a9a791d53..bc447b0f1c55 100644
--- a/python/mxnet/gluon/loss.py
+++ b/python/mxnet/gluon/loss.py
@@ -74,6 +74,26 @@ def _reshape_like(F, x, y):
     return F.reshape_like(x, y)
 
 
+def _batch_mean(F, loss, batch_axis):
+    """Return mean on the specified batch axis, not keeping the axis"""
+    if is_np_array():
+        axes = list(range(loss.ndim))
+        del axes[batch_axis]
+        return F.np.mean(loss, axis=axes)
+    else:
+        return F.mean(loss, axis=batch_axis, exclude=True)
+
+def _batch_sum(F, loss, batch_axis):
+    """Return sum on the specified batch axis, not keeping the axis"""
+    if is_np_array():
+        axes = list(range(loss.ndim))
+        del axes[batch_axis]
+        return F.np.sum(loss, axis=axes)
+    else:
+        return F.sum(loss, axis=batch_axis, exclude=True)
+
+
+
 class Loss(HybridBlock):
     """Base class for loss.
 
@@ -142,16 +162,11 @@ def __init__(self, weight=1., batch_axis=0, **kwargs):
         super(L2Loss, self).__init__(weight, batch_axis, **kwargs)
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
+        square_fn = F.np.square if is_np_array() else F.square
         label = _reshape_like(F, label, pred)
-        loss = F.np.square(label - pred) if is_np_array() else F.square(label - pred)
+        loss = square_fn(label - pred)
         loss = _apply_weighting(F, loss, self._weight / 2, sample_weight)
-        if is_np_array():
-            if F is ndarray:
-                return F.np.mean(loss, axis=tuple(range(1, loss.ndim)))
-            else:
-                return F.npx.batch_flatten(loss).mean(axis=1)
-        else:
-            return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class L1Loss(Loss):
@@ -187,16 +202,11 @@ def __init__(self, weight=None, batch_axis=0, **kwargs):
         super(L1Loss, self).__init__(weight, batch_axis, **kwargs)
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
+        abs_fn = F.np.abs if is_np_array() else F.abs
         label = _reshape_like(F, label, pred)
-        loss = F.np.abs(label - pred) if is_np_array() else F.abs(label - pred)
+        loss = abs_fn(label - pred)
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        if is_np_array():
-            if F is ndarray:
-                return F.np.mean(loss, axis=tuple(range(1, loss.ndim)))
-            else:
-                return F.npx.batch_flatten(loss).mean(axis=1)
-        else:
-            return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class SigmoidBinaryCrossEntropyLoss(Loss):
@@ -262,7 +272,6 @@ def __init__(self, from_sigmoid=False, weight=None, batch_axis=0, **kwargs):
         self._from_sigmoid = from_sigmoid
 
     def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None):
-        label = _reshape_like(F, label, pred)
         if is_np_array():
             relu_fn = F.npx.relu
             act_fn = F.npx.activation
@@ -275,6 +284,7 @@ def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None):
             abs_fn = F.abs
             mul_fn = F.broadcast_mul
             log_fn = F.log
+        label = _reshape_like(F, label, pred)
         if not self._from_sigmoid:
             if pos_weight is None:
                 # We use the stable formula: max(x, 0) - x * z + log(1 + exp(-abs(x)))
@@ -295,13 +305,7 @@ def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None):
                 loss = -(mul_fn(log_fn(pred + eps) * label, pos_weight)
                          + log_fn(1. - pred + eps) * (1. - label))
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        if is_np_array():
-            if F is ndarray:
-                return F.np.mean(loss, axis=tuple(range(1, loss.ndim)))
-            else:
-                return F.npx.batch_flatten(loss).mean(axis=1)
-        else:
-            return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 SigmoidBCELoss = SigmoidBinaryCrossEntropyLoss
@@ -379,26 +383,20 @@ def __init__(self, axis=-1, sparse_label=True, from_logits=False, weight=None,
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
         if is_np_array():
-            log_softmax = F.npx.log_softmax
-            pick = F.npx.pick
+            log_softmax_fn = F.npx.log_softmax
+            pick_fn = F.npx.pick
         else:
-            log_softmax = F.log_softmax
-            pick = F.pick
+            log_softmax_fn = F.log_softmax
+            pick_fn = F.pick
         if not self._from_logits:
-            pred = log_softmax(pred, self._axis)
+            pred = log_softmax_fn(pred, self._axis)
         if self._sparse_label:
-            loss = -pick(pred, label, axis=self._axis, keepdims=True)
+            loss = -pick_fn(pred, label, axis=self._axis, keepdims=True)
         else:
             label = _reshape_like(F, label, pred)
             loss = -(pred * label).sum(axis=self._axis, keepdims=True)
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        if is_np_array():
-            if F is ndarray:
-                return loss.mean(axis=tuple(range(1, loss.ndim)))
-            else:
-                return F.npx.batch_flatten(loss).mean(axis=1)
-        else:
-            return loss.mean(axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 SoftmaxCELoss = SoftmaxCrossEntropyLoss
@@ -472,11 +470,17 @@ def __init__(self, from_logits=True, axis=-1, weight=None, batch_axis=0,
         self._axis = axis
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
+        if is_np_array():
+            log_softmax_fn = F.npx.log_softmax
+            log_fn = F.np.log
+        else:
+            log_softmax_fn = F.log_softmax
+            log_fn = F.log
         if not self._from_logits:
-            pred = F.log_softmax(pred, self._axis)
-        loss = label * (F.log(label + 1e-12) - pred)
+            pred = log_softmax_fn(pred, self._axis)
+        loss = label * (log_fn(label + 1e-12) - pred)
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return F.sum(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class CTCLoss(Loss):
@@ -549,14 +553,20 @@ def __init__(self, layout='NTC', label_layout='NT', weight=None, **kwargs):
 
     def hybrid_forward(self, F, pred, label,
                        pred_lengths=None, label_lengths=None, sample_weight=None):
+        if is_np_array():
+            swapaxes_fn = F.np.swapaxes
+            ctc_fn = F.npx.ctc_loss
+        else:
+            swapaxes_fn = F.swapaxes
+            ctc_fn = F.ctc_loss
         if self._layout == 'NTC':
-            pred = F.swapaxes(pred, 0, 1)
+            pred = swapaxes_fn(pred, 0, 1)
         if self._batch_axis == 1:
-            label = F.swapaxes(label, 0, 1)
-        loss = F.CTCLoss(pred, label, pred_lengths, label_lengths,
-                         use_data_lengths=pred_lengths is not None,
-                         use_label_lengths=label_lengths is not None,
-                         blank_label='last')
+            label = swapaxes_fn(label, 0, 1)
+        loss = ctc_fn(pred, label, pred_lengths, label_lengths,
+                      use_data_lengths=pred_lengths is not None,
+                      use_label_lengths=label_lengths is not None,
+                      blank_label='last')
         return _apply_weighting(F, loss, self._weight, sample_weight)
 
 
@@ -602,12 +612,20 @@ def __init__(self, rho=1, weight=None, batch_axis=0, **kwargs):
         self._rho = rho
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
+        if is_np_array():
+            abs_fn = F.np.abs
+            where_fn = F.np.where
+            square_fn = F.np.square
+        else:
+            abs_fn = F.abs
+            where_fn = F.where
+            square_fn = F.square
         label = _reshape_like(F, label, pred)
-        loss = F.abs(label - pred)
-        loss = F.where(loss > self._rho, loss - 0.5 * self._rho,
-                       (0.5 / self._rho) * F.square(loss))
+        loss = abs_fn(label - pred)
+        loss = where_fn(loss > self._rho, loss - 0.5 * self._rho,
+                        (0.5 / self._rho) * square_fn(loss))
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class HingeLoss(Loss):
@@ -649,10 +667,11 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs):
         self._margin = margin
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
+        relu_fn = F.npx.relu if is_np_array() else F.relu
         label = _reshape_like(F, label, pred)
-        loss = F.relu(self._margin - pred * label)
+        loss = relu_fn(self._margin - pred * label)
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class SquaredHingeLoss(Loss):
@@ -694,10 +713,16 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs):
         self._margin = margin
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
+        if is_np_array():
+            relu_fn = F.npx.relu
+            square_fn = F.np.square
+        else:
+            relu_fn = F.relu
+            square_fn = F.square
         label = _reshape_like(F, label, pred)
-        loss = F.square(F.relu(self._margin - pred * label))
+        loss = square_fn(relu_fn(self._margin - pred * label))
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class LogisticLoss(Loss):
@@ -743,14 +768,22 @@ def __init__(self, weight=None, batch_axis=0, label_format='signed', **kwargs):
                              % label_format)
 
     def hybrid_forward(self, F, pred, label, sample_weight=None):
+        if is_np_array():
+            relu_fn = F.npx.relu
+            act_fn = F.npx.activation
+            abs_fn = F.np.abs
+        else:
+            relu_fn = F.relu
+            act_fn = F.Activation
+            abs_fn = F.abs
         label = _reshape_like(F, label, pred)
         if self._label_format == 'signed':
             label = (label + 1.0) / 2.0  # Transform label to be either 0 or 1
         # Use a stable formula in computation
-        loss = F.relu(pred) - pred * label + \
-            F.Activation(-F.abs(pred), act_type='softrelu')
+        loss = relu_fn(pred) - pred * label + \
+            act_fn(-abs_fn(pred), act_type='softrelu')
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class TripletLoss(Loss):
@@ -790,13 +823,18 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs):
         super(TripletLoss, self).__init__(weight, batch_axis, **kwargs)
         self._margin = margin
 
-    def hybrid_forward(self, F, pred, positive, negative):
+    def hybrid_forward(self, F, pred, positive, negative, sample_weight=None):
+        if is_np_array():
+            relu_fn = F.npx.relu
+            square_fn = F.np.square
+        else:
+            relu_fn = F.relu
+            square_fn = F.square
         positive = _reshape_like(F, positive, pred)
         negative = _reshape_like(F, negative, pred)
-        loss = F.sum(F.square(positive - pred) - F.square(negative - pred),
-                     axis=self._batch_axis, exclude=True)
-        loss = F.relu(loss + self._margin)
-        return _apply_weighting(F, loss, self._weight, None)
+        loss = _batch_sum(F, square_fn(positive - pred) - square_fn(negative - pred), self._batch_axis)
+        loss = relu_fn(loss + self._margin)
+        return _apply_weighting(F, loss, self._weight, sample_weight)
 
 
 class PoissonNLLLoss(Loss):
@@ -845,20 +883,26 @@ def __init__(self, weight=None, from_logits=True, batch_axis=0, compute_full=Fal
         self._compute_full = compute_full
 
     def hybrid_forward(self, F, pred, target, sample_weight=None, epsilon=1e-08):
+        if is_np_array():
+            exp_fn = F.np.exp
+            log_fn = F.np.log
+        else:
+            exp_fn = F.exp
+            log_fn = F.log
         target = _reshape_like(F, target, pred)
         if self._from_logits:
-            loss = F.exp(pred) - target * pred
+            loss = exp_fn(pred) - target * pred
         else:
-            loss = pred - target * F.log(pred + epsilon)
+            loss = pred - target * log_fn(pred + epsilon)
         if self._compute_full:
             # Using numpy's pi value
             stirling_factor = target * \
-                F.log(target) - target + 0.5 * F.log(2 * target * np.pi)
+                log_fn(target) - target + 0.5 * log_fn(2 * target * np.pi)
             target_gt_1 = target > 1
             stirling_factor *= target_gt_1
             loss += stirling_factor
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return F.mean(loss)
+        return _batch_mean(F, loss, self._batch_axis)
 
 
 class CosineEmbeddingLoss(Loss):
@@ -902,33 +946,42 @@ def __init__(self, weight=None, batch_axis=0, margin=0, **kwargs):
         self._margin = margin
 
     def hybrid_forward(self, F, input1, input2, label, sample_weight=None):
+        if is_np_array():
+            where_fn = F.np.where
+            clip_fn = F.np.clip
+        else:
+            where_fn = F.where
+            clip_fn = F.clip
+
         input1 = _reshape_like(F, input1, input2)
-        label = label.reshape((-1, 1))
         cos_sim = self._cosine_similarity(F, input1, input2)
-        y_1 = label == 1
-        y_minus_1 = label == -1
-        cos_sim_a = (1 - cos_sim) * y_1
+        label = _reshape_like(F, label, cos_sim)
+        loss = where_fn(label == 1,
+                        1 - cos_sim,
+                        clip_fn(cos_sim - self._margin, 0, 1 - self._margin))
 
-        if F is ndarray:
-            z_array = F.array([0])
-        else:
-            z_array = F.zeros((1, 1))
-        cos_sim_b = F.broadcast_maximum(
-            z_array, y_minus_1 * (cos_sim - self._margin), axis=1)
-        loss = cos_sim_a + cos_sim_b
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return loss
+        return _batch_mean(F, loss, self._batch_axis)
 
     def _cosine_similarity(self, F, x, y, axis=-1):
-        # Calculates the cosine similarity between 2 vectors
-        x_norm = F.norm(x, axis=axis).reshape((-1, 1))
-        y_norm = F.norm(y, axis=axis).reshape((-1, 1))
-        x_dot_y = F.sum(x * y, axis=axis).reshape((-1, 1))
-        if F is ndarray:
-            eps_arr = F.array([1e-12])
+        if is_np_array():
+            reshape_fn = F.npx.reshape
+            norm_fn = F.npx.norm
+            sum_fn = F.np.sum
+            full_fn = F.np.full
+            max_fn = F.np.maximum
         else:
-            eps_arr = F.full((1, 1), 1e-12)
-        return (x_dot_y / F.broadcast_maximum(x_norm * y_norm, eps_arr))
+            reshape_fn = F.reshape
+            norm_fn = F.norm
+            sum_fn = F.sum
+            full_fn = F.full
+            max_fn = F.broadcast_maximum
+        # Calculates the cosine similarity between 2 vectors
+        x_norm = reshape_fn(norm_fn(x, axis=axis), (-1, 1))
+        y_norm = reshape_fn(norm_fn(y, axis=axis), (-1, 1))
+        x_dot_y = reshape_fn(sum_fn(x * y, axis=axis), (-1, 1))
+        eps_arr = full_fn((1, 1), 1e-12)
+        return (x_dot_y / max_fn(x_norm * y_norm, eps_arr))
 
 
 class SDMLLoss(Loss):
@@ -972,18 +1025,24 @@ def __init__(self, smoothing_parameter=0.3, weight=1., batch_axis=0, **kwargs):
         self.kl_loss = KLDivLoss(from_logits=True)
         self.smoothing_parameter = smoothing_parameter  # Smoothing probability mass
 
-    def _compute_distances(self, x1, x2):
+    def _compute_distances(self, F, x1, x2):
         """
         This function computes the euclidean distance between every vector
        in the two batches in input.
         """
+        if is_np_array():
+            expand_dims_fn = F.np.expand_dims
+            broadcast_to_fn = F.np.broadcast_to
+        else:
+            expand_dims_fn = F.expand_dims
+            broadcast_to_fn = F.broadcast_to
 
         # extracting sizes expecting [batch_size, dim]
         assert x1.shape == x2.shape
         batch_size, dim = x1.shape
         # expanding both tensor form [batch_size, dim] to [batch_size, batch_size, dim]
-        x1_ = x1.expand_dims(1).broadcast_to([batch_size, batch_size, dim])
-        x2_ = x2.expand_dims(0).broadcast_to([batch_size, batch_size, dim])
+        x1_ = broadcast_to_fn(expand_dims_fn(x1, 1), [batch_size, batch_size, dim])
+        x2_ = broadcast_to_fn(expand_dims_fn(x2, 0), [batch_size, batch_size, dim])
         # pointwise squared differences
         squared_diffs = (x1_ - x2_)**2
         # sum of squared differences distance
@@ -1015,7 +1074,7 @@ def _compute_labels(self, F, batch_size):
 
         return labels
 
-    def _loss(self, F, x1, x2):
+    def hybrid_forward(self, F, x1, x2):
         """
         the function computes the kl divergence between the negative distances
         (internally it compute a softmax casting into probabilities) and the
@@ -1033,15 +1092,15 @@ def _loss(self, F, x1, x2):
         learn to predict french president comparing it with all the other
         vectors in batch 2
         """
+        if is_np_array():
+            log_softmax_fn = F.npx.log_softmax
+        else:
+            log_softmax_fn = F.log_softmax
         batch_size = x1.shape[0]
         labels = self._compute_labels(F, batch_size)
-        distances = self._compute_distances(x1, x2)
-        log_probabilities = F.log_softmax(-distances, axis=1)
+        distances = self._compute_distances(F, x1, x2)
+        log_probabilities = log_softmax_fn(-distances, axis=1)
         # multiply for the number of labels to obtain the correct loss (gluon kl_loss averages instead of sum)
         # PR#18423:multiply for the number of labels should multiply x1.shape[1] rather than x1.shape[0])
         # After PR#18423, it is no need to multiply it anymore.
         return self.kl_loss(log_probabilities, labels.as_in_context(distances.context))
-
-
-    def hybrid_forward(self, F, x1, x2):
-        return self._loss(F, x1, x2)
diff --git a/src/operator/nn/ctc_loss.cc b/src/operator/nn/ctc_loss.cc
index 096ef8c0d7b4..59f89f0576f3 100644
--- a/src/operator/nn/ctc_loss.cc
+++ b/src/operator/nn/ctc_loss.cc
@@ -50,6 +50,7 @@ DMLC_REGISTER_PARAMETER(CTCLossOpParam);
 
 NNVM_REGISTER_OP(CTCLoss)
 .add_alias("ctc_loss")
+.add_alias("_npx_ctc_loss")
 .add_alias("_contrib_CTCLoss")
 .add_alias("_contrib_ctc_loss")
 .describe(R"code(Connectionist Temporal Classification Loss.
diff --git a/src/operator/nn/ctc_loss.cu b/src/operator/nn/ctc_loss.cu
index a4491bf6986e..c6952d33c86a 100644
--- a/src/operator/nn/ctc_loss.cu
+++ b/src/operator/nn/ctc_loss.cu
@@ -51,6 +51,7 @@ namespace op {
 
 NNVM_REGISTER_OP(CTCLoss)
 .add_alias("ctc_loss")
+.add_alias("_npx_ctc_loss")
 .add_alias("_contrib_ctc_loss")
 .add_alias("_contrib_CTCLoss")
 .set_attr<FCompute>("FCompute<gpu>", CTCLossOpForward<gpu>);
diff --git a/src/operator/tensor/broadcast_reduce_norm_value.cc b/src/operator/tensor/broadcast_reduce_norm_value.cc
index 557c4d9e7746..43a22b2a8314 100644
--- a/src/operator/tensor/broadcast_reduce_norm_value.cc
+++ b/src/operator/tensor/broadcast_reduce_norm_value.cc
@@ -87,6 +87,7 @@ Examples::
   norm(csr) = [5.47722578]
 
 )code" ADD_FILELINE)
+.add_alias("_npx_norm")
 .set_num_inputs(1)
 .set_num_outputs(1)
 .set_attr_parser(ParamParser<NormParam>)
diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 278ea02fce98..27325dbcf3c1 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -33,6 +33,7 @@
 from common import setup_module, with_seed, teardown_module, assert_raises_cudnn_not_satisfied, run_in_spawned_process
 from test_gluon import *
 from test_loss import *
+from test_numpy_loss import *
 from test_gluon_rnn import *
 
 set_default_context(mx.gpu(0))
diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py
index 9c1d496d715e..80d8cde5050f 100644
--- a/tests/python/unittest/test_loss.py
+++ b/tests/python/unittest/test_loss.py
@@ -56,16 +56,6 @@ def test_loss_ndarray():
     assert_almost_equal(L, np.array([ 1.06346405, 0.04858733]), rtol=1e-3, atol=1e-4)
 
 
-def get_net(num_hidden, flatten=True):
-    data = mx.symbol.Variable('data')
-    fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128, flatten=flatten)
-    act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu")
-    fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64, flatten=flatten)
-    act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu")
-    fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten)
-    return fc3
-
-
 @with_seed()
 def test_bce_equal_ce2():
     N = 100
@@ -163,7 +153,7 @@ def test_cosine_loss():
     denominator = mx.nd.sqrt(mx.nd.sum(input1**2, axis=1, keepdims=True)) \
     * mx.nd.sqrt(mx.nd.sum(input2**2, axis=1, keepdims=True))
     numpy_loss = mx.nd.where(label == 1, 1-numerator/denominator, \
-    mx.nd.broadcast_maximum(mx.nd.array([0]), numerator/denominator, axis=1))
+    mx.nd.broadcast_maximum(mx.nd.array([0]), numerator/denominator, axis=1)).reshape((-1,))
     assert_almost_equal(loss.asnumpy(), numpy_loss.asnumpy(), rtol=1e-3, atol=1e-5)
 
 @xfail_when_nonstandard_decimal_separator
 @with_seed()
 def test_poisson_nllloss():
     #Calculating by brute formula for default value of from_logits = True
 
     # 1) Testing for flag logits = True
-    brute_loss = np.mean(np.exp(pred.asnumpy()) - target.asnumpy() * pred.asnumpy())
+    brute_loss = np.mean(np.exp(pred.asnumpy()) - target.asnumpy() * pred.asnumpy(), axis=1)
     loss_withlogits = Loss(pred, target)
-    assert_almost_equal(brute_loss, loss_withlogits.asscalar())
+    assert_almost_equal(brute_loss, loss_withlogits)
 
     #2) Testing for flag logits = False
     loss_no_logits = Loss_no_logits(pred, target)
-    np_loss_no_logits = np.mean(pred.asnumpy() - target.asnumpy() * np.log(pred.asnumpy() + 1e-08))
-    if np.isnan(loss_no_logits.asscalar()):
-        assert_almost_equal(np.isnan(np_loss_no_logits), np.isnan(loss_no_logits.asscalar()))
-    else:
-        assert_almost_equal(np_loss_no_logits, loss_no_logits.asscalar())
+    np_loss_no_logits = np.mean(pred.asnumpy() - target.asnumpy() * np.log(pred.asnumpy() + 1e-08),
+                                axis=1)
+    assert_almost_equal(np_loss_no_logits, loss_no_logits.asnumpy())
 
     #3) Testing for Sterling approximation
     shape=(2, 3)
     np_pred = np.random.uniform(1, 5, shape)
     np_target = np.random.uniform(1, 5, shape)
     np_compute_full = np.mean((np_pred - np_target * np.log(np_pred + 1e-08)) + ((np_target * np.log(np_target)-\
-                        np_target + 0.5 * np.log(2 * np_target * np.pi))*(np_target > 1)))
+                        np_target + 0.5 * np.log(2 * np_target * np.pi))*(np_target > 1)), axis=1)
     Loss_compute_full = gluon.loss.PoissonNLLLoss(from_logits=False, compute_full=True)
     loss_compute_full = Loss_compute_full(mx.nd.array(np_pred), mx.nd.array(np_target))
-    assert_almost_equal(np_compute_full, loss_compute_full.asscalar())
+    assert_almost_equal(np_compute_full, loss_compute_full)
diff --git a/tests/python/unittest/test_numpy_loss.py b/tests/python/unittest/test_numpy_loss.py
new file mode 100644
index 000000000000..6c63546f85b1
--- /dev/null
+++ b/tests/python/unittest/test_numpy_loss.py
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+import numpy as np
+from mxnet import gluon, autograd
+from mxnet.test_utils import assert_almost_equal, default_context, use_np
+from common import setup_module, with_seed, teardown_module, xfail_when_nonstandard_decimal_separator
+import unittest
+
+
+@xfail_when_nonstandard_decimal_separator
+@with_seed()
+@use_np
+def test_loss_np_ndarray():
+    output = mx.np.array([1, 2, 3, 4])
+    label = mx.np.array([1, 3, 5, 7])
+    weighting = mx.np.array([0.5, 1, 0.5, 1])
+
+    loss = gluon.loss.L1Loss()
+    assert mx.np.sum(loss(output, label)) == 6.
+    loss = gluon.loss.L1Loss(weight=0.5)
+    assert mx.np.sum(loss(output, label)) == 3.
+    loss = gluon.loss.L1Loss()
+    assert mx.np.sum(loss(output, label, weighting)) == 5.
+
+    loss = gluon.loss.L2Loss()
+    assert mx.np.sum(loss(output, label)) == 7.
+    loss = gluon.loss.L2Loss(weight=0.25)
+    assert mx.np.sum(loss(output, label)) == 1.75
+    loss = gluon.loss.L2Loss()
+    assert mx.np.sum(loss(output, label, weighting)) == 6
+
+    loss = gluon.loss.HuberLoss()
+    assert mx.np.sum(loss(output, label)) == 4.5
+    loss = gluon.loss.HuberLoss(weight=0.25)
+    assert mx.np.sum(loss(output, label)) == 1.125
+    loss = gluon.loss.HuberLoss()
+    assert mx.np.sum(loss(output, label, weighting)) == 3.75
+
+    loss = gluon.loss.HingeLoss(margin=10)
+    assert mx.np.sum(loss(output, label)) == 13.
+    loss = gluon.loss.HingeLoss(margin=8, weight=0.25)
+    assert mx.np.sum(loss(output, label)) == 2.25
+    loss = gluon.loss.HingeLoss(margin=7)
+    assert mx.np.sum(loss(output, label, weighting)) == 4.
+
+    loss = gluon.loss.SquaredHingeLoss(margin=10)
+    assert mx.np.sum(loss(output, label)) == 97.
+    loss = gluon.loss.SquaredHingeLoss(margin=8, weight=0.25)
+    assert mx.np.sum(loss(output, label)) == 13.25
+    loss = gluon.loss.SquaredHingeLoss(margin=7)
+    assert mx.np.sum(loss(output, label, weighting)) == 19.
+
+    loss = gluon.loss.TripletLoss(margin=10)
+    assert mx.np.sum(loss(output, label, -label)) == 6.
+    loss = gluon.loss.TripletLoss(margin=8, weight=0.25)
+    assert mx.np.sum(loss(output, label, -label)) == 1.
+    loss = gluon.loss.TripletLoss(margin=7)
+    assert mx.np.sum(loss(output, label, -label, weighting)) == 1.5
+
+    output = mx.np.array([[0, 2], [1, 4]])
+    label = mx.np.array([0, 1])
+    weighting = mx.np.array([[0.5], [1.0]])
+
+    loss = gluon.loss.SoftmaxCrossEntropyLoss()
+    L = loss(output, label).asnumpy()
+    assert_almost_equal(L, np.array([ 2.12692809, 0.04858733]), rtol=1e-3, atol=1e-4)
+
+    L = loss(output, label, weighting).asnumpy()
+    assert_almost_equal(L, np.array([ 1.06346405, 0.04858733]), rtol=1e-3, atol=1e-4)
+
+
+@with_seed()
+@use_np
+def test_bce_equal_ce2():
+    N = 100
+    loss1 = gluon.loss.SigmoidBCELoss(from_sigmoid=True)
+    loss2 = gluon.loss.SoftmaxCELoss(from_logits=True)
+    out1 = mx.np.random.uniform(0.1, 0.9, size=(N, 1))
+    out2 = mx.np.log(mx.np.concatenate((1-out1, out1), axis=1) + 1e-8)
+    label = mx.np.round(mx.np.random.uniform(0, 1, size=(N, 1)))
+    assert_almost_equal(loss1(out1, label).asnumpy(), loss2(out2, label).asnumpy())
+
+@use_np
+def test_logistic_loss_equal_bce():
+    N = 100
+    loss_binary = gluon.loss.LogisticLoss(label_format='binary')
+    loss_signed = gluon.loss.LogisticLoss(label_format='signed')
+    loss_bce = gluon.loss.SigmoidBCELoss(from_sigmoid=False)
+    data = mx.np.random.uniform(-10, 10, size=(N, 1))
+    label = mx.np.round(mx.np.random.uniform(0, 1, size=(N, 1)))
+    assert_almost_equal(loss_binary(data, label), loss_bce(data, label), atol=1e-6)
+    assert_almost_equal(loss_signed(data, 2 * label - 1), loss_bce(data, label), atol=1e-6)
+
+
+@with_seed()
+@use_np
+def test_ctc_loss():
+    loss = gluon.loss.CTCLoss()
+    l = loss(mx.np.ones((2,20,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]]))
+    assert_almost_equal(l, np.array([18.82820702, 16.50581741]))
+
+    loss = gluon.loss.CTCLoss(layout='TNC')
+    l = loss(mx.np.ones((20,2,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]]))
+    assert_almost_equal(l, np.array([18.82820702, 16.50581741]))
+
+    loss = gluon.loss.CTCLoss(layout='TNC', label_layout='TN')
+    l = loss(mx.np.ones((20,2,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]]).T)
+    assert_almost_equal(l, np.array([18.82820702, 16.50581741]))
+
+    loss = gluon.loss.CTCLoss()
+    l = loss(mx.np.ones((2,20,4)), mx.np.array([[2,1,2,2],[3,2,2,2]]), None, mx.np.array([2,3]))
+    assert_almost_equal(l, np.array([18.82820702, 16.50581741]))
+
+    loss = gluon.loss.CTCLoss()
+    l = loss(mx.np.ones((2,25,4)), mx.np.array([[2,1,-1,-1],[3,2,2,-1]]), mx.np.array([20,20]))
+    assert_almost_equal(l, np.array([18.82820702, 16.50581741]))
+
+    loss = gluon.loss.CTCLoss()
+    l = loss(mx.np.ones((2,25,4)), mx.np.array([[2,1,3,3],[3,2,2,3]]), mx.np.array([20,20]), mx.np.array([2,3]))
+    assert_almost_equal(l, np.array([18.82820702, 16.50581741]))
+
+
+@xfail_when_nonstandard_decimal_separator
+@with_seed()
+@use_np
+def test_sdml_loss():
+
+    N = 5 # number of samples
+    DIM = 10 # Dimensionality
+    EPOCHS = 20
+
+    # Generate randomized data and 'positive' samples
+    data = mx.np.random.uniform(-1, 1, size=(N, DIM))
+    pos = data + mx.np.random.uniform(-0.1, 0.1, size=(N, DIM)) # correlated paired data
+    data_iter = mx.io.NDArrayIter({'data' : data, 'pos' : pos}, batch_size=N)
+
+    # Init model and trainer
+    sdml_loss = gluon.loss.SDMLLoss()
+    model = gluon.nn.Dense(DIM, activation='tanh') # Simple NN encoder
+    model.initialize(mx.init.Xavier(), ctx=mx.current_context())
+    trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate' : 0.1})
+
+    for i in range(EPOCHS): # Training loop
+        data_iter.reset()
+        for iter_batch in data_iter:
+            batch = [datum.as_in_ctx(mx.current_context()).as_np_ndarray() for datum in iter_batch.data]
+            with autograd.record():
+                data, pos = batch
+                z_data, z_pos = model(data), model(pos)
+                loss = sdml_loss(z_data, z_pos)
+                loss.backward()
+            trainer.step(1)
+
+    # After training euclidean distance between aligned pairs should be lower than all non-aligned pairs
+    avg_loss = loss.sum()/len(loss)
+    assert(avg_loss < 0.05)
+
+@with_seed()
+@use_np
+def test_cosine_loss():
+    #Generating samples
+    input1 = mx.np.random.randn(3, 2)
+    input2 = mx.np.random.randn(3, 2)
+    label = mx.np.sign(mx.np.random.randn(input1.shape[0]))
+    #Calculating loss from cosine embedding loss function in Gluon
+    Loss = gluon.loss.CosineEmbeddingLoss()
+    loss = Loss(input1, input2, label)
+
+    # Calculating the loss Numpy way
+    numerator = mx.np.sum(input1 * input2, keepdims=True, axis=1)
+    denominator = mx.np.sqrt(mx.np.sum(input1**2, axis=1, keepdims=True)) \
+        * mx.np.sqrt(mx.np.sum(input2**2, axis=1, keepdims=True))
+    x = numerator/denominator
+    label = mx.npx.reshape(label, (-1, 1))
+    numpy_loss = mx.npx.reshape(
+        mx.np.where(label == 1, 1-x, mx.npx.relu(x)), (-1,))
+    assert_almost_equal(loss.asnumpy(), numpy_loss.asnumpy(), rtol=1e-3, atol=1e-5)
+
+@xfail_when_nonstandard_decimal_separator
+@use_np
+def test_poisson_nllloss():
+    shape=(3, 4)
+    not_axis0 = tuple(range(1, len(shape)))
+    pred = mx.np.random.normal(size=shape)
+    min_pred = mx.np.min(pred)
+    #This is necessary to ensure only positive random values are generated for prediction,
+    # to avoid ivalid log calculation
+    pred[:] = pred + mx.np.abs(min_pred)
+    target = mx.np.random.normal(size=shape)
+    min_target = mx.np.min(target)
+    #This is necessary to ensure only positive random values are generated for prediction,
+    # to avoid ivalid log calculation
+    target[:] += mx.np.abs(min_target)
+
+    Loss = gluon.loss.PoissonNLLLoss(from_logits=True)
+    Loss_no_logits = gluon.loss.PoissonNLLLoss(from_logits=False)
+    #Calculating by brute formula for default value of from_logits = True
+
+    # 1) Testing for flag logits = True
+    brute_loss = mx.np.mean(mx.np.exp(pred) - target * pred, axis=1)
+    loss_withlogits = Loss(pred, target)
+    assert_almost_equal(brute_loss, loss_withlogits)
+
+    #2) Testing for flag logits = False
+    loss_no_logits = Loss_no_logits(pred, target)
+    np_loss_no_logits = mx.np.mean(pred - target * mx.np.log(pred + 1e-08),
+                                   axis=1)
+    assert_almost_equal(np_loss_no_logits, loss_no_logits)
+
+    #3) Testing for Sterling approximation
+    shape=(2, 3)
+    np_pred = mx.np.random.uniform(1, 5, shape)
+    np_target = mx.np.random.uniform(1, 5, shape)
+    np_compute_full = mx.np.mean((np_pred - np_target * mx.np.log(np_pred + 1e-08)) + ((np_target * np.log(np_target)-\
+                        np_target + 0.5 * np.log(2 * np_target * np.pi))*(np_target > 1)), axis=1)
+    Loss_compute_full = gluon.loss.PoissonNLLLoss(from_logits=False, compute_full=True)
+    loss_compute_full = Loss_compute_full(np_pred, np_target)
+    assert_almost_equal(np_compute_full, loss_compute_full)
+
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index ed9886ea8f75..76181e148950 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -852,7 +852,23 @@ def _test_np_exception(func, shape, dim):
 
 @with_seed()
 @use_np
-def test_np_average():
+@pytest.mark.parametrize('a_shape,w_shape,axes', [
+    ((3, 5), (3, 5), None),
+    ((4, 5, 6), (4, 5, 6), (0, 2)),
+    ((3,), (3,), 0),
+    ((2, 3), (3,), 1),
+    ((2, 3, 4), (2,), 0),
+    ((2, 3, 4), (3,), 1),
+    ((2, 3, 4), (4,), -1),
+    ((2, 3, 4, 5), (5,), 3)
+])
+@pytest.mark.parametrize('dtype', ['float32', 'float64'])
+@pytest.mark.parametrize('hybridize', [True, False])
+@pytest.mark.parametrize('is_weighted', [True, False])
+@pytest.mark.parametrize('returned', [True, False])
+@pytest.mark.parametrize('req_a', ['null', 'add', 'write'])
+def test_np_average(a_shape, w_shape, axes, is_weighted, req_a,
+                    hybridize, returned, dtype):
     class TestAverage(HybridBlock):
         def __init__(self, axis=None, returned=False):
             super(TestAverage, self).__init__()
@@ -894,73 +910,57 @@ def avg_backward(a, w, avg, axes, init_a_grad=None, init_w_grad=None):
             w_grad += init_w_grad.asnumpy()
         return [a_grad, w_grad]
 
-    tensor_shapes = [
-        ((3, 5), (3, 5), None),  # (a_shape, w_shape, axes)
-        ((4, 5, 6), (4, 5, 6), (0, 2)),
-        ((3,), (3,), 0),
-        ((2, 3), (3,), 1),
-        ((2, 3, 4), (2,), 0),
-        ((2, 3, 4), (3,), 1),
-        ((2, 3, 4), (4,), -1),
-        ((2, 3, 4, 5), (5,), 3)
-    ]
-
-    flags = [True, False]
-    dtypes = ['float32', 'float64']
-    reqs = ['null', 'add', 'write']
-    for hybridize, returned, (a_shape, w_shape, axes), dtype, is_weighted, req_a in \
-        itertools.product(flags, flags, tensor_shapes, dtypes, flags, reqs):
-        if req_a == 'null' and not is_weighted:
-            continue
-        rtol, atol = 1e-3, 1e-4
-        test_average = TestAverage(axes, returned)
-        if hybridize:
-            test_average.hybridize()
-        a = np.random.uniform(-1.0, 1.0, size=a_shape, dtype=dtype)
-        a.attach_grad(req_a)
-        init_a_grad = np.random.uniform(-1.0, 1.0, size=a_shape, dtype=dtype) if req_a == 'add' else None
-        init_w_grad = None
-        req_w = req_a
-        w, np_w = None, None
-        if is_weighted:
-            w = np.random.uniform(-1.0, 1.0, size=w_shape, dtype=dtype)
-            if req_a == 'null':
-                req_w = random.choice(['add', 'write'])
-            w.attach_grad(req_w)
-            if req_w == 'add':
-                init_w_grad = np.random.uniform(-1.0, 1.0, size=w_shape, dtype=dtype)
-            np_w = w.asnumpy()
-        np_out = _np.average(a.asnumpy(), axis=axes, weights=np_w, returned=returned)
-        with mx.autograd.record():
-            mx_out = test_average(a, w)
-        if returned:
-            np_out, np_sum_of_weights = np_out
-            mx_out, mx_sum_of_weights = mx_out
-            assert_almost_equal(mx_sum_of_weights.asnumpy(), np_sum_of_weights, rtol=rtol, atol=atol)
-        assert mx_out.shape == np_out.shape
-        assert_almost_equal(mx_out.asnumpy(), np_out.astype(dtype), rtol=rtol, atol=atol)
-        if req_a == 'add':
-            a.grad[:] = init_a_grad
-        if is_weighted and req_w == 'add':
-            w.grad[:] = init_w_grad
-        mx_out.backward()
-        # Code to get reference backward value
-        a_grad, w_grad = avg_backward(a.asnumpy(), np_w, np_out, axes, init_a_grad, init_w_grad)
-        if is_weighted:
-            assert_almost_equal(w.grad.asnumpy(), w_grad, rtol=rtol*10, atol=atol*10)
+    if req_a == 'null' and not is_weighted:
+        return
+    rtol, atol = 1e-3, 1e-4
+    test_average = TestAverage(axes, returned)
+    if hybridize:
+        test_average.hybridize()
+    a = np.random.uniform(-1.0, 1.0, size=a_shape, dtype=dtype)
+    a.attach_grad(req_a)
+    init_a_grad = np.random.uniform(-1.0, 1.0, size=a_shape, dtype=dtype) if req_a == 'add' else None
+    init_w_grad = None
+    req_w = req_a
+    w, np_w = None, None
+    if is_weighted:
+        w = np.random.uniform(-1.0, 1.0, size=w_shape, dtype=dtype)
         if req_a == 'null':
-            assert a.grad is None
-        else:
-            assert_almost_equal(a.grad.asnumpy(), a_grad, rtol=rtol, atol=atol)
+            req_w = random.choice(['add', 'write'])
+        w.attach_grad(req_w)
+        if req_w == 'add':
+            init_w_grad = np.random.uniform(-1.0, 1.0, size=w_shape, dtype=dtype)
+        np_w = w.asnumpy()
+    np_out = _np.average(a.asnumpy(), axis=axes, weights=np_w, returned=returned)
+    with mx.autograd.record():
+        mx_out = test_average(a, w)
+    if returned:
+        np_out, np_sum_of_weights = np_out
+        mx_out, mx_sum_of_weights = mx_out
+        assert_almost_equal(mx_sum_of_weights.asnumpy(), np_sum_of_weights, rtol=rtol, atol=atol)
+    assert mx_out.shape == np_out.shape
+    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
+    if req_a == 'add':
+        a.grad[:] = init_a_grad
+    if is_weighted and req_w == 'add':
+        w.grad[:] = init_w_grad
+    mx_out.backward()
+    # Code to get reference backward value
+    a_grad, w_grad = avg_backward(a.asnumpy(), np_w, np_out, axes, init_a_grad, init_w_grad)
+    if is_weighted:
+        assert_almost_equal(w.grad.asnumpy(), w_grad, rtol=rtol*10, atol=atol*10)
+    if req_a == 'null':
+        assert a.grad is None
+    else:
+        assert_almost_equal(a.grad.asnumpy(), a_grad, rtol=rtol, atol=atol)
 
-        # Test imperative once again
-        np_out = _np.average(a.asnumpy(), weights=np_w, axis=axes, returned=returned)
-        mx_out = np.average(a, weights=w, axis=axes, returned=returned)
-        if returned:
-            np_out, np_sum_of_weights = np_out
-            mx_out, mx_sum_of_weights = mx_out
-            assert_almost_equal(mx_sum_of_weights.asnumpy(), np_sum_of_weights, rtol=rtol, atol=atol)
-        assert_almost_equal(mx_out.asnumpy(), np_out.astype(dtype), rtol=rtol, atol=atol)
+    # Test imperative once again
+    np_out = _np.average(a.asnumpy(), weights=np_w, axis=axes, returned=returned)
+    mx_out = np.average(a, weights=w, axis=axes, returned=returned)
+    if returned:
+        np_out, np_sum_of_weights = np_out
+        mx_out, mx_sum_of_weights = mx_out
+        assert_almost_equal(mx_sum_of_weights.asnumpy(), np_sum_of_weights, rtol=rtol, atol=atol)
+    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol)
 
 
 @with_seed()
@@ -1352,7 +1352,7 @@ def index_add_forward(a, ind, val, ind_ndim, ind_num):
         else:
             a[t_ind] += val
     return a
-    
+
 def index_add_bwd(out_grad, a_grad, ind, val_grad, ind_ndim, ind_num, grad_req_a, grad_req_val):
     if grad_req_a == 'add':
         init_a_grad = _np.array(a_grad)
@@ -9523,7 +9523,7 @@ def hybrid_forward(self, F, cond, x, y):
         same(ret.asnumpy(), _np.where(cond.asnumpy(), x.asnumpy(), 1))
         ret_rscalar.backward()
         same(x.grad.asnumpy(), collapse_sum_like(_np.broadcast_to(cond.asnumpy(), ret.shape), shape_pair[1]))
-    
+
         # check both scalar case
         x = _np.random.randint(0, 100)
         y = _np.random.randint(0, 100)
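For context (not part of the patch itself): a minimal sketch of what the converted losses enable, assuming an MXNet build that includes this change. It activates numpy semantics with `npx.set_np()` and evaluates a Gluon loss directly on `mx.np` arrays; the variable names are illustrative only.

```python
# Minimal sketch: Gluon losses on mx.np arrays after this change (assumes MXNet >= 1.6 numpy interface).
import mxnet as mx
from mxnet import gluon, npx

npx.set_np()  # switch Gluon/ops to numpy semantics so losses accept and return mx.np arrays

loss_fn = gluon.loss.L2Loss()
pred = mx.np.array([[1., 2.], [3., 4.]])
label = mx.np.array([[1., 3.], [5., 7.]])
# Returns one loss value per sample, averaged over the non-batch axes (see _batch_mean above).
print(loss_fn(pred, label))
```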