From 5c3870fb1a49fc0a9c181683a839c3100890bce2 Mon Sep 17 00:00:00 2001 From: Daksh Date: Sat, 19 Dec 2020 16:21:33 +0100 Subject: [PATCH 01/44] first version --- rasa/utils/tensorflow/layers.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index a9017094e945..aba25b46f86f 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -853,16 +853,33 @@ def _loss_softmax( ) -> tf.Tensor: """Define softmax loss.""" - logits = tf.concat( + softmax_logits = tf.concat( + [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 + ) + + sigmoid_logits = tf.concat( [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 ) # create label_ids for softmax - label_ids = tf.zeros_like(logits[..., 0], tf.int32) + softmax_label_ids = tf.zeros_like(softmax_logits[..., 0], tf.int32) + + sigmoid_label_ids = tf.concat( + [ + tf.expand_dims(tf.ones_like(sigmoid_logits[..., 0], tf.float32), -1), + tf.zeros_like(sigmoid_logits[..., 1:], tf.float32), + ], + axis=-1, + ) - loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=label_ids, logits=logits + softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=softmax_label_ids, logits=softmax_logits ) + sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=sigmoid_label_ids, logits=sigmoid_logits + ) + + loss = softmax_loss + tf.reduce_mean(sigmoid_loss, axis=-1) if self.scale_loss: # in case of cross entropy log_likelihood = -loss @@ -878,6 +895,14 @@ def _loss_softmax( else: loss = tf.reduce_mean(loss, axis=-1) + tf.print( + tf.reduce_mean(sim_pos), + tf.reduce_mean(sim_neg_ii), + tf.reduce_mean(sim_neg_il), + tf.reduce_mean(sim_neg_ll), + tf.reduce_mean(sim_neg_li), + ) + # average the loss over the batch return tf.reduce_mean(loss) From 8cff4ec91f953f465c42e5f634d07eb3247c5ea6 Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 21 Dec 2020 16:23:41 +0100 Subject: [PATCH 02/44] remove extra terms from softmax --- rasa/utils/tensorflow/layers.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index aba25b46f86f..07f16851edde 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -853,9 +853,7 @@ def _loss_softmax( ) -> tf.Tensor: """Define softmax loss.""" - softmax_logits = tf.concat( - [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 - ) + softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1) sigmoid_logits = tf.concat( [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 @@ -864,7 +862,7 @@ def _loss_softmax( # create label_ids for softmax softmax_label_ids = tf.zeros_like(softmax_logits[..., 0], tf.int32) - sigmoid_label_ids = tf.concat( + sigmoid_labels = tf.concat( [ tf.expand_dims(tf.ones_like(sigmoid_logits[..., 0], tf.float32), -1), tf.zeros_like(sigmoid_logits[..., 1:], tf.float32), @@ -876,7 +874,7 @@ def _loss_softmax( labels=softmax_label_ids, logits=softmax_logits ) sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits( - labels=sigmoid_label_ids, logits=sigmoid_logits + labels=sigmoid_labels, logits=sigmoid_logits ) loss = softmax_loss + tf.reduce_mean(sigmoid_loss, axis=-1) From 7d971d8084dd67ee024d02069efae59c5d15a031 Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 6 Jan 2021 15:42:00 +0100 Subject: [PATCH 03/44] refactor based on config option. 
Ready for test --- rasa/core/policies/ted_policy.py | 4 ++ rasa/nlu/classifiers/diet_classifier.py | 4 ++ rasa/nlu/selectors/response_selector.py | 4 ++ rasa/utils/tensorflow/constants.py | 1 + rasa/utils/tensorflow/layers.py | 62 ++++++++++++++++--------- rasa/utils/tensorflow/models.py | 2 + 6 files changed, 54 insertions(+), 23 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index f06530b26a54..29c3e1ac9dfc 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -99,6 +99,7 @@ HIDDEN_LAYERS_SIZES, FEATURIZERS, ENTITY_RECOGNITION, + CONSTRAIN_SIMILARITIES, ) from rasa.shared.core.events import EntitiesAdded, Event from rasa.shared.nlu.training_data.message import Message @@ -272,6 +273,9 @@ class TEDPolicy(Policy): FEATURIZERS: [], # If set to true, entities are predicted in user utterances. ENTITY_RECOGNITION: True, + # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to + # ensure that similarity values are approximately bounded. Used inside softmax loss only. + CONSTRAIN_SIMILARITIES: True, } @staticmethod diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index d1f26fec25fd..8bf50afefc81 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -95,6 +95,7 @@ SEQUENCE_LENGTH, DENSE_DIMENSION, MASK, + CONSTRAIN_SIMILARITIES, ) logger = logging.getLogger(__name__) @@ -252,6 +253,9 @@ def required_components(cls) -> List[Type[Component]]: # Split entities by comma, this makes sense e.g. for a list of ingredients # in a recipie, but it doesn't make sense for the parts of an address SPLIT_ENTITIES_BY_COMMA: True, + # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to + # ensure that similarity values are approximately bounded. Used inside softmax loss only. + CONSTRAIN_SIMILARITIES: True, } # init helpers diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index ac78b6d3964a..a1b325ba8600 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -73,6 +73,7 @@ FEATURIZERS, CHECKPOINT_MODEL, DENSE_DIMENSION, + CONSTRAIN_SIMILARITIES, ) from rasa.nlu.constants import ( RESPONSE_SELECTOR_PROPERTY_NAME, @@ -230,6 +231,9 @@ def required_components(cls) -> List[Type[Component]]: FEATURIZERS: [], # Perform model checkpointing CHECKPOINT_MODEL: False, + # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to + # ensure that similarity values are approximately bounded. Used inside softmax loss only. + CONSTRAIN_SIMILARITIES: True, } def __init__( diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index 29c046258dac..6595fbfa090e 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -38,6 +38,7 @@ DROP_RATE_ATTENTION = "drop_rate_attention" DROP_RATE_DIALOGUE = "drop_rate_dialogue" DROP_RATE_LABEL = "drop_rate_label" +CONSTRAIN_SIMILARITIES = "constrain_similarities" WEIGHT_SPARSITY = "weight_sparsity" diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 07f16851edde..fb1afc334165 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -566,6 +566,10 @@ class DotProductLoss(tf.keras.layers.Layer): to run in parallel. same_sampling: Boolean, if 'True' sample same negative labels for the whole batch. 
+ constrain_similarities: Boolean, if 'True' applies sigmoid on all + similarity terms and adds to the loss function to + ensure that similarity values are approximately bounded. + Used inside _loss_softmax() only. """ def __init__( @@ -580,6 +584,7 @@ def __init__( name: Optional[Text] = None, parallel_iterations: int = 1000, same_sampling: bool = False, + constrain_similarities=True, ) -> None: super().__init__(name=name) self.num_neg = num_neg @@ -591,6 +596,7 @@ def __init__( self.scale_loss = scale_loss self.parallel_iterations = parallel_iterations self.same_sampling = same_sampling + self.constrain_similarities = constrain_similarities @staticmethod def _make_flat(x: tf.Tensor) -> tf.Tensor: @@ -853,31 +859,49 @@ def _loss_softmax( ) -> tf.Tensor: """Define softmax loss.""" + # Similarity terms between input and label should be optimized relative + # to each other and hence use them as logits for softmax term softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1) - sigmoid_logits = tf.concat( - [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 - ) + if not self.constrain_similarities: + # Concatenate other similarity terms as well. Due to this, + # similarity values between input and label may not be + # approximately bounded in a defined range. + softmax_logits = tf.concat( + [softmax_logits, sim_neg_ii, sim_neg_ll], axis=-1 + ) # create label_ids for softmax softmax_label_ids = tf.zeros_like(softmax_logits[..., 0], tf.int32) - sigmoid_labels = tf.concat( - [ - tf.expand_dims(tf.ones_like(sigmoid_logits[..., 0], tf.float32), -1), - tf.zeros_like(sigmoid_logits[..., 1:], tf.float32), - ], - axis=-1, - ) - softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=softmax_label_ids, logits=softmax_logits ) - sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits( - labels=sigmoid_labels, logits=sigmoid_logits - ) - loss = softmax_loss + tf.reduce_mean(sigmoid_loss, axis=-1) + loss = softmax_loss + + if self.constrain_similarities: + # Constrain similarity values in a range by applying sigmoid + # on them individually so that they saturate at extreme values. 
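+            # With one positive column and num_neg negative columns per
+            # example, sigmoid_labels below comes out as [1, 0, ..., 0],
+            # i.e. every similarity is trained as an independent binary
+            # target rather than competing inside a single softmax.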
+ sigmoid_logits = tf.concat( + [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 + ) + + sigmoid_labels = tf.concat( + [ + tf.expand_dims( + tf.ones_like(sigmoid_logits[..., 0], tf.float32), -1 + ), + tf.zeros_like(sigmoid_logits[..., 1:], tf.float32), + ], + axis=-1, + ) + + sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=sigmoid_labels, logits=sigmoid_logits + ) + + loss += tf.reduce_mean(sigmoid_loss, axis=-1) if self.scale_loss: # in case of cross entropy log_likelihood = -loss @@ -893,14 +917,6 @@ def _loss_softmax( else: loss = tf.reduce_mean(loss, axis=-1) - tf.print( - tf.reduce_mean(sim_pos), - tf.reduce_mean(sim_neg_ii), - tf.reduce_mean(sim_neg_il), - tf.reduce_mean(sim_neg_ll), - tf.reduce_mean(sim_neg_li), - ) - # average the loss over the batch return tf.reduce_mean(loss) diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index f4ff88562645..bb81c0ac1772 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -55,6 +55,7 @@ CONCAT_DIMENSION, DROP_RATE_ATTENTION, SCALE_LOSS, + CONSTRAIN_SIMILARITIES, ) from rasa.utils.tensorflow import layers from rasa.utils.tensorflow.transformer import TransformerEncoder @@ -790,6 +791,7 @@ def _prepare_dot_product_loss( scale_loss, # set to 1 to get deterministic behaviour parallel_iterations=1 if self.random_seed is not None else 1000, + constrain_similarities=self.config[CONSTRAIN_SIMILARITIES], ) def _prepare_sparse_dense_dropout_layers( From bcecf412bd7125410c40d66006eab635a2a5a021 Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 7 Jan 2021 11:45:53 +0100 Subject: [PATCH 04/44] add sigmoid based prediction during inference --- rasa/core/policies/ted_policy.py | 9 +++++++- rasa/nlu/classifiers/diet_classifier.py | 10 ++++++++- rasa/nlu/selectors/response_selector.py | 3 +++ rasa/utils/tensorflow/constants.py | 1 + rasa/utils/tensorflow/layers.py | 26 +++++++++++++++++----- rasa/utils/tensorflow/models.py | 2 ++ rasa/utils/train_utils.py | 29 ++++++++++++++++++++----- 7 files changed, 68 insertions(+), 12 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 29c3e1ac9dfc..37f9a424663d 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -100,6 +100,7 @@ FEATURIZERS, ENTITY_RECOGNITION, CONSTRAIN_SIMILARITIES, + RELATIVE_CONFIDENCE, ) from rasa.shared.core.events import EntitiesAdded, Event from rasa.shared.nlu.training_data.message import Message @@ -276,6 +277,8 @@ class TEDPolicy(Policy): # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, + # Return sigmoid based probabilities during prediction. + RELATIVE_CONFIDENCE: True, } @staticmethod @@ -613,10 +616,14 @@ def predict_action_probabilities( confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) if self.config[LOSS_TYPE] == SOFTMAX and self.config[RANKING_LENGTH] > 0: - confidence = rasa.utils.train_utils.normalize( + confidence = rasa.utils.train_utils.sort_and_rank( confidence, self.config[RANKING_LENGTH] ) + if self.config[RELATIVE_CONFIDENCE]: + # Normalize the values if returned probabilities are from softmax. 
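+                # sort_and_rank() zeroes out everything below the top-k
+                # cutoff, so the remaining values no longer sum up to 1
+                # and need to be re-normalized here.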
+ confidence = rasa.utils.train_utils.normalize(confidence) + optional_events = self._create_optional_event_for_entities( output, is_e2e_prediction, interpreter, tracker ) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 8bf50afefc81..9fc18febf5cb 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -96,6 +96,7 @@ DENSE_DIMENSION, MASK, CONSTRAIN_SIMILARITIES, + RELATIVE_CONFIDENCE, ) logger = logging.getLogger(__name__) @@ -256,6 +257,8 @@ def required_components(cls) -> List[Type[Component]]: # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, + # Return sigmoid based probabilities during prediction. + RELATIVE_CONFIDENCE: True, } # init helpers @@ -858,10 +861,15 @@ def _predict_label( self.component_config[LOSS_TYPE] == SOFTMAX and self.component_config[RANKING_LENGTH] > 0 ): - message_sim = train_utils.normalize( + message_sim = train_utils.sort_and_rank( message_sim, self.component_config[RANKING_LENGTH] ) + if self.component_config[RELATIVE_CONFIDENCE]: + # Normalize the values if returned probabilities are from + # softmax(hence relative to each other). + message_sim = train_utils.normalize(message_sim) + message_sim[::-1].sort() message_sim = message_sim.tolist() diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index a1b325ba8600..ba00a29e9bf0 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -74,6 +74,7 @@ CHECKPOINT_MODEL, DENSE_DIMENSION, CONSTRAIN_SIMILARITIES, + RELATIVE_CONFIDENCE, ) from rasa.nlu.constants import ( RESPONSE_SELECTOR_PROPERTY_NAME, @@ -234,6 +235,8 @@ def required_components(cls) -> List[Type[Component]]: # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, + # Return sigmoid based probabilities during prediction. + RELATIVE_CONFIDENCE: False, } def __init__( diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index 6595fbfa090e..6f462ec1381e 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -53,6 +53,7 @@ DENSE_INPUT_DROPOUT = "use_dense_input_dropout" RANKING_LENGTH = "ranking_length" +RELATIVE_CONFIDENCE = "relative_confidence" BILOU_FLAG = "BILOU_flag" diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index fb1afc334165..93a294c72bab 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -570,6 +570,8 @@ class DotProductLoss(tf.keras.layers.Layer): similarity terms and adds to the loss function to ensure that similarity values are approximately bounded. Used inside _loss_softmax() only. + relative_confidence: Boolean, if 'True' confidence is calculated by applying + softmax over similarities, else sigmoid is applied on individual similarities. 
""" def __init__( @@ -584,7 +586,8 @@ def __init__( name: Optional[Text] = None, parallel_iterations: int = 1000, same_sampling: bool = False, - constrain_similarities=True, + constrain_similarities: bool = True, + relative_confidence: bool = True, ) -> None: super().__init__(name=name) self.num_neg = num_neg @@ -597,6 +600,7 @@ def __init__( self.parallel_iterations = parallel_iterations self.same_sampling = same_sampling self.constrain_similarities = constrain_similarities + self.relative_confidence = relative_confidence @staticmethod def _make_flat(x: tf.Tensor) -> tf.Tensor: @@ -737,14 +741,26 @@ def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tens return sim - @staticmethod - def confidence_from_sim(sim: tf.Tensor, similarity_type: Text) -> tf.Tensor: + def confidence_from_sim(self, sim: tf.Tensor, similarity_type: Text) -> tf.Tensor: + """Computes model confidence/probability from computed similarities. + + Args: + sim: Computed similarities + similarity_type: Similarity function to use - COSINE, INNER, AUTO. + + Returns: + Confidences corresponding to each similarity value. + """ if similarity_type == COSINE: # clip negative values to zero return tf.nn.relu(sim) else: - # normalize result to [0, 1] with softmax - return tf.nn.softmax(sim) + if self.relative_confidence: + # normalize result to [0, 1] with softmax + return tf.nn.softmax(sim) + else: + # Convert each individual similarity to probability + return tf.nn.sigmoid(sim) def _train_sim( self, diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index bb81c0ac1772..1abe040732ee 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -56,6 +56,7 @@ DROP_RATE_ATTENTION, SCALE_LOSS, CONSTRAIN_SIMILARITIES, + RELATIVE_CONFIDENCE, ) from rasa.utils.tensorflow import layers from rasa.utils.tensorflow.transformer import TransformerEncoder @@ -792,6 +793,7 @@ def _prepare_dot_product_loss( # set to 1 to get deterministic behaviour parallel_iterations=1 if self.random_seed is not None else 1000, constrain_similarities=self.config[CONSTRAIN_SIMILARITIES], + relative_confidence=self.config[RELATIVE_CONFIDENCE], ) def _prepare_sparse_dense_dropout_layers( diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index fb9ea1faf6ed..9eec452176cb 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -32,19 +32,38 @@ from rasa.nlu.tokenizers.tokenizer import Token -def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: +def normalize(values: np.ndarray) -> np.ndarray: """Normalizes an array of positive numbers over the top `ranking_length` values. + Args: + values: Values to normalize + + Returns: + Normalized values. + """ + new_values = values.copy() + + if np.sum(new_values) > 0: + new_values = new_values / np.sum(new_values) + + return new_values + + +def sort_and_rank(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: + """Sorts the values in descending order and keep only top `ranking_length` values. + Other values will be set to 0. + Args: + values: Values to sort and rank + ranking_length: number of values to maintain above 0. + + Returns: + Modified values. 
""" new_values = values.copy() # prevent mutation of the input if 0 < ranking_length < len(new_values): ranked = sorted(new_values, reverse=True) new_values[new_values < ranked[ranking_length - 1]] = 0 - - if np.sum(new_values) > 0: - new_values = new_values / np.sum(new_values) - return new_values From 6746bbdee0d817542a7cfb4b33683210690c6497 Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 7 Jan 2021 12:56:11 +0100 Subject: [PATCH 05/44] docs, changelog, docstrings --- changelog/7616.improvement.md | 6 ++++++ docs/docs/components.mdx | 24 ++++++++++++++++++++++++ docs/docs/policies.mdx | 12 ++++++++++++ rasa/core/policies/ted_policy.py | 2 +- rasa/nlu/classifiers/diet_classifier.py | 2 +- rasa/nlu/selectors/response_selector.py | 6 +++--- rasa/utils/tensorflow/layers.py | 2 +- 7 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 changelog/7616.improvement.md diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md new file mode 100644 index 000000000000..bd6007ae1bc3 --- /dev/null +++ b/changelog/7616.improvement.md @@ -0,0 +1,6 @@ +Constrain similarity values to an approximate range in `DotProductLoss` by applying sigmoid over them during training. + +This affects the default behaviour of the loss function(`loss_type=softmax`) inside machine learning(ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. +If you notice a degradation in performance, set `constrain_similarities=False` in the respective ML component. + +Also, adds an option `relative_confidence` to each ML component. Contrary to the default behaviour, if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 but they will not sum up to 1. \ No newline at end of file diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index fa23ed2d93cd..f57866432aa8 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -1616,6 +1616,18 @@ However, additional parameters exist that can be adapted. | | | ... | | | | ``` | +---------------------------------+------------------+--------------------------------------------------------------+ +| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | +| | | it to the loss function to ensure that similarity values are | +| | | approximately bounded. Used only when `loss_type=softmax` | ++---------------------------------+------------------+--------------------------------------------------------------+ +| relative_confidence | True | If `True`, applies softmax on all similarity values for pairs| +| | | of input and labels. This means that output confidences | +| | | will always add up to 1. | +| | | If `False`, applies sigmoid on all similarity values for | +| | | pairs of input and labels. This means that confidence for | +| | | each label will be between 0 and 1 but all of them won't add | +| | | up to 1. | ++---------------------------------+------------------+--------------------------------------------------------------+ ``` :::note @@ -2809,6 +2821,18 @@ However, additional parameters exist that can be adapted. | | | Requires `evaluate_on_number_of_examples > 0` and | | | | `evaluate_every_number_of_epochs > 0` | +---------------------------------+-------------------+--------------------------------------------------------------+ +| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | +| | | it to the loss function to ensure that similarity values are | +| | | approximately bounded. 
Used only when `loss_type=softmax` | ++---------------------------------+------------------+--------------------------------------------------------------+ +| relative_confidence | True | If `True`, applies softmax on all similarity values for pairs| +| | | of input and labels. This means that output confidences | +| | | will always add up to 1. | +| | | If `False`, applies sigmoid on all similarity values for | +| | | pairs of input and labels. This means that confidence for | +| | | each label will be between 0 and 1 but all of them won't add | +| | | up to 1. | ++---------------------------------+------------------+--------------------------------------------------------------+ ``` :::note diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx index 6dcd139907cf..788e03c43c55 100644 --- a/docs/docs/policies.mdx +++ b/docs/docs/policies.mdx @@ -320,6 +320,18 @@ However, additional parameters exist that can be adapted. | entity_recognition | True | If 'True' entity recognition is trained and entities are | | | | extracted. | +---------------------------------------+------------------------+--------------------------------------------------------------+ +| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | +| | | it to the loss function to ensure that similarity values are | +| | | approximately bounded. Used only when `loss_type=softmax` | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| relative_confidence | True | If `True`, applies softmax on all similarity values for pairs| +| | | of input and labels. This means that output confidences | +| | | will always add up to 1. | +| | | If `False`, applies sigmoid on all similarity values for | +| | | pairs of input and labels. This means that confidence for | +| | | each label will be between 0 and 1 but all of them won't add | +| | | up to 1. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ ``` :::note diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 37f9a424663d..4e401befff6a 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -277,7 +277,7 @@ class TEDPolicy(Policy): # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, - # Return sigmoid based probabilities during prediction. + # Return softmax based probabilities during prediction. RELATIVE_CONFIDENCE: True, } diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 9fc18febf5cb..d13ba9e47bec 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -257,7 +257,7 @@ def required_components(cls) -> List[Type[Component]]: # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, - # Return sigmoid based probabilities during prediction. + # Return softmax based probabilities during prediction. 
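+        # If set to 'False', sigmoid is applied to each similarity value
+        # instead, so label confidences lie in [0, 1] but need not sum to 1.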
RELATIVE_CONFIDENCE: True, } diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index ba00a29e9bf0..779a50afccf4 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -235,8 +235,8 @@ def required_components(cls) -> List[Type[Component]]: # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, - # Return sigmoid based probabilities during prediction. - RELATIVE_CONFIDENCE: False, + # Return softmax based probabilities during prediction. + RELATIVE_CONFIDENCE: True, } def __init__( @@ -249,7 +249,7 @@ def __init__( responses: Optional[Dict[Text, List[Dict[Text, Any]]]] = None, finetune_mode: bool = False, ) -> None: - + """Declare instance variables with default values.""" component_config = component_config or {} # the following properties cannot be adapted for the ResponseSelector diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 93a294c72bab..04f98eb6fc85 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -589,6 +589,7 @@ def __init__( constrain_similarities: bool = True, relative_confidence: bool = True, ) -> None: + """Declare instance variables with default values.""" super().__init__(name=name) self.num_neg = num_neg self.loss_type = loss_type @@ -874,7 +875,6 @@ def _loss_softmax( mask: Optional[tf.Tensor], ) -> tf.Tensor: """Define softmax loss.""" - # Similarity terms between input and label should be optimized relative # to each other and hence use them as logits for softmax term softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1) From b674d731d0d85661cdd53fb0ca5781528cb77b09 Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 7 Jan 2021 14:14:13 +0100 Subject: [PATCH 06/44] add tests --- rasa/core/policies/ted_policy.py | 14 +++++ rasa/nlu/classifiers/diet_classifier.py | 16 +++++- tests/nlu/classifiers/test_diet_classifier.py | 52 +++++++++++++++++++ tests/utils/test_train_utils.py | 21 ++++++++ 4 files changed, 102 insertions(+), 1 deletion(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 4e401befff6a..c8a5af723d9a 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -324,11 +324,25 @@ def __init__( self._label_data: Optional[RasaModelData] = None self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None + def _check_similarity_confidence_setting(self) -> None: + if ( + not self.config[CONSTRAIN_SIMILARITIES] + and not self.config[RELATIVE_CONFIDENCE] + ): + raise ValueError( + f"If {CONSTRAIN_SIMILARITIES} is set to False, " + f"{RELATIVE_CONFIDENCE} cannot be set to False as" + f"similarities need to be constrained during training " + f"time in order to compute appropriate confidence values " + f"for each label at inference time." 
+ ) + def _load_params(self, **kwargs: Dict[Text, Any]) -> None: new_config = rasa.utils.train_utils.check_core_deprecated_options(kwargs) self.config = rasa.utils.train_utils.override_defaults( self.defaults, new_config ) + self._check_similarity_confidence_setting() self.config = rasa.utils.train_utils.update_similarity_type(self.config) self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index d13ba9e47bec..01c6ed3f6390 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -254,7 +254,7 @@ def required_components(cls) -> List[Type[Component]]: # Split entities by comma, this makes sense e.g. for a list of ingredients # in a recipie, but it doesn't make sense for the parts of an address SPLIT_ENTITIES_BY_COMMA: True, - # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to + # If 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, # Return softmax based probabilities during prediction. @@ -290,6 +290,19 @@ def _check_share_hidden_layers_sizes(self) -> None: f"{HIDDEN_LAYERS_SIZES} must coincide." ) + def _check_similarity_confidence_setting(self) -> None: + if ( + not self.component_config[CONSTRAIN_SIMILARITIES] + and not self.component_config[RELATIVE_CONFIDENCE] + ): + raise ValueError( + f"If {CONSTRAIN_SIMILARITIES} is set to False, " + f"{RELATIVE_CONFIDENCE} cannot be set to False as" + f"similarities need to be constrained during training " + f"time in order to compute appropriate confidence values " + f"for each label at inference time." 
+ ) + def _check_config_parameters(self) -> None: self.component_config = train_utils.check_deprecated_options( self.component_config @@ -297,6 +310,7 @@ def _check_config_parameters(self) -> None: self._check_masked_lm() self._check_share_hidden_layers_sizes() + self._check_similarity_confidence_setting() self.component_config = train_utils.update_similarity_type( self.component_config diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index 90f20a61039e..be5f338b0f22 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -4,6 +4,7 @@ import pytest from unittest.mock import Mock from typing import List, Text, Dict, Any +from _pytest.monkeypatch import MonkeyPatch from rasa.shared.nlu.training_data.features import Features from rasa.nlu import train @@ -29,6 +30,7 @@ BILOU_FLAG, ENTITY_RECOGNITION, INTENT_CLASSIFICATION, + RELATIVE_CONFIDENCE, ) from rasa.nlu.components import ComponentBuilder from rasa.nlu.classifiers.diet_classifier import DIETClassifier @@ -311,6 +313,56 @@ async def test_softmax_normalization( assert parse_data.get("intent") == intent_ranking[0] +@pytest.mark.parametrize( + "classifier_params, output_length", + [({RANDOM_SEED: 42, EPOCHS: 1, RELATIVE_CONFIDENCE: False}, LABEL_RANKING_LENGTH)], +) +async def test_softmax_with_absolute_confidence( + component_builder, + tmp_path, + classifier_params, + output_length, + monkeypatch: MonkeyPatch, +): + pipeline = as_pipeline( + "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier" + ) + assert pipeline[2]["name"] == "DIETClassifier" + pipeline[2].update(classifier_params) + + _config = RasaNLUModelConfig({"pipeline": pipeline}) + (trained_model, _, persisted_path) = await train( + _config, + path=str(tmp_path), + data="data/test/many_intents.md", + component_builder=component_builder, + ) + loaded = Interpreter.load(persisted_path, component_builder) + + mock = Mock() + monkeypatch.setattr(train_utils, "normalize", mock.normalize) + + parse_data = loaded.parse("hello") + intent_ranking = parse_data.get("intent_ranking") + + # check that the output was correctly truncated + assert len(intent_ranking) == output_length + + intent_confidences = [intent.get("confidence") for intent in intent_ranking] + + # check each confidence is in range + confidence_in_range = [ + 0.0 <= confidence <= 1.0 for confidence in intent_confidences + ] + assert all(confidence_in_range) + + # normalize shouldn't have been called + mock.normalize.assert_not_called() + + # check whether the normalization of rankings is reflected in intent prediction + assert parse_data.get("intent") == intent_ranking[0] + + @pytest.mark.parametrize( "classifier_params, output_length", [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 1}, LABEL_RANKING_LENGTH)], diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 8400a2be68e9..3bdd6d743ef6 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -1,4 +1,6 @@ import numpy as np +import pytest +from typing import List import rasa.utils.train_utils as train_utils from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS @@ -26,3 +28,22 @@ def test_align_token_features(): assert np.all(actual_features[0][3] == np.mean(token_features[0][3:5], axis=0)) # embedding is split into 4 sub-tokens assert np.all(actual_features[0][4] == np.mean(token_features[0][5:10], axis=0)) + + +def test_normalize(): + input_values = [0.7, 0.1, 0.1] + 
normalized_values = train_utils.normalize(np.array(input_values)) + assert np.allclose( + normalized_values, np.array([0.77777778, 0.11111111, 0.11111111]), atol=1e-5 + ) + + +@pytest.mark.parametrize( + "input_values, ranking_length, output_values", + [([0.5, 0.8, 0.1], 2, [0.5, 0.8, 0.0]), ([0.5, 0.3, 0.9], 5, [0.5, 0.3, 0.9]),], +) +def test_sort_and_rank( + input_values: List[float], ranking_length: int, output_values: List[float] +): + ranked_values = train_utils.sort_and_rank(np.array(input_values), ranking_length) + assert np.array_equal(ranked_values, output_values) From 4d4d52e7f0c5f9cb7599b9258faf3572f32774de Mon Sep 17 00:00:00 2001 From: Daksh Date: Tue, 12 Jan 2021 01:12:58 +0100 Subject: [PATCH 07/44] review comments --- changelog/7616.improvement.md | 8 ++-- rasa/core/policies/ted_policy.py | 16 +++++-- rasa/nlu/classifiers/diet_classifier.py | 33 ++++++------- rasa/nlu/selectors/response_selector.py | 6 +-- rasa/utils/tensorflow/constants.py | 1 + rasa/utils/tensorflow/layers.py | 62 +++++++++++++++++++------ rasa/utils/train_utils.py | 58 ++++++++++++++++++++++- 7 files changed, 140 insertions(+), 44 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index bd6007ae1bc3..b4849350acb4 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -1,6 +1,8 @@ -Constrain similarity values to an approximate range in `DotProductLoss` by applying sigmoid over them during training. +Added cross-entropy loss over sigmoid of all similarity values to constrain them in an approximate range in `DotProductLoss`. -This affects the default behaviour of the loss function(`loss_type=softmax`) inside machine learning(ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. +This affects the default behaviour of the loss function(`loss_type=cross_entropy`) inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. If you notice a degradation in performance, set `constrain_similarities=False` in the respective ML component. -Also, adds an option `relative_confidence` to each ML component. Contrary to the default behaviour, if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 but they will not sum up to 1. \ No newline at end of file +Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. + +Also, added an option `relative_confidence` to each ML component. Contrary to the default behaviour, if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 but they will not sum up to 1. \ No newline at end of file diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index c8a5af723d9a..37049f84365f 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -79,7 +79,8 @@ KEY_RELATIVE_ATTENTION, VALUE_RELATIVE_ATTENTION, MAX_RELATIVE_POSITION, - SOFTMAX, + CROSS_ENTROPY, + INNER, AUTO, BALANCED, TENSORBOARD_LOG_DIR, @@ -210,7 +211,7 @@ class TEDPolicy(Policy): # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. SIMILARITY_TYPE: AUTO, # The type of the loss function, either 'softmax' or 'margin'. - LOSS_TYPE: SOFTMAX, + LOSS_TYPE: CROSS_ENTROPY, # Number of top actions to normalize scores for loss type 'softmax'. # Set to 0 to turn off normalization. 
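+    # NOTE: 'softmax' in the comment above refers to the deprecated alias
+    # of 'cross_entropy'; ranking still applies when loss_type=cross_entropy.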
RANKING_LENGTH: 10, @@ -342,7 +343,9 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self.config = rasa.utils.train_utils.override_defaults( self.defaults, new_config ) - self._check_similarity_confidence_setting() + rasa.utils.train_utils._check_similarity_confidence_setting(self.config) + rasa.utils.train_utils._check_similarity_loss_setting(self.config) + self.config = rasa.utils.train_utils.update_loss_type(self.config) self.config = rasa.utils.train_utils.update_similarity_type(self.config) self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config) @@ -629,12 +632,15 @@ def predict_action_probabilities( # take correct prediction from batch confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) - if self.config[LOSS_TYPE] == SOFTMAX and self.config[RANKING_LENGTH] > 0: + if self.config[LOSS_TYPE] == CROSS_ENTROPY and self.config[RANKING_LENGTH] > 0: confidence = rasa.utils.train_utils.sort_and_rank( confidence, self.config[RANKING_LENGTH] ) - if self.config[RELATIVE_CONFIDENCE]: + if ( + self.config[SIMILARITY_TYPE] == INNER + and self.config[RELATIVE_CONFIDENCE] + ): # Normalize the values if returned probabilities are from softmax. confidence = rasa.utils.train_utils.normalize(confidence) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 01c6ed3f6390..12ba2cb4e692 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -83,9 +83,10 @@ KEY_RELATIVE_ATTENTION, VALUE_RELATIVE_ATTENTION, MAX_RELATIVE_POSITION, - SOFTMAX, AUTO, + INNER, BALANCED, + CROSS_ENTROPY, TENSORBOARD_LOG_LEVEL, CONCAT_DIMENSION, FEATURIZERS, @@ -184,8 +185,8 @@ def required_components(cls) -> List[Type[Component]]: NUM_NEG: 20, # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. SIMILARITY_TYPE: AUTO, - # The type of the loss function, either 'softmax' or 'margin'. - LOSS_TYPE: SOFTMAX, + # The type of the loss function, either 'cross_entropy' or 'margin'. + LOSS_TYPE: CROSS_ENTROPY, # Number of top actions to normalize scores for loss type 'softmax'. # Set to 0 to turn off normalization. RANKING_LENGTH: 10, @@ -290,19 +291,6 @@ def _check_share_hidden_layers_sizes(self) -> None: f"{HIDDEN_LAYERS_SIZES} must coincide." ) - def _check_similarity_confidence_setting(self) -> None: - if ( - not self.component_config[CONSTRAIN_SIMILARITIES] - and not self.component_config[RELATIVE_CONFIDENCE] - ): - raise ValueError( - f"If {CONSTRAIN_SIMILARITIES} is set to False, " - f"{RELATIVE_CONFIDENCE} cannot be set to False as" - f"similarities need to be constrained during training " - f"time in order to compute appropriate confidence values " - f"for each label at inference time." 
- ) - def _check_config_parameters(self) -> None: self.component_config = train_utils.check_deprecated_options( self.component_config @@ -310,7 +298,11 @@ def _check_config_parameters(self) -> None: self._check_masked_lm() self._check_share_hidden_layers_sizes() - self._check_similarity_confidence_setting() + + train_utils._check_similarity_confidence_setting(self.component_config) + train_utils._check_similarity_loss_setting(self.component_config) + + self.component_config = train_utils.update_loss_type(self.component_config) self.component_config = train_utils.update_similarity_type( self.component_config @@ -872,14 +864,17 @@ def _predict_label( label_ids = message_sim.argsort()[::-1] if ( - self.component_config[LOSS_TYPE] == SOFTMAX + self.component_config[LOSS_TYPE] == CROSS_ENTROPY and self.component_config[RANKING_LENGTH] > 0 ): message_sim = train_utils.sort_and_rank( message_sim, self.component_config[RANKING_LENGTH] ) - if self.component_config[RELATIVE_CONFIDENCE]: + if ( + self.component_config[SIMILARITY_TYPE] == INNER + and self.component_config[RELATIVE_CONFIDENCE] + ): # Normalize the values if returned probabilities are from # softmax(hence relative to each other). message_sim = train_utils.normalize(message_sim) diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 779a50afccf4..fa065eb7cdb7 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -64,7 +64,7 @@ MAX_RELATIVE_POSITION, RETRIEVAL_INTENT, USE_TEXT_AS_LABEL, - SOFTMAX, + CROSS_ENTROPY, AUTO, BALANCED, TENSORBOARD_LOG_DIR, @@ -171,8 +171,8 @@ def required_components(cls) -> List[Type[Component]]: NUM_NEG: 20, # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. SIMILARITY_TYPE: AUTO, - # The type of the loss function, either 'softmax' or 'margin'. - LOSS_TYPE: SOFTMAX, + # The type of the loss function, either 'cross_entropy' or 'margin'. + LOSS_TYPE: CROSS_ENTROPY, # Number of top actions to normalize scores for loss type 'softmax'. # Set to 0 to turn off normalization. RANKING_LENGTH: 10, diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index 6f462ec1381e..f8371d5c6975 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -66,6 +66,7 @@ AUTO = "auto" INNER = "inner" COSINE = "cosine" +CROSS_ENTROPY = "cross_entropy" BALANCED = "balanced" diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 04f98eb6fc85..3a034115a561 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -5,7 +5,13 @@ import rasa.utils.tensorflow.crf from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras import backend as K -from rasa.utils.tensorflow.constants import SOFTMAX, MARGIN, COSINE, INNER +from rasa.utils.tensorflow.constants import ( + SOFTMAX, + MARGIN, + COSINE, + INNER, + CROSS_ENTROPY, +) logger = logging.getLogger(__name__) @@ -589,7 +595,37 @@ def __init__( constrain_similarities: bool = True, relative_confidence: bool = True, ) -> None: - """Declare instance variables with default values.""" + """Declare instance variables with default values. + + Args: + num_neg: Positive integer, the number of incorrect labels; + the algorithm will minimize their similarity to the input. + loss_type: The type of the loss function, either 'softmax' or 'margin'. 
+ mu_pos: Float, indicates how similar the algorithm should + try to make embedding vectors for correct labels; + should be 0.0 < ... < 1.0 for 'cosine' similarity type. + mu_neg: Float, maximum negative similarity for incorrect labels, + should be -1.0 < ... < 1.0 for 'cosine' similarity type. + use_max_sim_neg: Boolean, if 'True' the algorithm only minimizes + maximum similarity over incorrect intent labels, + used only if 'loss_type' is set to 'margin'. + neg_lambda: Float, the scale of how important is to minimize + the maximum similarity between embeddings of different labels, + used only if 'loss_type' is set to 'margin'. + scale_loss: Boolean, if 'True' scale loss inverse proportionally to + the confidence of the correct prediction. + name: Optional name of the layer. + parallel_iterations: Positive integer, the number of iterations allowed + to run in parallel. + same_sampling: Boolean, if 'True' sample same negative labels + for the whole batch. + constrain_similarities: Boolean, if 'True' applies sigmoid on all + similarity terms and adds to the loss function to + ensure that similarity values are approximately bounded. + Used inside _loss_softmax() only. + relative_confidence: Boolean, if 'True' confidence is calculated by applying + softmax over similarities, else sigmoid is applied on individual similarities. + """ super().__init__(name=name) self.num_neg = num_neg self.loss_type = loss_type @@ -755,13 +791,12 @@ def confidence_from_sim(self, sim: tf.Tensor, similarity_type: Text) -> tf.Tenso if similarity_type == COSINE: # clip negative values to zero return tf.nn.relu(sim) + if self.relative_confidence: + # normalize result to [0, 1] with softmax + return tf.nn.softmax(sim) else: - if self.relative_confidence: - # normalize result to [0, 1] with softmax - return tf.nn.softmax(sim) - else: - # Convert each individual similarity to probability - return tf.nn.sigmoid(sim) + # Convert each individual similarity to probability + return tf.nn.sigmoid(sim) def _train_sim( self, @@ -865,7 +900,7 @@ def _loss_margin( return loss - def _loss_softmax( + def _loss_cross_entropy( self, sim_pos: tf.Tensor, sim_neg_il: tf.Tensor, @@ -874,7 +909,7 @@ def _loss_softmax( sim_neg_li: tf.Tensor, mask: Optional[tf.Tensor], ) -> tf.Tensor: - """Define softmax loss.""" + """Define cross entropy loss.""" # Similarity terms between input and label should be optimized relative # to each other and hence use them as logits for softmax term softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1) @@ -917,6 +952,7 @@ def _loss_softmax( labels=sigmoid_labels, logits=sigmoid_logits ) + # average over logits axis loss += tf.reduce_mean(sigmoid_loss, axis=-1) if self.scale_loss: @@ -942,12 +978,12 @@ def _chosen_loss(self) -> Callable: if self.loss_type == MARGIN: return self._loss_margin - elif self.loss_type == SOFTMAX: - return self._loss_softmax + elif self.loss_type == CROSS_ENTROPY: + return self._loss_cross_entropy else: raise ValueError( f"Wrong loss type '{self.loss_type}', " - f"should be '{MARGIN}' or '{SOFTMAX}'" + f"should be '{MARGIN}' or '{CROSS_ENTROPY}'" ) # noinspection PyMethodOverriding diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 9eec452176cb..94daf5631962 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -19,9 +19,12 @@ AUTO, INNER, COSINE, + CROSS_ENTROPY, TRANSFORMER_SIZE, NUM_TRANSFORMER_LAYERS, DENSE_DIMENSION, + CONSTRAIN_SIMILARITIES, + RELATIVE_CONFIDENCE, ) from rasa.shared.nlu.constants import ACTION_NAME, 
INTENT, ENTITIES from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS @@ -77,7 +80,7 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: Returns: updated model configuration """ if config.get(SIMILARITY_TYPE) == AUTO: - if config[LOSS_TYPE] == SOFTMAX: + if config[LOSS_TYPE] == CROSS_ENTROPY: config[SIMILARITY_TYPE] = INNER elif config[LOSS_TYPE] == MARGIN: config[SIMILARITY_TYPE] = COSINE @@ -85,6 +88,27 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: return config +def update_loss_type(config: Dict[Text, Any]) -> Dict[Text, Any]: + """ + If LOSS_TYPE is set to 'softmax', update it to 'cross_entropy' since former is deprecated. + Args: + config: model configuration + + Returns: updated model configuration + """ + # TODO: Completely deprecate this with 3.0 + if config.get(LOSS_TYPE) == SOFTMAX: + rasa.shared.utils.io.raise_deprecation_warning( + f"`{LOSS_TYPE}={SOFTMAX}` is deprecated. " + f"Please update your configuration file to use" + f"`{LOSS_TYPE}={CROSS_ENTROPY}` instead.", + warn_until_version=NEXT_MAJOR_VERSION_FOR_DEPRECATIONS, + ) + config[LOSS_TYPE] = CROSS_ENTROPY + + return config + + def align_token_features( list_of_tokens: List[List["Token"]], in_token_features: np.ndarray, @@ -354,3 +378,35 @@ def override_defaults( config[key] = custom[key] return config + + +def _check_similarity_confidence_setting(component_config) -> None: + if ( + not component_config[CONSTRAIN_SIMILARITIES] + and not component_config[RELATIVE_CONFIDENCE] + ): + raise ValueError( + f"If {CONSTRAIN_SIMILARITIES} is set to False, " + f"{RELATIVE_CONFIDENCE} cannot be set to False as" + f"similarities need to be constrained during training " + f"time in order to compute appropriate confidence values " + f"for each label at inference time." + ) + + +def _check_similarity_loss_setting(component_config) -> None: + if ( + component_config[SIMILARITY_TYPE] == COSINE + and component_config[LOSS_TYPE] == CROSS_ENTROPY + or component_config[SIMILARITY_TYPE] == INNER + and component_config[LOSS_TYPE] == MARGIN + ): + raise rasa.shared.utils.io.raise_warning( + f"`{SIMILARITY_TYPE}={component_config[SIMILARITY_TYPE]}`" + f" and `{LOSS_TYPE}={component_config[LOSS_TYPE]}` " + f"is not a recommended setting as it may not lead to best results." + f"Ideally use `{SIMILARITY_TYPE}={INNER}`" + f" and `{LOSS_TYPE}={CROSS_ENTROPY}` or" + f"`{SIMILARITY_TYPE}={COSINE}` and `{LOSS_TYPE}={MARGIN}`.", + category=UserWarning, + ) From db17411e22ba05d701288b4554ca105d25be6658 Mon Sep 17 00:00:00 2001 From: Daksh Date: Tue, 12 Jan 2021 12:50:20 +0100 Subject: [PATCH 08/44] review comments --- changelog/7616.improvement.md | 4 +-- docs/docs/migration-guide.mdx | 16 +++++++++ rasa/core/policies/ted_policy.py | 15 ++------- rasa/nlu/classifiers/diet_classifier.py | 2 ++ rasa/utils/tensorflow/layers.py | 44 ++++--------------------- rasa/utils/train_utils.py | 16 +++++++-- 6 files changed, 42 insertions(+), 55 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index b4849350acb4..c1cdb8c8c966 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -1,8 +1,8 @@ -Added cross-entropy loss over sigmoid of all similarity values to constrain them in an approximate range in `DotProductLoss`. +Added sigmoid cross-entropy loss on all similarity values to constrain them to an approximate range in `DotProductLoss`. 
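+Without this extra term, the raw dot-product similarities are unbounded; the sigmoid cross-entropy drives each similarity towards a saturating target (1 for the positive pair, 0 for all sampled negative pairs), keeping the values in an approximately fixed range.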
This affects the default behaviour of the loss function(`loss_type=cross_entropy`) inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. If you notice a degradation in performance, set `constrain_similarities=False` in the respective ML component. Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. -Also, added an option `relative_confidence` to each ML component. Contrary to the default behaviour, if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 but they will not sum up to 1. \ No newline at end of file +Also, added an option `relative_confidence` to each ML component. Contrary to the default behaviour, if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 but they will not sum up to 1. It is also recommended to set `relative_confidence=False` as it will be made default in Rasa Open Source 3.0. You may need to tune fallback confidence thresholds after making this change. \ No newline at end of file diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 203b3d7bcc3b..2c34fdc904a5 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -10,6 +10,22 @@ description: | This page contains information about changes between major versions and how you can migrate from one version to another. +## Rasa 2.2 to Rasa 2.3 + +### Machine Learning Components + +Few changes have been made to the default loss function inside machine learning (ML) +components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include: +- Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. +- The default loss function (`loss_type=cross_entropy`) adds a sigmoid cross-entropy loss of all similarity values to constrain +them to an approximate range. If you notice a degradation in performance, set `constrain_similarities=False` +in the respective ML component. +- Added an option `relative_confidence` to each ML component. Contrary to the default behaviour, +if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 +but they will not sum up to 1. It is also recommended to set `relative_confidence=False` as it will be made +default in Rasa Open Source 3.0. You may need to tune fallback confidence thresholds after making this change. + + ## Rasa 2.1 to Rasa 2.2 ### General diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 37049f84365f..20191a300cfa 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -325,24 +325,12 @@ def __init__( self._label_data: Optional[RasaModelData] = None self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None - def _check_similarity_confidence_setting(self) -> None: - if ( - not self.config[CONSTRAIN_SIMILARITIES] - and not self.config[RELATIVE_CONFIDENCE] - ): - raise ValueError( - f"If {CONSTRAIN_SIMILARITIES} is set to False, " - f"{RELATIVE_CONFIDENCE} cannot be set to False as" - f"similarities need to be constrained during training " - f"time in order to compute appropriate confidence values " - f"for each label at inference time." 
- ) - def _load_params(self, **kwargs: Dict[Text, Any]) -> None: new_config = rasa.utils.train_utils.check_core_deprecated_options(kwargs) self.config = rasa.utils.train_utils.override_defaults( self.defaults, new_config ) + rasa.utils.train_utils._check_confidence_setting(self.config) rasa.utils.train_utils._check_similarity_confidence_setting(self.config) rasa.utils.train_utils._check_similarity_loss_setting(self.config) self.config = rasa.utils.train_utils.update_loss_type(self.config) @@ -813,6 +801,7 @@ def load( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data ) meta = rasa.utils.train_utils.update_similarity_type(meta) + meta = rasa.utils.train_utils.update_loss_type(meta) meta[EPOCHS] = epoch_override diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 12ba2cb4e692..b4931627620f 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -299,6 +299,7 @@ def _check_config_parameters(self) -> None: self._check_masked_lm() self._check_share_hidden_layers_sizes() + train_utils._check_confidence_setting(self.component_config) train_utils._check_similarity_confidence_setting(self.component_config) train_utils._check_similarity_loss_setting(self.component_config) @@ -1018,6 +1019,7 @@ def load( ) = cls._load_from_files(meta, model_dir) meta = train_utils.update_similarity_type(meta) + meta = train_utils.update_loss_type(meta) model = cls._load_model( entity_tag_specs, diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 3a034115a561..18ad1ec38378 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -548,37 +548,7 @@ def f1_score( class DotProductLoss(tf.keras.layers.Layer): - """Dot-product loss layer. - - Arguments: - num_neg: Positive integer, the number of incorrect labels; - the algorithm will minimize their similarity to the input. - loss_type: The type of the loss function, either 'softmax' or 'margin'. - mu_pos: Float, indicates how similar the algorithm should - try to make embedding vectors for correct labels; - should be 0.0 < ... < 1.0 for 'cosine' similarity type. - mu_neg: Float, maximum negative similarity for incorrect labels, - should be -1.0 < ... < 1.0 for 'cosine' similarity type. - use_max_sim_neg: Boolean, if 'True' the algorithm only minimizes - maximum similarity over incorrect intent labels, - used only if 'loss_type' is set to 'margin'. - neg_lambda: Float, the scale of how important is to minimize - the maximum similarity between embeddings of different labels, - used only if 'loss_type' is set to 'margin'. - scale_loss: Boolean, if 'True' scale loss inverse proportionally to - the confidence of the correct prediction. - name: Optional name of the layer. - parallel_iterations: Positive integer, the number of iterations allowed - to run in parallel. - same_sampling: Boolean, if 'True' sample same negative labels - for the whole batch. - constrain_similarities: Boolean, if 'True' applies sigmoid on all - similarity terms and adds to the loss function to - ensure that similarity values are approximately bounded. - Used inside _loss_softmax() only. - relative_confidence: Boolean, if 'True' confidence is calculated by applying - softmax over similarities, else sigmoid is applied on individual similarities. 
- """ + """Dot-product loss layer""" def __init__( self, @@ -600,7 +570,7 @@ def __init__( Args: num_neg: Positive integer, the number of incorrect labels; the algorithm will minimize their similarity to the input. - loss_type: The type of the loss function, either 'softmax' or 'margin'. + loss_type: The type of the loss function, either 'cross_entropy' or 'margin'. mu_pos: Float, indicates how similar the algorithm should try to make embedding vectors for correct labels; should be 0.0 < ... < 1.0 for 'cosine' similarity type. @@ -622,7 +592,7 @@ def __init__( constrain_similarities: Boolean, if 'True' applies sigmoid on all similarity terms and adds to the loss function to ensure that similarity values are approximately bounded. - Used inside _loss_softmax() only. + Used inside _loss_cross_entropy() only. relative_confidence: Boolean, if 'True' confidence is calculated by applying softmax over similarities, else sigmoid is applied on individual similarities. """ @@ -791,12 +761,12 @@ def confidence_from_sim(self, sim: tf.Tensor, similarity_type: Text) -> tf.Tenso if similarity_type == COSINE: # clip negative values to zero return tf.nn.relu(sim) - if self.relative_confidence: + elif self.relative_confidence: # normalize result to [0, 1] with softmax return tf.nn.softmax(sim) - else: - # Convert each individual similarity to probability - return tf.nn.sigmoid(sim) + + # In other cases convert each individual similarity to probability + return tf.nn.sigmoid(sim) def _train_sim( self, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 94daf5631962..f894c17dfe8f 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -380,6 +380,16 @@ def override_defaults( return config +def _check_confidence_setting(component_config) -> None: + if component_config[RELATIVE_CONFIDENCE]: + rasa.shared.utils.io.raise_warning( + f"{RELATIVE_CONFIDENCE} is set to `True`. It is recommended " + f"to set it to `False`. It will be set to `False` by default " + f"Rasa Open Source 3.0 onwards.", + category=UserWarning, + ) + + def _check_similarity_confidence_setting(component_config) -> None: if ( not component_config[CONSTRAIN_SIMILARITIES] @@ -387,9 +397,9 @@ def _check_similarity_confidence_setting(component_config) -> None: ): raise ValueError( f"If {CONSTRAIN_SIMILARITIES} is set to False, " - f"{RELATIVE_CONFIDENCE} cannot be set to False as" + f"{RELATIVE_CONFIDENCE} cannot be set to False as " f"similarities need to be constrained during training " - f"time in order to compute appropriate confidence values " + f"time as well in order to correctly compute confidence values " f"for each label at inference time." ) @@ -401,7 +411,7 @@ def _check_similarity_loss_setting(component_config) -> None: or component_config[SIMILARITY_TYPE] == INNER and component_config[LOSS_TYPE] == MARGIN ): - raise rasa.shared.utils.io.raise_warning( + rasa.shared.utils.io.raise_warning( f"`{SIMILARITY_TYPE}={component_config[SIMILARITY_TYPE]}`" f" and `{LOSS_TYPE}={component_config[LOSS_TYPE]}` " f"is not a recommended setting as it may not lead to best results." 
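
The commits that follow keep adjusting one idea: a softmax cross-entropy over the stacked similarities (the positive candidate sits at index 0) plus an element-wise sigmoid cross-entropy that pulls each raw similarity towards a bounded range. Below is a minimal, self-contained sketch of that combined loss; tensor names are illustrative and only two similarity terms are used instead of the five (`sim_pos`, `sim_neg_il`, `sim_neg_ll`, `sim_neg_ii`, `sim_neg_li`) that `DotProductLoss` concatenates.

```python
import tensorflow as tf


def constrained_cross_entropy(sim_pos: tf.Tensor, sim_neg: tf.Tensor) -> tf.Tensor:
    """Sketch: sim_pos is [batch, 1], sim_neg is [batch, num_neg]."""
    logits = tf.concat([sim_pos, sim_neg], axis=-1)

    # Softmax part: the correct label is always at index 0 of the logits.
    label_ids = tf.zeros_like(logits[..., 0], tf.int32)
    softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=label_ids, logits=logits
    )

    # Sigmoid part (what `constrain_similarities` toggles): treat each
    # similarity as an independent binary prediction, positive -> 1 and
    # negatives -> 0, so raw similarities saturate instead of growing freely.
    sigmoid_labels = tf.concat(
        [tf.ones_like(sim_pos), tf.zeros_like(sim_neg)], axis=-1
    )
    sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=sigmoid_labels, logits=logits
    )

    # Average over the batch, as the layer does after optional loss scaling.
    return tf.reduce_mean(softmax_loss + tf.reduce_mean(sigmoid_loss, axis=-1))
```

Dropping the sigmoid term recovers the old behaviour, which is what setting `constrain_similarities=False` does.
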
From 1d5527a33fccac83794d8c73dedb5f7026ddfd4b Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 13 Jan 2021 00:40:03 +0100 Subject: [PATCH 09/44] remove sim_neg_ii to run experiments --- rasa/nlu/classifiers/diet_classifier.py | 1 + rasa/nlu/test.py | 14 ++++++++++++++ rasa/utils/tensorflow/layers.py | 13 ++++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index b4931627620f..b82675866168 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1020,6 +1020,7 @@ def load( meta = train_utils.update_similarity_type(meta) meta = train_utils.update_loss_type(meta) + # meta[RELATIVE_CONFIDENCE] = True model = cls._load_model( entity_tag_specs, diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 837aec238855..e7167bcdfb45 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -332,6 +332,20 @@ def plot_attribute_confidences( if getattr(r, target_key) != getattr(r, prediction_key) ] + # import matplotlib.pyplot as plt + # + # plt.gcf().clear() + # + # fig = plt.hist(pos_hist) + # plt.title("Positive_sims") + # plt.savefig(f"{hist_filename.split('.')[0]}_pos.png") + # + # plt.gcf().clear() + # + # fig = plt.hist(neg_hist) + # plt.title("Negative_sims") + # plt.savefig(f"{hist_filename.split('.')[0]}_neg.png") + plot_utils.plot_histogram([pos_hist, neg_hist], title, hist_filename) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 18ad1ec38378..9d138d29e327 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -710,6 +710,12 @@ def _get_negs( return neg_embeds, bad_negs + def _compute_vector_length(self, embedding): + norm = tf.norm(embedding, axis=-1) + mean_norm = tf.reduce_mean(norm) + std_norm = tf.math.reduce_std(norm) + # tf.print("Norm", mean_norm, std_norm) + def _sample_negatives( self, inputs_embed: tf.Tensor, @@ -720,6 +726,10 @@ def _sample_negatives( ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: """Sample negative examples.""" + # self._compute_vector_length(inputs_embed) + # tf.print('---------------') + # self._compute_vector_length(labels_embed) + # tf.print('===============s') pos_inputs_embed = tf.expand_dims(inputs_embed, axis=-2) pos_labels_embed = tf.expand_dims(labels_embed, axis=-2) @@ -758,6 +768,7 @@ def confidence_from_sim(self, sim: tf.Tensor, similarity_type: Text) -> tf.Tenso Returns: Confidences corresponding to each similarity value. """ + # return sim if similarity_type == COSINE: # clip negative values to zero return tf.nn.relu(sim) @@ -905,7 +916,7 @@ def _loss_cross_entropy( # Constrain similarity values in a range by applying sigmoid # on them individually so that they saturate at extreme values. 
sigmoid_logits = tf.concat( - [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 + [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_li], axis=-1 ) sigmoid_labels = tf.concat( From 763c2cd23d3649fbffd9eea2bb08b4d05a5c7638 Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 25 Jan 2021 13:04:06 +0100 Subject: [PATCH 10/44] revert back experimental change --- rasa/nlu/classifiers/diet_classifier.py | 1 - rasa/nlu/test.py | 14 -------------- rasa/utils/tensorflow/layers.py | 13 +------------ 3 files changed, 1 insertion(+), 27 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index b82675866168..b4931627620f 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1020,7 +1020,6 @@ def load( meta = train_utils.update_similarity_type(meta) meta = train_utils.update_loss_type(meta) - # meta[RELATIVE_CONFIDENCE] = True model = cls._load_model( entity_tag_specs, diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index e7167bcdfb45..837aec238855 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -332,20 +332,6 @@ def plot_attribute_confidences( if getattr(r, target_key) != getattr(r, prediction_key) ] - # import matplotlib.pyplot as plt - # - # plt.gcf().clear() - # - # fig = plt.hist(pos_hist) - # plt.title("Positive_sims") - # plt.savefig(f"{hist_filename.split('.')[0]}_pos.png") - # - # plt.gcf().clear() - # - # fig = plt.hist(neg_hist) - # plt.title("Negative_sims") - # plt.savefig(f"{hist_filename.split('.')[0]}_neg.png") - plot_utils.plot_histogram([pos_hist, neg_hist], title, hist_filename) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 9d138d29e327..18ad1ec38378 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -710,12 +710,6 @@ def _get_negs( return neg_embeds, bad_negs - def _compute_vector_length(self, embedding): - norm = tf.norm(embedding, axis=-1) - mean_norm = tf.reduce_mean(norm) - std_norm = tf.math.reduce_std(norm) - # tf.print("Norm", mean_norm, std_norm) - def _sample_negatives( self, inputs_embed: tf.Tensor, @@ -726,10 +720,6 @@ def _sample_negatives( ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: """Sample negative examples.""" - # self._compute_vector_length(inputs_embed) - # tf.print('---------------') - # self._compute_vector_length(labels_embed) - # tf.print('===============s') pos_inputs_embed = tf.expand_dims(inputs_embed, axis=-2) pos_labels_embed = tf.expand_dims(labels_embed, axis=-2) @@ -768,7 +758,6 @@ def confidence_from_sim(self, sim: tf.Tensor, similarity_type: Text) -> tf.Tenso Returns: Confidences corresponding to each similarity value. """ - # return sim if similarity_type == COSINE: # clip negative values to zero return tf.nn.relu(sim) @@ -916,7 +905,7 @@ def _loss_cross_entropy( # Constrain similarity values in a range by applying sigmoid # on them individually so that they saturate at extreme values. sigmoid_logits = tf.concat( - [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_li], axis=-1 + [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 ) sigmoid_labels = tf.concat( From 2b9d532358f4ef3f3b6a81d43ab0b3dfea314134 Mon Sep 17 00:00:00 2001 From: Daksh Date: Sun, 31 Jan 2021 18:27:46 +0100 Subject: [PATCH 11/44] update similarity computation during prediction, to be tested. 
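
In terms of behaviour, the change this commit makes at prediction time can be summarised by the following sketch. The helper name and its standalone form are illustrative; inside the layer this logic lives next to `sim()` and reads the configured `model_confidence`.

```python
import tensorflow as tf


def confidence_from_embeddings(
    input_embed: tf.Tensor, label_embed: tf.Tensor, model_confidence: str = "softmax"
) -> tf.Tensor:
    """Sketch of the three `model_confidence` modes."""
    if model_confidence == "cosine":
        # Normalize to unit vectors so the dot product lands in [-1, 1].
        input_embed = tf.nn.l2_normalize(input_embed, axis=-1)
        label_embed = tf.nn.l2_normalize(label_embed, axis=-1)

    # Dot-product similarity, broadcast over all candidate labels.
    similarities = tf.reduce_sum(input_embed * label_embed, axis=-1)

    if model_confidence == "softmax":
        # Relative confidences over all labels, summing to 1.
        return tf.nn.softmax(similarities)
    # "cosine" returns values in [-1, 1]; "inner" returns unbounded values.
    return similarities
```

Note that in the diff below, the re-normalization applied after `ranking_length` filtering is kept only for the softmax case; cosine and inner scores are reported as they are.
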
--- rasa/core/policies/ted_policy.py | 12 +-- rasa/nlu/classifiers/diet_classifier.py | 21 +++-- rasa/nlu/selectors/response_selector.py | 12 ++- rasa/utils/plotting.py | 7 +- rasa/utils/tensorflow/constants.py | 2 +- rasa/utils/tensorflow/layers.py | 90 ++++++++++--------- rasa/utils/tensorflow/models.py | 4 +- rasa/utils/train_utils.py | 26 ++---- tests/nlu/classifiers/test_diet_classifier.py | 4 +- tests/utils/test_train_utils.py | 2 +- 10 files changed, 85 insertions(+), 95 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 8e67c4575219..4b2ee7ed8e32 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -103,7 +103,8 @@ FEATURIZERS, ENTITY_RECOGNITION, CONSTRAIN_SIMILARITIES, - RELATIVE_CONFIDENCE, + MODEL_CONFIDENCE, + SOFTMAX, ) from rasa.shared.core.events import EntitiesAdded, Event from rasa.shared.nlu.training_data.message import Message @@ -281,7 +282,7 @@ class TEDPolicy(Policy): # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, # Return softmax based probabilities during prediction. - RELATIVE_CONFIDENCE: True, + MODEL_CONFIDENCE: SOFTMAX, # Split entities by comma, this makes sense e.g. for a list of # ingredients in a recipe, but it doesn't make sense for the parts of # an address @@ -342,7 +343,6 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self.defaults, new_config ) rasa.utils.train_utils._check_confidence_setting(self.config) - rasa.utils.train_utils._check_similarity_confidence_setting(self.config) rasa.utils.train_utils._check_similarity_loss_setting(self.config) self.config = rasa.utils.train_utils.update_loss_type(self.config) self.config = rasa.utils.train_utils.update_similarity_type(self.config) @@ -632,14 +632,16 @@ def predict_action_probabilities( confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) if self.config[LOSS_TYPE] == CROSS_ENTROPY and self.config[RANKING_LENGTH] > 0: - confidence = rasa.utils.train_utils.sort_and_rank( + confidence = rasa.utils.train_utils.filter_top_k( confidence, self.config[RANKING_LENGTH] ) if ( self.config[SIMILARITY_TYPE] == INNER - and self.config[RELATIVE_CONFIDENCE] + and self.config[MODEL_CONFIDENCE] == SOFTMAX ): + # TODO: This should be removed in 3.0 when softmax as + # model confidence is completely deprecated. # Normalize the values if returned probabilities are from softmax. confidence = rasa.utils.train_utils.normalize(confidence) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index b4931627620f..3b909a469e1f 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -97,7 +97,8 @@ DENSE_DIMENSION, MASK, CONSTRAIN_SIMILARITIES, - RELATIVE_CONFIDENCE, + MODEL_CONFIDENCE, + SOFTMAX, ) logger = logging.getLogger(__name__) @@ -258,8 +259,8 @@ def required_components(cls) -> List[Type[Component]]: # If 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, - # Return softmax based probabilities during prediction. - RELATIVE_CONFIDENCE: True, + # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. 
+ MODEL_CONFIDENCE: SOFTMAX, } # init helpers @@ -300,7 +301,6 @@ def _check_config_parameters(self) -> None: self._check_share_hidden_layers_sizes() train_utils._check_confidence_setting(self.component_config) - train_utils._check_similarity_confidence_setting(self.component_config) train_utils._check_similarity_loss_setting(self.component_config) self.component_config = train_utils.update_loss_type(self.component_config) @@ -868,15 +868,17 @@ def _predict_label( self.component_config[LOSS_TYPE] == CROSS_ENTROPY and self.component_config[RANKING_LENGTH] > 0 ): - message_sim = train_utils.sort_and_rank( + message_sim = train_utils.filter_top_k( message_sim, self.component_config[RANKING_LENGTH] ) if ( self.component_config[SIMILARITY_TYPE] == INNER - and self.component_config[RELATIVE_CONFIDENCE] + and self.component_config[MODEL_CONFIDENCE] == SOFTMAX ): - # Normalize the values if returned probabilities are from + # TODO: This should be removed in 3.0 when softmax as + # model confidence is completely deprecated. + # Normalize the values if returned confidences are from # softmax(hence relative to each other). message_sim = train_utils.normalize(message_sim) @@ -1664,12 +1666,9 @@ def _batch_predict_intents( sentence_vector = self._last_token(text_transformed, sequence_lengths) sentence_vector_embed = self._tf_layers[f"embed.{TEXT}"](sentence_vector) - sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + scores = self._tf_layers[f"loss.{LABEL}"]._confidence_from_embeddings( sentence_vector_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], ) - scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( - sim_all, self.config[SIMILARITY_TYPE] - ) return {"i_scores": scores} diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index fa065eb7cdb7..841e66465628 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -74,7 +74,8 @@ CHECKPOINT_MODEL, DENSE_DIMENSION, CONSTRAIN_SIMILARITIES, - RELATIVE_CONFIDENCE, + MODEL_CONFIDENCE, + SOFTMAX, ) from rasa.nlu.constants import ( RESPONSE_SELECTOR_PROPERTY_NAME, @@ -235,8 +236,8 @@ def required_components(cls) -> List[Type[Component]]: # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, - # Return softmax based probabilities during prediction. - RELATIVE_CONFIDENCE: True, + # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. + MODEL_CONFIDENCE: SOFTMAX, } def __init__( @@ -749,13 +750,10 @@ def batch_predict( sentence_vector = self._last_token(text_transformed, sequence_lengths_text) sentence_vector_embed = self._tf_layers[f"embed.{TEXT}"](sentence_vector) - sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + scores = self._tf_layers[f"loss.{LABEL}"]._confidence_from_embeddings( sentence_vector_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], ) - scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( - sim_all, self.config[SIMILARITY_TYPE] - ) out["i_scores"] = scores return out diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py index 7eba3d6d0f7f..2de1769b3fd0 100644 --- a/rasa/utils/plotting.py +++ b/rasa/utils/plotting.py @@ -133,7 +133,8 @@ def plot_histogram( # Wine-ish colour for the confidences of hits. # Blue-ish colour for the confidences of misses. 
colors = ["#009292", "#920000"] - bins = [0.05 * i for i in range(1, 21)] + bins = [0.025 * i for i in range(1, 42)] + # bins = [1 * i for i in range(1, 31)] binned_data_sets = [np.histogram(d, bins=bins)[0] for d in hist_data] @@ -172,8 +173,8 @@ def plot_histogram( ) axes[1].set(title="Wrong") - axes[0].set(yticks=bins, xlim=(0, max_xlims[0]), ylim=(min_ylim, 1.0)) - axes[1].set(yticks=bins, xlim=(0, max_xlims[1]), ylim=(min_ylim, 1.0)) + # axes[0].set(yticks=bins, xlim=(0, max_xlims[0]), ylim=(min_ylim, 1.0)) + # axes[1].set(yticks=bins, xlim=(0, max_xlims[1]), ylim=(min_ylim, 1.0)) axes[0].invert_xaxis() axes[0].yaxis.tick_right() diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index f8371d5c6975..d43c85066b9e 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -53,7 +53,7 @@ DENSE_INPUT_DROPOUT = "use_dense_input_dropout" RANKING_LENGTH = "ranking_length" -RELATIVE_CONFIDENCE = "relative_confidence" +MODEL_CONFIDENCE = "model_confidence" BILOU_FLAG = "BILOU_flag" diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 18ad1ec38378..051353110fb3 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -275,13 +275,6 @@ def call( class Embed(tf.keras.layers.Layer): """Dense embedding layer. - Arguments: - embed_dim: Positive integer, dimensionality of the output space. - reg_lambda: Float; regularization factor. - layer_name_suffix: Text added to the name of the layers. - similarity_type: Optional type of similarity measure to use, - either 'cosine' or 'inner'. - Input shape: N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common situation would be @@ -294,20 +287,16 @@ class Embed(tf.keras.layers.Layer): """ def __init__( - self, - embed_dim: int, - reg_lambda: float, - layer_name_suffix: Text, - similarity_type: Optional[Text] = None, + self, embed_dim: int, reg_lambda: float, layer_name_suffix: Text, ) -> None: - super().__init__(name=f"embed_{layer_name_suffix}") + """Initialize layer. - self.similarity_type = similarity_type - if self.similarity_type and self.similarity_type not in {COSINE, INNER}: - raise ValueError( - f"Wrong similarity type '{self.similarity_type}', " - f"should be '{COSINE}' or '{INNER}'." - ) + Args: + embed_dim: Dimensionality of the output space. + reg_lambda: Regularization factor. + layer_name_suffix: Text added to the name of the layers. + """ + super().__init__(name=f"embed_{layer_name_suffix}") regularizer = tf.keras.regularizers.l2(reg_lambda) self._dense = tf.keras.layers.Dense( @@ -319,10 +308,8 @@ def __init__( # noinspection PyMethodOverriding def call(self, x: tf.Tensor) -> tf.Tensor: + """Apply dense layer.""" x = self._dense(x) - if self.similarity_type == COSINE: - x = tf.nn.l2_normalize(x, axis=-1) - return x @@ -562,8 +549,9 @@ def __init__( name: Optional[Text] = None, parallel_iterations: int = 1000, same_sampling: bool = False, + similarity_type: Optional[Text] = None, constrain_similarities: bool = True, - relative_confidence: bool = True, + model_confidence: bool = True, ) -> None: """Declare instance variables with default values. @@ -589,12 +577,13 @@ def __init__( to run in parallel. same_sampling: Boolean, if 'True' sample same negative labels for the whole batch. + similarity_type: Similarity measure to use, either 'cosine' or 'inner'. 
constrain_similarities: Boolean, if 'True' applies sigmoid on all similarity terms and adds to the loss function to ensure that similarity values are approximately bounded. Used inside _loss_cross_entropy() only. - relative_confidence: Boolean, if 'True' confidence is calculated by applying - softmax over similarities, else sigmoid is applied on individual similarities. + model_confidence: Model confidence to be returned during inference. + Possible values - softmax, cosine, inner. """ super().__init__(name=name) self.num_neg = num_neg @@ -607,7 +596,13 @@ def __init__( self.parallel_iterations = parallel_iterations self.same_sampling = same_sampling self.constrain_similarities = constrain_similarities - self.relative_confidence = relative_confidence + self.model_confidence = model_confidence + self.similarity_type = similarity_type + if self.similarity_type and self.similarity_type not in {COSINE, INNER}: + raise ValueError( + f"Wrong similarity type '{self.similarity_type}', " + f"should be '{COSINE}' or '{INNER}'." + ) @staticmethod def _make_flat(x: tf.Tensor) -> tf.Tensor: @@ -738,35 +733,44 @@ def _sample_negatives( labels_bad_negs, ) - @staticmethod - def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor: + def sim( + self, a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None + ) -> tf.Tensor: """Calculate similarity between given tensors.""" - + if self.similarity_type == COSINE: + a = tf.nn.l2_normalize(a, axis=-1) + b = tf.nn.l2_normalize(b, axis=-1) sim = tf.reduce_sum(a * b, axis=-1) if mask is not None: sim *= tf.expand_dims(mask, 2) return sim - def confidence_from_sim(self, sim: tf.Tensor, similarity_type: Text) -> tf.Tensor: - """Computes model confidence/probability from computed similarities. + def _confidence_from_embeddings( + self, input_embeddings: tf.Tensor, label_embeddings: tf.Tensor + ) -> tf.Tensor: + """Computes model's prediction confidences from input and label embeddings. + + First compute the similarity from embeddings and then apply an activation + function as needed. Args: - sim: Computed similarities - similarity_type: Similarity function to use - COSINE, INNER, AUTO. + input_embeddings: Embeddings of input + label_embeddings: Embeddings of labels Returns: - Confidences corresponding to each similarity value. + model confidence during prediction. """ - if similarity_type == COSINE: - # clip negative values to zero - return tf.nn.relu(sim) - elif self.relative_confidence: - # normalize result to [0, 1] with softmax - return tf.nn.softmax(sim) - - # In other cases convert each individual similarity to probability - return tf.nn.sigmoid(sim) + # If model's prediction confidence is configured to be cosine similarity, + # then normalize embeddings to unit vectors. 
+ if self.model_confidence == COSINE or self.similarity_type == COSINE: + input_embeddings = tf.nn.l2_normalize(input_embeddings, axis=-1) + label_embeddings = tf.nn.l2_normalize(label_embeddings, axis=-1) + + similarities = self.sim(input_embeddings, label_embeddings) + if self.model_confidence == SOFTMAX: + return tf.nn.softmax(similarities) + return similarities def _train_sim( self, diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index 1abe040732ee..fea4cca91b31 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -56,7 +56,7 @@ DROP_RATE_ATTENTION, SCALE_LOSS, CONSTRAIN_SIMILARITIES, - RELATIVE_CONFIDENCE, + MODEL_CONFIDENCE, ) from rasa.utils.tensorflow import layers from rasa.utils.tensorflow.transformer import TransformerEncoder @@ -793,7 +793,7 @@ def _prepare_dot_product_loss( # set to 1 to get deterministic behaviour parallel_iterations=1 if self.random_seed is not None else 1000, constrain_similarities=self.config[CONSTRAIN_SIMILARITIES], - relative_confidence=self.config[RELATIVE_CONFIDENCE], + model_confidence=self.config[MODEL_CONFIDENCE], ) def _prepare_sparse_dense_dropout_layers( diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e75407b734f7..3dd97c02ddf4 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -24,7 +24,7 @@ NUM_TRANSFORMER_LAYERS, DENSE_DIMENSION, CONSTRAIN_SIMILARITIES, - RELATIVE_CONFIDENCE, + MODEL_CONFIDENCE, ) from rasa.shared.nlu.constants import ( ACTION_NAME, @@ -57,8 +57,8 @@ def normalize(values: np.ndarray) -> np.ndarray: return new_values -def sort_and_rank(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: - """Sorts the values in descending order and keep only top `ranking_length` values. +def filter_top_k(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: + """Sorts the values in descending order and keeps only top `ranking_length` values. Other values will be set to 0. Args: @@ -386,29 +386,15 @@ def override_defaults( def _check_confidence_setting(component_config) -> None: - if component_config[RELATIVE_CONFIDENCE]: + if component_config[MODEL_CONFIDENCE] == SOFTMAX: rasa.shared.utils.io.raise_warning( - f"{RELATIVE_CONFIDENCE} is set to `True`. It is recommended " - f"to set it to `False`. It will be set to `False` by default " + f"{MODEL_CONFIDENCE} is set to `softmax`. It is recommended " + f"to set it to `cosine`. It will be set to `cosine` by default " f"Rasa Open Source 3.0 onwards.", category=UserWarning, ) -def _check_similarity_confidence_setting(component_config) -> None: - if ( - not component_config[CONSTRAIN_SIMILARITIES] - and not component_config[RELATIVE_CONFIDENCE] - ): - raise ValueError( - f"If {CONSTRAIN_SIMILARITIES} is set to False, " - f"{RELATIVE_CONFIDENCE} cannot be set to False as " - f"similarities need to be constrained during training " - f"time as well in order to correctly compute confidence values " - f"for each label at inference time." 
- ) - - def _check_similarity_loss_setting(component_config) -> None: if ( component_config[SIMILARITY_TYPE] == COSINE diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index be5f338b0f22..f6348f91c24e 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -30,7 +30,7 @@ BILOU_FLAG, ENTITY_RECOGNITION, INTENT_CLASSIFICATION, - RELATIVE_CONFIDENCE, + MODEL_CONFIDENCE, ) from rasa.nlu.components import ComponentBuilder from rasa.nlu.classifiers.diet_classifier import DIETClassifier @@ -315,7 +315,7 @@ async def test_softmax_normalization( @pytest.mark.parametrize( "classifier_params, output_length", - [({RANDOM_SEED: 42, EPOCHS: 1, RELATIVE_CONFIDENCE: False}, LABEL_RANKING_LENGTH)], + [({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: False}, LABEL_RANKING_LENGTH)], ) async def test_softmax_with_absolute_confidence( component_builder, diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 974966c4ebc0..548922e0d0ba 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -51,7 +51,7 @@ def test_normalize(): def test_sort_and_rank( input_values: List[float], ranking_length: int, output_values: List[float] ): - ranked_values = train_utils.sort_and_rank(np.array(input_values), ranking_length) + ranked_values = train_utils.filter_top_k(np.array(input_values), ranking_length) assert np.array_equal(ranked_values, output_values) From 28e8c2687f5b251b6fbac6f8599bdd453b988046 Mon Sep 17 00:00:00 2001 From: Daksh Date: Sun, 31 Jan 2021 22:01:44 +0100 Subject: [PATCH 12/44] update docs, test various options --- changelog/7616.improvement.md | 9 +++- docs/docs/components.mdx | 48 +++++++++++-------- docs/docs/migration-guide.mdx | 10 ++-- docs/docs/policies.mdx | 18 ++++--- rasa/core/policies/ted_policy.py | 2 +- rasa/utils/plotting.py | 36 +++++++++----- rasa/utils/tensorflow/layers.py | 8 ++-- rasa/utils/tensorflow/models.py | 2 +- rasa/utils/train_utils.py | 2 +- tests/nlu/classifiers/test_diet_classifier.py | 34 +++++++++---- 10 files changed, 110 insertions(+), 59 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index c1cdb8c8c966..6462f338d0db 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -2,7 +2,14 @@ Added sigmoid cross-entropy loss on all similarity values to constrain them to a This affects the default behaviour of the loss function(`loss_type=cross_entropy`) inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. If you notice a degradation in performance, set `constrain_similarities=False` in the respective ML component. +You may need to tune fallback confidence thresholds to adapt to this change. Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. -Also, added an option `relative_confidence` to each ML component. Contrary to the default behaviour, if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 but they will not sum up to 1. It is also recommended to set `relative_confidence=False` as it will be made default in Rasa Open Source 3.0. You may need to tune fallback confidence thresholds after making this change. \ No newline at end of file +Also, added an option `model_confidence` to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values - +1. 
`softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1. +2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label is in the range `[-1,1]`. +3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label in in an unbounded range. + +The default value is `softmax`, but we recommend using `cosine` as that will be the default value, Rasa Open Source 3.0 onwards. +The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 3a447293f1ed..b1cf35d4d99c 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -1620,13 +1620,19 @@ However, additional parameters exist that can be adapted. | | | it to the loss function to ensure that similarity values are | | | | approximately bounded. Used only when `loss_type=softmax` | +---------------------------------+------------------+--------------------------------------------------------------+ -| relative_confidence | True | If `True`, applies softmax on all similarity values for pairs| -| | | of input and labels. This means that output confidences | -| | | will always add up to 1. | -| | | If `False`, applies sigmoid on all similarity values for | -| | | pairs of input and labels. This means that confidence for | -| | | each label will be between 0 and 1 but all of them won't add | -| | | up to 1. | +| model_confidence | "softmax" | Affects how model's confidence for each intent | +| | | is computed. It can take three values - | +| | | 1. `softmax` - Similarities between input and intent | +| | | embeddings are post-processed with a softmax function, | +| | | as a result of which confidence for all intents sum up to 1. | +| | | 2. `cosine` - Cosine similarity between input and intent | +| | | embeddings. Confidence for each intent is in the | +| | | range `[-1,1]`. | +| | | 3. `inner` - Dot product similarity between input and intent | +| | | embeddings. Confidence for each intent is in an unbounded | +| | | range. | +| | | This parameter does not affect the confidence for entity | +| | | prediction. | +---------------------------------+------------------+--------------------------------------------------------------+ ``` @@ -2821,18 +2827,22 @@ However, additional parameters exist that can be adapted. | | | Requires `evaluate_on_number_of_examples > 0` and | | | | `evaluate_every_number_of_epochs > 0` | +---------------------------------+-------------------+--------------------------------------------------------------+ -| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | -| | | it to the loss function to ensure that similarity values are | -| | | approximately bounded. Used only when `loss_type=softmax` | -+---------------------------------+------------------+--------------------------------------------------------------+ -| relative_confidence | True | If `True`, applies softmax on all similarity values for pairs| -| | | of input and labels. This means that output confidences | -| | | will always add up to 1. | -| | | If `False`, applies sigmoid on all similarity values for | -| | | pairs of input and labels. This means that confidence for | -| | | each label will be between 0 and 1 but all of them won't add | -| | | up to 1. 
| -+---------------------------------+------------------+--------------------------------------------------------------+ +| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | +| | | it to the loss function to ensure that similarity values are | +| | | approximately bounded. Used only when `loss_type=softmax` | ++---------------------------------+------------------+---------------------------------------------------------------+ +| model_confidence | "softmax" | Affects how model's confidence for each response label | +| | | is computed. It can take three values - | +| | | 1. `softmax` - Similarities between input and response label | +| | | embeddings are post-processed with a softmax function, | +| | | as a result of which confidence for all labels sum up to 1. | +| | | 2. `cosine` - Cosine similarity between input and response | +| | | label embeddings. Confidence for each label is in the | +| | | range `[-1,1]`. | +| | | 3. `inner` - Dot product similarity between input and response| +| | | label embeddings. Confidence for each label is in an | +| | | unbounded range. | ++---------------------------------+------------------+---------------------------------------------------------------+ ``` :::note diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 2c34fdc904a5..00b4a7940469 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -20,11 +20,13 @@ components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include - The default loss function (`loss_type=cross_entropy`) adds a sigmoid cross-entropy loss of all similarity values to constrain them to an approximate range. If you notice a degradation in performance, set `constrain_similarities=False` in the respective ML component. -- Added an option `relative_confidence` to each ML component. Contrary to the default behaviour, -if `relative_confidence` is set to `False`, the confidences for each label will be between 0 and 1 -but they will not sum up to 1. It is also recommended to set `relative_confidence=False` as it will be made -default in Rasa Open Source 3.0. You may need to tune fallback confidence thresholds after making this change. +Also, a new option `model_confidence` has been added to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values - +1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1. +2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label is in the range `[-1,1]`. +3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label in in an unbounded range. +The default value is `softmax`, but we recommend using `cosine` as that will be the default value, Rasa Open Source 3.0 onwards. +The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. ## Rasa 2.1 to Rasa 2.2 diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx index 26a855e9a02f..5f6939a75b7f 100644 --- a/docs/docs/policies.mdx +++ b/docs/docs/policies.mdx @@ -345,13 +345,17 @@ However, additional parameters exist that can be adapted. | | | it to the loss function to ensure that similarity values are | | | | approximately bounded. 
Used only when `loss_type=softmax` | +---------------------------------------+------------------------+--------------------------------------------------------------+ -| relative_confidence | True | If `True`, applies softmax on all similarity values for pairs| -| | | of input and labels. This means that output confidences | -| | | will always add up to 1. | -| | | If `False`, applies sigmoid on all similarity values for | -| | | pairs of input and labels. This means that confidence for | -| | | each label will be between 0 and 1 but all of them won't add | -| | | up to 1. | +| model_confidence | "softmax" | Affects how model's confidence for each action | +| | | is computed. It can take three values - | +| | | 1. `softmax` - Similarities between input and action | +| | | embeddings are post-processed with a softmax function, | +| | | as a result of which confidence for all labels sum up to 1. | +| | | 2. `cosine` - Cosine similarity between input and action | +| | | embeddings. Confidence for each label is in the | +| | | range `[-1,1]`. | +| | | 3. `inner` - Dot product similarity between input and action | +| | | embeddings. Confidence for each label is in an | +| | | unbounded range. | +---------------------------------------+------------------------+--------------------------------------------------------------+ | split_entities_by_comma | True | Splits a list of extracted entities by comma to treat each | | | | one of them as a single entity. Can either be `True`/`False` | diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 4046a086d0d9..a51328824509 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -283,7 +283,7 @@ class TEDPolicy(Policy): # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: True, - # Return softmax based probabilities during prediction. + # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. MODEL_CONFIDENCE: SOFTMAX, # Split entities by comma, this makes sense e.g. for a list of # ingredients in a recipe, but it doesn't make sense for the parts of diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py index 2de1769b3fd0..c670059fbe40 100644 --- a/rasa/utils/plotting.py +++ b/rasa/utils/plotting.py @@ -133,22 +133,32 @@ def plot_histogram( # Wine-ish colour for the confidences of hits. # Blue-ish colour for the confidences of misses. 
colors = ["#009292", "#920000"] - bins = [0.025 * i for i in range(1, 42)] - # bins = [1 * i for i in range(1, 31)] + n_bins = 25 + max_value = max(max(hist_data[0]), max(hist_data[1])) + min_value = min(min(hist_data[0]), min(hist_data[1])) + bin_width = (max_value - min_value) / n_bins + bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)] binned_data_sets = [np.histogram(d, bins=bins)[0] for d in hist_data] max_xlims = [max(binned_data_set) for binned_data_set in binned_data_sets] max_xlims = [xlim + np.ceil(0.25 * xlim) for xlim in max_xlims] # padding - min_ylim = bins[ - min( - [ - (binned_data_set != 0).argmax(axis=0) - for binned_data_set in binned_data_sets - ] - ) - ] + min_ylim = ( + bins[ + min( + [ + (binned_data_set != 0).argmax(axis=0) + for binned_data_set in binned_data_sets + ] + ) + ] + - bin_width + ) + + max_ylim = max(bins) + bin_width + + yticks = [float("{:.2f}".format(x)) for x in bins] centers = 0.5 * (0.05 + (bins + np.roll(bins, 0))[:-1]) heights = 0.75 * np.diff(bins) @@ -173,14 +183,14 @@ def plot_histogram( ) axes[1].set(title="Wrong") - # axes[0].set(yticks=bins, xlim=(0, max_xlims[0]), ylim=(min_ylim, 1.0)) - # axes[1].set(yticks=bins, xlim=(0, max_xlims[1]), ylim=(min_ylim, 1.0)) + axes[0].set(yticks=yticks, xlim=(0, max_xlims[0]), ylim=(min_ylim, max_ylim)) + axes[1].set(yticks=yticks, xlim=(0, max_xlims[1]), ylim=(min_ylim, max_ylim)) axes[0].invert_xaxis() axes[0].yaxis.tick_right() fig.subplots_adjust( - wspace=0.14 + wspace=0.17 ) # get the graphs exactly far enough apart for yaxis labels fig.suptitle(title, fontsize="x-large", fontweight="bold") diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 95e02a7cfd8b..555de4eba27a 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -551,7 +551,7 @@ def __init__( same_sampling: bool = False, similarity_type: Optional[Text] = None, constrain_similarities: bool = True, - model_confidence: bool = True, + model_confidence: Text = SOFTMAX, ) -> None: """Declare instance variables with default values. @@ -716,18 +716,18 @@ def _confidence_from_embeddings( """Computes model's prediction confidences from input and label embeddings. First compute the similarity from embeddings and then apply an activation - function as needed. + function if needed. Args: input_embeddings: Embeddings of input label_embeddings: Embeddings of labels Returns: - model confidence during prediction. + model's prediction confidence """ # If model's prediction confidence is configured to be cosine similarity, # then normalize embeddings to unit vectors. 
-        if self.model_confidence == COSINE or self.similarity_type == COSINE:
+        if self.model_confidence == COSINE:
             input_embeddings = tf.nn.l2_normalize(input_embeddings, axis=-1)
             label_embeddings = tf.nn.l2_normalize(label_embeddings, axis=-1)
 
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index aa459ca26c9e..cfa5ad025333 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -732,7 +732,6 @@ def _prepare_embed_layers(self, name: Text, prefix: Text = "embed") -> None:
             self.config[EMBEDDING_DIMENSION],
             self.config[REGULARIZATION_CONSTANT],
             name,
-            self.config[SIMILARITY_TYPE],
         )
 
     def _prepare_ffnn_layer(
@@ -792,6 +791,7 @@ def _prepare_dot_product_loss(
             scale_loss,
             # set to 1 to get deterministic behaviour
             parallel_iterations=1 if self.random_seed is not None else 1000,
+            similarity_type=self.config[SIMILARITY_TYPE],
             constrain_similarities=self.config[CONSTRAIN_SIMILARITIES],
             model_confidence=self.config[MODEL_CONFIDENCE],
         )
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 3dd97c02ddf4..2d822ce859a7 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -389,7 +389,7 @@ def _check_confidence_setting(component_config) -> None:
     if component_config[MODEL_CONFIDENCE] == SOFTMAX:
         rasa.shared.utils.io.raise_warning(
             f"{MODEL_CONFIDENCE} is set to `softmax`. It is recommended "
-            f"to set it to `cosine`. It will be set to `cosine` by default "
+            f"to set it to `cosine`. It will be set to `cosine` by default, "
             f"Rasa Open Source 3.0 onwards.",
             category=UserWarning,
         )
diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py
index eb520804b798..2b2b4b6e2456 100644
--- a/tests/nlu/classifiers/test_diet_classifier.py
+++ b/tests/nlu/classifiers/test_diet_classifier.py
@@ -370,14 +370,30 @@ async def test_softmax_normalization(
 
 
 @pytest.mark.parametrize(
-    "classifier_params, output_length",
-    [({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: False}, LABEL_RANKING_LENGTH)],
+    "classifier_params, prediction_min, prediction_max, output_length",
+    [
+        # `parametrize` takes a single list of cases; one per model confidence.
+        (
+            {RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "cosine"},
+            -1,
+            1,
+            LABEL_RANKING_LENGTH,
+        ),
+        (
+            {RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "inner"},
+            -1e9,
+            1e9,
+            LABEL_RANKING_LENGTH,
+        ),
+    ],
 )
-async def test_softmax_with_absolute_confidence(
-    component_builder,
-    tmp_path,
-    classifier_params,
-    output_length,
+async def test_cross_entropy_without_normalization(
+    component_builder: ComponentBuilder,
+    tmp_path: Path,
+    classifier_params: Dict[Text, Any],
+    prediction_min: float,
+    prediction_max: float,
+    output_length: int,
     monkeypatch: MonkeyPatch,
 ):
     pipeline = as_pipeline(
@@ -408,7 +424,8 @@ async def test_softmax_with_absolute_confidence(
 
     # check each confidence is in range
     confidence_in_range = [
-        0.0 <= confidence <= 1.0 for confidence in intent_confidences
+        prediction_min <= confidence <= prediction_max
+        for confidence in intent_confidences
     ]
     assert all(confidence_in_range)
 
From 7eeb251224c7389005d1d575251526c297b23ef9 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Sun, 31 Jan 2021 22:04:03 +0100
Subject: [PATCH 13/44] assertive

---
 changelog/7616.improvement.md | 2 +-
 docs/docs/migration-guide.mdx | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md
index 6462f338d0db..8b6d23e06483 100644
--- a/changelog/7616.improvement.md
+++ b/changelog/7616.improvement.md
@@ -2,7 +2,7 @@ Added sigmoid
cross-entropy loss on all similarity values to constrain them to a This affects the default behaviour of the loss function(`loss_type=cross_entropy`) inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. If you notice a degradation in performance, set `constrain_similarities=False` in the respective ML component. -You may need to tune fallback confidence thresholds to adapt to this change. +You should tune fallback confidence thresholds to adapt to this change. Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 00b4a7940469..71ae99a4201e 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -28,6 +28,8 @@ Also, a new option `model_confidence` has been added to each ML component. It af The default value is `softmax`, but we recommend using `cosine` as that will be the default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. +You should tune fallback confidence thresholds to adapt to these changes. + ## Rasa 2.1 to Rasa 2.2 ### General From 0d175d46e2de491c9710d6712ee077c8b349d12b Mon Sep 17 00:00:00 2001 From: Daksh Date: Sun, 31 Jan 2021 23:23:02 +0100 Subject: [PATCH 14/44] fix plotting --- rasa/utils/plotting.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py index c670059fbe40..195060c631ee 100644 --- a/rasa/utils/plotting.py +++ b/rasa/utils/plotting.py @@ -134,8 +134,12 @@ def plot_histogram( # Blue-ish colour for the confidences of misses. colors = ["#009292", "#920000"] n_bins = 25 - max_value = max(max(hist_data[0]), max(hist_data[1])) - min_value = min(min(hist_data[0]), min(hist_data[1])) + max_value = max( + [max(hist_data[0], default=0), max(hist_data[1], default=0)], default=0 + ) + min_value = min( + [min(hist_data[0], default=0), min(hist_data[1], default=0)], default=0 + ) bin_width = (max_value - min_value) / n_bins bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)] From af82d2126df4c4aa17cfefbcbfe06457947ead7c Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 1 Feb 2021 13:44:08 +0100 Subject: [PATCH 15/44] fix ted, add line to migration --- docs/docs/migration-guide.mdx | 2 ++ rasa/core/policies/ted_policy.py | 7 +++--- rasa/nlu/classifiers/diet_classifier.py | 4 +++- rasa/nlu/selectors/response_selector.py | 4 +++- rasa/utils/tensorflow/layers.py | 29 +++++++++++++++---------- rasa/utils/train_utils.py | 7 +++--- 6 files changed, 32 insertions(+), 21 deletions(-) diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 71ae99a4201e..131083686334 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -29,6 +29,8 @@ The default value is `softmax`, but we recommend using `cosine` as that will be The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. You should tune fallback confidence thresholds to adapt to these changes. +To maintain the behaviour of older minor versions of Rasa Open Source 2.x, set `constrain_similarities=False` +and `model_confidence=softmax` to the respective ML component. 
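
To make the threshold advice above concrete, here is a small numeric illustration (the similarity values are invented) of the scales the three `model_confidence` modes produce:

```python
import numpy as np

# Invented raw dot-product similarities for three candidate labels.
inner = np.array([4.2, 1.3, 0.4])  # model_confidence=inner: unbounded

softmax = np.exp(inner) / np.exp(inner).sum()  # model_confidence=softmax
print(softmax.round(2))  # [0.93 0.05 0.02], sums to 1

cosine = np.array([0.62, 0.18, 0.05])  # model_confidence=cosine: each in [-1, 1]

# A fallback threshold such as 0.7, tuned against softmax confidences,
# would trigger on nearly every turn against these cosine scores,
# which is why thresholds need re-tuning after switching modes.
```
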
## Rasa 2.1 to Rasa 2.2 diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index a51328824509..cab3fe4e6367 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -1727,15 +1727,14 @@ def batch_predict( ) = self._embed_dialogue(dialogue_in, tf_batch_data) dialogue_mask = tf.squeeze(dialogue_mask, axis=-1) - sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + sim_all, scores = self._tf_layers[ + f"loss.{LABEL}" + ]._similarity_confidence_from_embeddings( dialogue_embed[:, :, tf.newaxis, :], self.all_labels_embed[tf.newaxis, tf.newaxis, :, :], dialogue_mask, ) - scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( - sim_all, self.config[SIMILARITY_TYPE] - ) predictions = { "action_scores": scores, "similarities": sim_all, diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index dc21066dfa94..c3ee0ec93fe5 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1683,7 +1683,9 @@ def _batch_predict_intents( sentence_vector = self._last_token(text_transformed, sequence_lengths) sentence_vector_embed = self._tf_layers[f"embed.{TEXT}"](sentence_vector) - scores = self._tf_layers[f"loss.{LABEL}"]._confidence_from_embeddings( + _, scores = self._tf_layers[ + f"loss.{LABEL}" + ]._similarity_confidence_from_embeddings( sentence_vector_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], ) diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 6ae739215a85..5bd814fb3877 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -763,7 +763,9 @@ def batch_predict( sentence_vector = self._last_token(text_transformed, sequence_lengths_text) sentence_vector_embed = self._tf_layers[f"embed.{TEXT}"](sentence_vector) - scores = self._tf_layers[f"loss.{LABEL}"]._confidence_from_embeddings( + _, scores = self._tf_layers[ + f"loss.{LABEL}" + ]._similarity_confidence_from_embeddings( sentence_vector_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], ) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 555de4eba27a..53e0d9179835 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -535,7 +535,7 @@ def f1_score( class DotProductLoss(tf.keras.layers.Layer): - """Dot-product loss layer""" + """Dot-product loss layer.""" def __init__( self, @@ -710,20 +710,24 @@ def sim( return sim - def _confidence_from_embeddings( - self, input_embeddings: tf.Tensor, label_embeddings: tf.Tensor - ) -> tf.Tensor: - """Computes model's prediction confidences from input and label embeddings. + def _similarity_confidence_from_embeddings( + self, + input_embeddings: tf.Tensor, + label_embeddings: tf.Tensor, + mask: Optional[tf.Tensor] = None, + ) -> Tuple[tf.Tensor, tf.Tensor]: + """Computes similarity between input and label embeddings and model's confidence. First compute the similarity from embeddings and then apply an activation - function if needed. + function if needed to get the confidence. Args: - input_embeddings: Embeddings of input - label_embeddings: Embeddings of labels + input_embeddings: Embeddings of input. + label_embeddings: Embeddings of labels. + mask: Mask over input and output sequence. Returns: - model's prediction confidence + similarity between input and label embeddings and model's prediction confidence for each label. 
""" # If model's prediction confidence is configured to be cosine similarity, # then normalize embeddings to unit vectors. @@ -731,10 +735,11 @@ def _confidence_from_embeddings( input_embeddings = tf.nn.l2_normalize(input_embeddings, axis=-1) label_embeddings = tf.nn.l2_normalize(label_embeddings, axis=-1) - similarities = self.sim(input_embeddings, label_embeddings) + similarities = self.sim(input_embeddings, label_embeddings, mask) + confidences = similarities if self.model_confidence == SOFTMAX: - return tf.nn.softmax(similarities) - return similarities + confidences = tf.nn.softmax(similarities) + return similarities, confidences def _train_sim( self, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 2d822ce859a7..99058492c07c 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -94,12 +94,13 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: def update_loss_type(config: Dict[Text, Any]) -> Dict[Text, Any]: - """ - If LOSS_TYPE is set to 'softmax', update it to 'cross_entropy' since former is deprecated. + """If LOSS_TYPE is set to 'softmax', update it to 'cross_entropy' since former is deprecated. + Args: config: model configuration - Returns: updated model configuration + Returns: + updated model configuration """ # TODO: Completely deprecate this with 3.0 if config.get(LOSS_TYPE) == SOFTMAX: From d11ab35e138b73cde188dff6fdadff1997076d19 Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 1 Feb 2021 18:17:57 +0100 Subject: [PATCH 16/44] dummy change to trigger tests --- changelog/7616.improvement.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index 8b6d23e06483..774622ba7c90 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -11,5 +11,4 @@ Also, added an option `model_confidence` to each ML component. It affects how mo 2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label is in the range `[-1,1]`. 3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label in in an unbounded range. -The default value is `softmax`, but we recommend using `cosine` as that will be the default value, Rasa Open Source 3.0 onwards. -The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. +The default value is `softmax`, but we recommend using `cosine` as that will be the default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. 
From 4cf0750f31cab94facac144cb68e14cfe5cc2d02 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Wed, 3 Feb 2021 18:46:49 +0100
Subject: [PATCH 17/44] add changes for autoconfig, defaults

---
 changelog/7616.improvement.md                 | 18 ++++---
 .../config_empty_en_after_dumping.yml         |  6 +++
 .../config_empty_en_after_dumping_core.yml    |  2 +
 .../config_empty_en_after_dumping_nlu.yml     |  4 ++
 .../config_empty_fr_after_dumping.yml         |  6 +++
 .../config_with_comments_after_dumping.yml    |  2 +
 docs/docs/components.mdx                      | 52 ++++++++++---------
 docs/docs/migration-guide.mdx                 | 17 +++---
 docs/docs/policies.mdx                        | 10 ++--
 rasa/core/policies/ted_policy.py              |  8 +--
 rasa/nlu/classifiers/diet_classifier.py       |  5 +-
 rasa/nlu/selectors/response_selector.py       |  4 +-
 rasa/shared/importers/default_config.yml      |  6 +++
 rasa/utils/train_utils.py                     | 13 +++++
 14 files changed, 100 insertions(+), 53 deletions(-)

diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md
index 774622ba7c90..91edd05a876b 100644
--- a/changelog/7616.improvement.md
+++ b/changelog/7616.improvement.md
@@ -1,14 +1,16 @@
-Added sigmoid cross-entropy loss on all similarity values to constrain them to an approximate range in `DotProductLoss`.
+Added an option `constrain_similarities` which adds a sigmoid cross-entropy loss on all similarity values to constrain them to an approximate range in `DotProductLoss`.
 
-This affects the default behaviour of the loss function (`loss_type=cross_entropy`) inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`.
-If you notice a degradation in performance, set `constrain_similarities=False` in the respective ML component.
-You should tune fallback confidence thresholds to adapt to this change.
+This affects the behaviour of the loss function (`loss_type=cross_entropy`) inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`.
+By default, the parameter is set to `False`, but users are encouraged to set it to `True` and re-train their assistants, as it will be set to `True` by default, Rasa Open Source 3.0 onwards.
+Once you re-train your assistant with this option set to `True`, you should also tune fallback confidence thresholds.
 
 Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead.
 
 Also, added an option `model_confidence` to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values -
 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1.
-2. `cosine` - Cosine similarity between input and label embeddings. Confidence for each label is in the range `[-1,1]`.
-3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label is in an unbounded range.
+2. `cosine` - Cosine similarity between input and label embeddings. Confidence for each label will be in the range `[-1,1]`.
+3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range.
 
-The default value is `softmax`, but we recommend using `cosine` as that will be the default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
+The default value is `softmax`, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards.
The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. +The default value is `softmax`, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. + +The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration. \ No newline at end of file diff --git a/data/test_config/config_empty_en_after_dumping.yml b/data/test_config/config_empty_en_after_dumping.yml index 20507a3944af..79c21d70c4a7 100644 --- a/data/test_config/config_empty_en_after_dumping.yml +++ b/data/test_config/config_empty_en_after_dumping.yml @@ -13,9 +13,13 @@ pipeline: # max_ngram: 4 # - name: DIETClassifier # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: EntitySynonymMapper # - name: ResponseSelector # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: FallbackClassifier # threshold: 0.3 # ambiguity_threshold: 0.1 @@ -27,4 +31,6 @@ policies: # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy diff --git a/data/test_config/config_empty_en_after_dumping_core.yml b/data/test_config/config_empty_en_after_dumping_core.yml index 1488270ddf39..adb3c2a0af55 100644 --- a/data/test_config/config_empty_en_after_dumping_core.yml +++ b/data/test_config/config_empty_en_after_dumping_core.yml @@ -8,4 +8,6 @@ policies: # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy diff --git a/data/test_config/config_empty_en_after_dumping_nlu.yml b/data/test_config/config_empty_en_after_dumping_nlu.yml index a4cb5077bf58..8249b17a0e11 100644 --- a/data/test_config/config_empty_en_after_dumping_nlu.yml +++ b/data/test_config/config_empty_en_after_dumping_nlu.yml @@ -13,9 +13,13 @@ pipeline: # max_ngram: 4 # - name: DIETClassifier # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: EntitySynonymMapper # - name: ResponseSelector # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: FallbackClassifier # threshold: 0.3 # ambiguity_threshold: 0.1 diff --git a/data/test_config/config_empty_fr_after_dumping.yml b/data/test_config/config_empty_fr_after_dumping.yml index 8148c3ebee68..a2ea89f4bf0a 100644 --- a/data/test_config/config_empty_fr_after_dumping.yml +++ b/data/test_config/config_empty_fr_after_dumping.yml @@ -13,9 +13,13 @@ pipeline: # max_ngram: 4 # - name: DIETClassifier # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: EntitySynonymMapper # - name: ResponseSelector # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: FallbackClassifier # threshold: 0.3 # ambiguity_threshold: 0.1 @@ -27,4 +31,6 @@ policies: # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy diff --git a/data/test_config/config_with_comments_after_dumping.yml b/data/test_config/config_with_comments_after_dumping.yml index 16b6129d18f9..ef0743f894de 100644 --- a/data/test_config/config_with_comments_after_dumping.yml +++ b/data/test_config/config_with_comments_after_dumping.yml @@ -27,6 
+27,8 @@ policies: # even here # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy # comments everywhere diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 747285d3e063..5e36a9c61dca 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -1531,10 +1531,12 @@ However, additional parameters exist that can be adapted. | similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | | | | or 'inner'. | +---------------------------------+------------------+--------------------------------------------------------------+ -| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | +| loss_type | "cross_entropy" | The type of the loss function, either 'cross_entropy' | +| | | or 'margin'. | +---------------------------------+------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top actions to normalize scores for loss type | -| | | 'softmax'. Set to 0 to turn off normalization. | +| ranking_length | 10 | Number of top intents to normalize scores for. Applicable | +| | | with loss type 'cross_entropy'. Set to 0 to disable | +| | | normalization. | +---------------------------------+------------------+--------------------------------------------------------------+ | maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | | | | embedding vectors for correct labels. | @@ -1616,9 +1618,9 @@ However, additional parameters exist that can be adapted. | | | ... | | | | ``` | +---------------------------------+------------------+--------------------------------------------------------------+ -| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | +| constrain_similarities | False | If `True`, applies sigmoid on all similarity terms and adds | | | | it to the loss function to ensure that similarity values are | -| | | approximately bounded. Used only when `loss_type=softmax` | +| | | approximately bounded. Used only if `loss_type=cross_entropy`| +---------------------------------+------------------+--------------------------------------------------------------+ | model_confidence | "softmax" | Affects how model's confidence for each intent | | | | is computed. It can take three values - | @@ -2760,10 +2762,12 @@ However, additional parameters exist that can be adapted. | similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | | | | or 'inner'. | +---------------------------------+-------------------+--------------------------------------------------------------+ -| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | +| loss_type | "cross_entropy" | The type of the loss function, either 'cross_entropy' | +| | | or 'margin'. | +---------------------------------+-------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top actions to normalize scores for loss type | -| | | 'softmax'. Set to 0 to turn off normalization. | +| ranking_length | 10 | Number of top responses to normalize scores for. Applicable | +| | | with loss type 'cross_entropy'. Set to 0 to disable | +| | | normalization. 
| +---------------------------------+-------------------+--------------------------------------------------------------+ | maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | | | | embedding vectors for correct labels. | @@ -2832,22 +2836,22 @@ However, additional parameters exist that can be adapted. | | | Requires `evaluate_on_number_of_examples > 0` and | | | | `evaluate_every_number_of_epochs > 0` | +---------------------------------+-------------------+--------------------------------------------------------------+ -| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | -| | | it to the loss function to ensure that similarity values are | -| | | approximately bounded. Used only when `loss_type=softmax` | -+---------------------------------+------------------+---------------------------------------------------------------+ -| model_confidence | "softmax" | Affects how model's confidence for each response label | -| | | is computed. It can take three values - | -| | | 1. `softmax` - Similarities between input and response label | -| | | embeddings are post-processed with a softmax function, | -| | | as a result of which confidence for all labels sum up to 1. | -| | | 2. `cosine` - Cosine similarity between input and response | -| | | label embeddings. Confidence for each label is in the | -| | | range `[-1,1]`. | -| | | 3. `inner` - Dot product similarity between input and response| -| | | label embeddings. Confidence for each label is in an | -| | | unbounded range. | -+---------------------------------+------------------+---------------------------------------------------------------+ +| constrain_similarities | False | If `True`, applies sigmoid on all similarity terms and adds | +| | | it to the loss function to ensure that similarity values are | +| | | approximately bounded. Used only if `loss_type=cross_entropy`| ++---------------------------------+-------------------+--------------------------------------------------------------+ +| model_confidence | "softmax" | Affects how model's confidence for each response label | +| | | is computed. It can take three values - | +| | | 1. `softmax` - Similarities between input and response label | +| | | embeddings are post-processed with a softmax function, | +| | | as a result of which confidence for all labels sum up to 1. | +| | | 2. `cosine` - Cosine similarity between input and response | +| | | label embeddings. Confidence for each label is in the | +| | | range `[-1,1]`. | +| | | 3. `inner` - Dot product similarity between input and | +| | | response label embeddings. Confidence for each label is in an| +| | | unbounded range. | ++---------------------------------+-------------------+--------------------------------------------------------------+ ``` :::note diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 131083686334..4b908a1193c5 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -14,23 +14,20 @@ how you can migrate from one version to another. ### Machine Learning Components -Few changes have been made to the default loss function inside machine learning (ML) +Few changes have been made to the loss function inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include: -- Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. 
-- The default loss function (`loss_type=cross_entropy`) adds a sigmoid cross-entropy loss of all similarity values to constrain -them to an approximate range. If you notice a degradation in performance, set `constrain_similarities=False` -in the respective ML component. +1. Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. +2. The default loss function (`loss_type=cross_entropy`) adds an optional sigmoid cross-entropy loss of all similarity values to constrain +them to an approximate range. You can turn on this option by setting `constrain_similarities=True`. Also, a new option `model_confidence` has been added to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values - 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1. -2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label is in the range `[-1,1]`. -3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label in in an unbounded range. -The default value is `softmax`, but we recommend using `cosine` as that will be the default value, Rasa Open Source 3.0 onwards. +2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label will be in the range `[-1,1]`. +3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range. +The default value is `softmax`, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. You should tune fallback confidence thresholds to adapt to these changes. -To maintain the behaviour of older minor versions of Rasa Open Source 2.x, set `constrain_similarities=False` -and `model_confidence=softmax` to the respective ML component. ## Rasa 2.1 to Rasa 2.2 diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx index ec6efa60c7e3..6255b3219aec 100644 --- a/docs/docs/policies.mdx +++ b/docs/docs/policies.mdx @@ -265,10 +265,12 @@ However, additional parameters exist that can be adapted. | similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | | | | or 'inner'. | +---------------------------------------+------------------------+--------------------------------------------------------------+ -| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | +| loss_type | "cross_entropy" | The type of the loss function, either 'cross_entropy' | +| | | or 'margin'. | +---------------------------------------+------------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top actions to normalize scores for loss type | -| | | 'softmax'. Set to 0 to turn off normalization. | +| ranking_length | 10 | Number of top responses to normalize scores for. Applicable | +| | | with loss type 'cross_entropy'. Set to 0 to disable | +| | | normalization. | +---------------------------------------+------------------------+--------------------------------------------------------------+ | maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | | | | embedding vectors for correct labels. 
| @@ -341,7 +343,7 @@ However, additional parameters exist that can be adapted. | entity_recognition | True | If 'True' entity recognition is trained and entities are | | | | extracted. | +---------------------------------------+------------------------+--------------------------------------------------------------+ -| constrain_similarities | True | If `True`, applies sigmoid on all similarity terms and adds | +| constrain_similarities | False | If `True`, applies sigmoid on all similarity terms and adds | | | | it to the loss function to ensure that similarity values are | | | | approximately bounded. Used only when `loss_type=softmax` | +---------------------------------------+------------------------+--------------------------------------------------------------+ diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 0d61d25997b1..b7d3d2b0f540 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -215,9 +215,9 @@ class TEDPolicy(Policy): NUM_NEG: 20, # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. SIMILARITY_TYPE: AUTO, - # The type of the loss function, either 'softmax' or 'margin'. + # The type of the loss function, either 'cross_entropy' or 'margin'. LOSS_TYPE: CROSS_ENTROPY, - # Number of top actions to normalize scores for loss type 'softmax'. + # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy'. # Set to 0 to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors @@ -282,7 +282,7 @@ class TEDPolicy(Policy): ENTITY_RECOGNITION: True, # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. - CONSTRAIN_SIMILARITIES: True, + CONSTRAIN_SIMILARITIES: False, # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. MODEL_CONFIDENCE: SOFTMAX, # 'BILOU_flag' determines whether to use BILOU tagging or not. @@ -349,6 +349,8 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self.config = rasa.utils.train_utils.override_defaults( self.defaults, new_config ) + + rasa.utils.train_utils._check_loss_setting(self.config) rasa.utils.train_utils._check_confidence_setting(self.config) rasa.utils.train_utils._check_similarity_loss_setting(self.config) self.config = rasa.utils.train_utils.update_loss_type(self.config) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 33eda2b13fcf..4d4b5f3b97d9 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -181,7 +181,7 @@ def required_components(cls) -> List[Type[Component]]: SIMILARITY_TYPE: AUTO, # The type of the loss function, either 'cross_entropy' or 'margin'. LOSS_TYPE: CROSS_ENTROPY, - # Number of top actions to normalize scores for loss type 'softmax'. + # Number of top intents to normalize scores for. Applicable with loss type 'cross_entropy'. # Set to 0 to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors @@ -251,7 +251,7 @@ def required_components(cls) -> List[Type[Component]]: SPLIT_ENTITIES_BY_COMMA: True, # If 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. 
- CONSTRAIN_SIMILARITIES: True, + CONSTRAIN_SIMILARITIES: False, # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. MODEL_CONFIDENCE: SOFTMAX, } @@ -293,6 +293,7 @@ def _check_config_parameters(self) -> None: self._check_masked_lm() self._check_share_hidden_layers_sizes() + train_utils._check_loss_setting(self.component_config) train_utils._check_confidence_setting(self.component_config) train_utils._check_similarity_loss_setting(self.component_config) diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index a10cf24437a6..e769c371fc67 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -176,7 +176,7 @@ def required_components(cls) -> List[Type[Component]]: SIMILARITY_TYPE: AUTO, # The type of the loss function, either 'cross_entropy' or 'margin'. LOSS_TYPE: CROSS_ENTROPY, - # Number of top actions to normalize scores for loss type 'softmax'. + # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy'. # Set to 0 to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors @@ -237,7 +237,7 @@ def required_components(cls) -> List[Type[Component]]: CHECKPOINT_MODEL: False, # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. - CONSTRAIN_SIMILARITIES: True, + CONSTRAIN_SIMILARITIES: False, # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. MODEL_CONFIDENCE: SOFTMAX, } diff --git a/rasa/shared/importers/default_config.yml b/rasa/shared/importers/default_config.yml index 95c9716b0d4e..63d10d9249ab 100644 --- a/rasa/shared/importers/default_config.yml +++ b/rasa/shared/importers/default_config.yml @@ -13,9 +13,13 @@ pipeline: max_ngram: 4 - name: DIETClassifier epochs: 100 + constrain_similarities: true + model_confidence: cosine - name: EntitySynonymMapper - name: ResponseSelector epochs: 100 + constrain_similarities: true + model_confidence: cosine - name: FallbackClassifier threshold: 0.3 ambiguity_threshold: 0.1 @@ -27,4 +31,6 @@ policies: - name: TEDPolicy max_history: 5 epochs: 100 + constrain_similarities: true + model_confidence: cosine - name: RulePolicy diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e6f09bf74f8b..253bc7bb07c4 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -396,6 +396,19 @@ def _check_confidence_setting(component_config) -> None: ) +def _check_loss_setting(component_config) -> None: + if not component_config[CONSTRAIN_SIMILARITIES] and component_config[LOSS_TYPE] in [ + SOFTMAX, + CROSS_ENTROPY, + ]: + rasa.shared.utils.io.raise_warning( + f"{CONSTRAIN_SIMILARITIES} is set to `False`. It is recommended " + f"to set it to `True` when using cross-entropy loss. 
It will be set to `True` by default, " + f"Rasa Open Source 3.0 onwards.", + category=UserWarning, + ) + + def _check_similarity_loss_setting(component_config) -> None: if ( component_config[SIMILARITY_TYPE] == COSINE From 827dc2bcb104b94a97811971b8a13d71629fe889 Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 4 Feb 2021 12:35:19 +0100 Subject: [PATCH 18/44] fix test --- tests/nlu/classifiers/test_diet_classifier.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index 2b2b4b6e2456..bb0c2aa13cb0 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -377,15 +377,13 @@ async def test_softmax_normalization( -1, 1, LABEL_RANKING_LENGTH, - ) - ], - [ + ), ( {RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "inner"}, -1e9, 1e9, LABEL_RANKING_LENGTH, - ) + ), ], ) async def test_cross_entropy_without_normalization( From 6e44c2fc2cdc1c44af6b1554854cb328c188a21c Mon Sep 17 00:00:00 2001 From: Daksh Varshneya Date: Fri, 5 Feb 2021 13:20:37 +0100 Subject: [PATCH 19/44] Apply suggestions from code review Co-authored-by: Tobias Wochinger Co-authored-by: Vladimir Vlasov --- docs/docs/migration-guide.mdx | 2 +- rasa/core/policies/ted_policy.py | 2 +- rasa/utils/tensorflow/layers.py | 4 ++-- rasa/utils/train_utils.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 4b908a1193c5..15fbaca57dc7 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -16,7 +16,7 @@ how you can migrate from one version to another. Few changes have been made to the loss function inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include: -1. Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. +1. Configuration option `loss_type=softmax` is now deprecated and will be removed in Rasa Open Source 3.0. Use `loss_type=cross_entropy` instead. 2. The default loss function (`loss_type=cross_entropy`) adds an optional sigmoid cross-entropy loss of all similarity values to constrain them to an approximate range. You can turn on this option by setting `constrain_similarities=True`. diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 9b567aad47aa..19eb4eca337d 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -283,7 +283,7 @@ class TEDPolicy(Policy): # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: False, - # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. + # Model confidence to be returned during inference. Possible values - 'softmax', 'cosine', 'inner'. MODEL_CONFIDENCE: SOFTMAX, # 'BILOU_flag' determines whether to use BILOU tagging or not. 
# If set to 'True' labelling is more rigorous, however more diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 53e0d9179835..ac2649ee60da 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -287,7 +287,7 @@ class Embed(tf.keras.layers.Layer): """ def __init__( - self, embed_dim: int, reg_lambda: float, layer_name_suffix: Text, + self, embed_dim: int, reg_lambda: float, layer_name_suffix: Text ) -> None: """Initialize layer. @@ -852,7 +852,7 @@ def _loss_cross_entropy( sim_neg_li: tf.Tensor, mask: Optional[tf.Tensor], ) -> tf.Tensor: - """Define cross entropy loss.""" + """Defines cross entropy loss.""" # Similarity terms between input and label should be optimized relative # to each other and hence use them as logits for softmax term softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 253bc7bb07c4..9c64b8d34b09 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -392,7 +392,7 @@ def _check_confidence_setting(component_config) -> None: f"{MODEL_CONFIDENCE} is set to `softmax`. It is recommended " f"to set it to `cosine`. It will be set to `cosine` by default, " f"Rasa Open Source 3.0 onwards.", - category=UserWarning, + category=FutureWarning, ) @@ -405,7 +405,7 @@ def _check_loss_setting(component_config) -> None: f"{CONSTRAIN_SIMILARITIES} is set to `False`. It is recommended " f"to set it to `True` when using cross-entropy loss. It will be set to `True` by default, " f"Rasa Open Source 3.0 onwards.", - category=UserWarning, + category=FutureWarning, ) @@ -423,7 +423,7 @@ def _check_similarity_loss_setting(component_config) -> None: f"Ideally use `{SIMILARITY_TYPE}={INNER}`" f" and `{LOSS_TYPE}={CROSS_ENTROPY}` or" f"`{SIMILARITY_TYPE}={COSINE}` and `{LOSS_TYPE}={MARGIN}`.", - category=UserWarning, + category=FutureWarning, ) From f5d26e72970e4cfbe30f81dfe948072d0e831f55 Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 5 Feb 2021 13:21:53 +0100 Subject: [PATCH 20/44] remove parallel iter and complex op --- changelog/7616.improvement.md | 4 ++-- rasa/core/policies/ted_policy.py | 2 +- rasa/utils/tensorflow/layers.py | 14 ++++---------- rasa/utils/tensorflow/models.py | 2 -- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index 91edd05a876b..6020823cbd49 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -1,12 +1,12 @@ Added an option `constrain_similarities` which adds sigmoid cross-entropy loss on all similarity values to constrain them to an approximate range in `DotProductLoss`. -This affects the behaviour of the loss function(`loss_type=cross_entropy`) inside machine learning (ML) components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. +This affects the behaviour of the loss function(`loss_type=cross_entropy`) inside machine learning (ML) components - [DIETClassifier](components.mdx#dietclassifier), [ResponseSelector](components.mdx#dietclassifier) and [TEDPolicy](policies.mdx#ted-policy). By default, the parameter is set to `False` but users are encouraged to set it to `True` and re-train their assistants as it will be set to `True` by default, Rasa Open Source 3.0 onwards. Once you re-train your assistant with this option set to `True`, you should also tune fallback confidence thresholds. Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. 
-Also, added an option `model_confidence` to each ML component. It affects how m mponent. It affects how model's confidence for each label is computed during inference. It can take three values - +Also, added an option `model_confidence` to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values - 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1. 2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label will be in the range `[-1,1]`. 3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range. diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 9b567aad47aa..dea768b2189f 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -283,7 +283,7 @@ class TEDPolicy(Policy): # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. CONSTRAIN_SIMILARITIES: False, - # Model confidence to be returned during inference. Possible values - softmax, cosine, inner. + # Model confidence to be returned during inference. Possible values - 'softmax', 'cosine' and 'inner'. MODEL_CONFIDENCE: SOFTMAX, # 'BILOU_flag' determines whether to use BILOU tagging or not. # If set to 'True' labelling is more rigorous, however more diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 53e0d9179835..04683c7c96a8 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -287,7 +287,7 @@ class Embed(tf.keras.layers.Layer): """ def __init__( - self, embed_dim: int, reg_lambda: float, layer_name_suffix: Text, + self, embed_dim: int, reg_lambda: float, layer_name_suffix: Text ) -> None: """Initialize layer. @@ -547,7 +547,6 @@ def __init__( neg_lambda: float, scale_loss: bool, name: Optional[Text] = None, - parallel_iterations: int = 1000, same_sampling: bool = False, similarity_type: Optional[Text] = None, constrain_similarities: bool = True, @@ -573,8 +572,6 @@ def __init__( scale_loss: Boolean, if 'True' scale loss inverse proportionally to the confidence of the correct prediction. name: Optional name of the layer. - parallel_iterations: Positive integer, the number of iterations allowed - to run in parallel. same_sampling: Boolean, if 'True' sample same negative labels for the whole batch. similarity_type: Similarity measure to use, either 'cosine' or 'inner'. 
@@ -593,7 +590,6 @@ def __init__( self.use_max_sim_neg = use_max_sim_neg self.neg_lambda = neg_lambda self.scale_loss = scale_loss - self.parallel_iterations = parallel_iterations self.same_sampling = same_sampling self.constrain_similarities = constrain_similarities self.model_confidence = model_confidence @@ -852,7 +848,7 @@ def _loss_cross_entropy( sim_neg_li: tf.Tensor, mask: Optional[tf.Tensor], ) -> tf.Tensor: - """Define cross entropy loss.""" + """Defines cross entropy loss.""" # Similarity terms between input and label should be optimized relative # to each other and hence use them as logits for softmax term softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1) @@ -883,10 +879,8 @@ def _loss_cross_entropy( sigmoid_labels = tf.concat( [ - tf.expand_dims( - tf.ones_like(sigmoid_logits[..., 0], tf.float32), -1 - ), - tf.zeros_like(sigmoid_logits[..., 1:], tf.float32), + tf.ones_like(sigmoid_logits[..., :1]), + tf.zeros_like(sigmoid_logits[..., 1:]), ], axis=-1, ) diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index cfa5ad025333..5b4ae15c427f 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -789,8 +789,6 @@ def _prepare_dot_product_loss( self.config[USE_MAX_NEG_SIM], self.config[NEGATIVE_MARGIN_SCALE], scale_loss, - # set to 1 to get deterministic behaviour - parallel_iterations=1 if self.random_seed is not None else 1000, similarity_type=self.config[SIMILARITY_TYPE], constrain_similarities=self.config[CONSTRAIN_SIMILARITIES], model_confidence=self.config[MODEL_CONFIDENCE], From a5286eb044c94be2911d75c3ca1963e3fd2911bc Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 5 Feb 2021 17:24:02 +0100 Subject: [PATCH 21/44] more review comments --- changelog/7616.improvement.md | 26 +++++++++++------- docs/docs/components.mdx | 8 +++--- docs/docs/migration-guide.mdx | 13 +++++++-- docs/docs/policies.mdx | 6 ++--- rasa/core/policies/ted_policy.py | 22 +++++++-------- rasa/nlu/classifiers/diet_classifier.py | 16 ++++------- rasa/utils/train_utils.py | 36 ++++++------------------- 7 files changed, 58 insertions(+), 69 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index 6020823cbd49..1c84680fad6a 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -1,16 +1,24 @@ -Added an option `constrain_similarities` which adds sigmoid cross-entropy loss on all similarity values to constrain them to an approximate range in `DotProductLoss`. +Added two new parameters `constrain_similarities` and `model_confidence` to machine learning (ML) components - [DIETClassifier](components.mdx#dietclassifier), [ResponseSelector](components.mdx#dietclassifier) and [TEDPolicy](policies.mdx#ted-policy). -This affects the behaviour of the loss function(`loss_type=cross_entropy`) inside machine learning (ML) components - [DIETClassifier](components.mdx#dietclassifier), [ResponseSelector](components.mdx#dietclassifier) and [TEDPolicy](policies.mdx#ted-policy). -By default, the parameter is set to `False` but users are encouraged to set it to `True` and re-train their assistants as it will be set to `True` by default, Rasa Open Source 3.0 onwards. -Once you re-train your assistant with this option set to `True`, you should also tune fallback confidence thresholds. +Setting `constrain_similarities=True` adds a sigmoid cross-entropy loss on all similarity values to restrict them to an approximate range in `DotProductLoss`. 
This should help the models to perform better on real world test sets. +By default, the parameter is set to `False` to preserve the old behaviour but users are encouraged to set it to `True` and re-train their assistants as it will be set to `True` by default, Rasa Open Source 3.0 onwards. -Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. - -Also, added an option `model_confidence` to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values - +Parameter `model_confidence` affects how model's confidence for each label is computed during inference. It can take three values - 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1. 2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label will be in the range `[-1,1]`. 3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range. -The default value is `softmax`, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. +Setting `model_confidence=cosine` should help users tune the fallback thresholds of their assistant better. The default value is `softmax` to preserve the old behaviour, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`. + +With both the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as - +``` +- name: DIETClassifier + model_confidence: cosine + constrain_similarities: True + ... +``` +Once the assistant is re-trained with the above configuration, users should also tune fallback confidence thresholds. + +Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. -The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration. \ No newline at end of file +The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration. The config would look like this - \ No newline at end of file diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 5e36a9c61dca..2025f126c127 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -1535,8 +1535,8 @@ However, additional parameters exist that can be adapted. | | | or 'margin'. | +---------------------------------+------------------+--------------------------------------------------------------+ | ranking_length | 10 | Number of top intents to normalize scores for. Applicable | -| | | with loss type 'cross_entropy'. Set to 0 to disable | -| | | normalization. | +| | | only with loss type 'cross_entropy' and 'softmax' | +| | | confidences. Set to 0 to disable normalization. 
|
+---------------------------------+------------------+--------------------------------------------------------------+
| maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make |
| | | embedding vectors for correct labels. |
@@ -2766,8 +2766,8 @@ However, additional parameters exist that can be adapted.
| | | or 'margin'. |
+---------------------------------+-------------------+--------------------------------------------------------------+
| ranking_length | 10 | Number of top responses to normalize scores for. Applicable |
-| | | with loss type 'cross_entropy'. Set to 0 to disable |
-| | | normalization. |
+| | | only with loss type 'cross_entropy' and 'softmax' |
+| | | confidences. Set to 0 to disable normalization. |
+---------------------------------+-------------------+--------------------------------------------------------------+
| maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make |
| | | embedding vectors for correct labels. |
diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx
index 15fbaca57dc7..9d5551e10410 100644
--- a/docs/docs/migration-guide.mdx
+++ b/docs/docs/migration-guide.mdx
@@ -18,7 +18,7 @@ Few changes have been made to the loss function inside machine learning (ML)
components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include:
1. Configuration option `loss_type=softmax` is now deprecated and will be removed in Rasa Open Source 3.0. Use `loss_type=cross_entropy` instead.
2. The default loss function (`loss_type=cross_entropy`) adds an optional sigmoid cross-entropy loss of all similarity values to constrain
-them to an approximate range. You can turn on this option by setting `constrain_similarities=True`.
+them to an approximate range. You can turn on this option by setting `constrain_similarities=True`. This should help the models to perform better on real world test sets.
Also, a new option `model_confidence` has been added to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values -
1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1.
@@ -27,7 +27,16 @@ Also, a new option `model_confidence` has been added to each ML component. It af
The default value is `softmax`, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards.
The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
-You should tune fallback confidence thresholds to adapt to these changes.
+With both the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as -
+```
+- name: DIETClassifier
+  model_confidence: cosine
+  constrain_similarities: True
+  ...
+```
+Once the assistant is re-trained with the above configuration, users should also tune fallback confidence thresholds.
+
+Configuration option `loss_type=softmax` is also deprecated in all ML components. Use `loss_type=cross_entropy` instead.
## Rasa 2.1 to Rasa 2.2
diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx
index db2651335bb1..2b57bdaeb1b6 100644
--- a/docs/docs/policies.mdx
+++ b/docs/docs/policies.mdx
@@ -271,9 +271,9 @@ However, additional parameters exist that can be adapted.
| loss_type | "cross_entropy" | The type of the loss function, either 'cross_entropy' |
| | | or 'margin'.
| +---------------------------------------+------------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top responses to normalize scores for. Applicable | -| | | with loss type 'cross_entropy'. Set to 0 to disable | -| | | normalization. | +| ranking_length | 10 | Number of top actions to normalize scores for. Applicable | +| | | only with loss type 'cross_entropy' and 'softmax' | +| | | confidences. Set to 0 to disable normalization. | +---------------------------------------+------------------------+--------------------------------------------------------------+ | maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | | | | embedding vectors for correct labels. | diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index a3811e611173..f00d838852b3 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -620,20 +620,18 @@ def predict_action_probabilities( # take correct prediction from batch confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) - if self.config[LOSS_TYPE] == CROSS_ENTROPY and self.config[RANKING_LENGTH] > 0: - confidence = rasa.utils.train_utils.filter_top_k( - confidence, self.config[RANKING_LENGTH] + if ( + self.config[LOSS_TYPE] == CROSS_ENTROPY + and self.config[RANKING_LENGTH] > 0 + and self.config[SIMILARITY_TYPE] == INNER + and self.config[MODEL_CONFIDENCE] == SOFTMAX + ): + # TODO: This should be removed in 3.0 when softmax as + # model confidence and normalization is completely deprecated. + confidences = rasa.utils.train_utils.normalize( + confidences, self.config[RANKING_LENGTH] ) - if ( - self.config[SIMILARITY_TYPE] == INNER - and self.config[MODEL_CONFIDENCE] == SOFTMAX - ): - # TODO: This should be removed in 3.0 when softmax as - # model confidence is completely deprecated. - # Normalize the values if returned probabilities are from softmax. - confidence = rasa.utils.train_utils.normalize(confidence) - optional_events = self._create_optional_event_for_entities( output, is_e2e_prediction, interpreter, tracker ) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 4d4b5f3b97d9..5e42078eaf5c 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -867,21 +867,15 @@ def _predict_label( if ( self.component_config[LOSS_TYPE] == CROSS_ENTROPY and self.component_config[RANKING_LENGTH] > 0 + and self.component_config[SIMILARITY_TYPE] == INNER + and self.component_config[MODEL_CONFIDENCE] == SOFTMAX ): - message_sim = train_utils.filter_top_k( + # TODO: This should be removed in 3.0 when softmax as + # model confidence and normalization is completely deprecated. + message_sim = train_utils.normalize( message_sim, self.component_config[RANKING_LENGTH] ) - if ( - self.component_config[SIMILARITY_TYPE] == INNER - and self.component_config[MODEL_CONFIDENCE] == SOFTMAX - ): - # TODO: This should be removed in 3.0 when softmax as - # model confidence is completely deprecated. - # Normalize the values if returned confidences are from - # softmax(hence relative to each other). 
- message_sim = train_utils.normalize(message_sim) - message_sim[::-1].sort() message_sim = message_sim.tolist() diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 9c64b8d34b09..8728aeb2e363 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -40,38 +40,18 @@ from rasa.nlu.tokenizers.tokenizer import Token -def normalize(values: np.ndarray) -> np.ndarray: +def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: """Normalizes an array of positive numbers over the top `ranking_length` values. - - Args: - values: Values to normalize - - Returns: - Normalized values. - """ - new_values = values.copy() - - if np.sum(new_values) > 0: - new_values = new_values / np.sum(new_values) - - return new_values - - -def filter_top_k(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: - """Sorts the values in descending order and keeps only top `ranking_length` values. - Other values will be set to 0. - Args: - values: Values to sort and rank - ranking_length: number of values to maintain above 0. - - Returns: - Modified values. """ new_values = values.copy() # prevent mutation of the input if 0 < ranking_length < len(new_values): ranked = sorted(new_values, reverse=True) new_values[new_values < ranked[ranking_length - 1]] = 0 + + if np.sum(new_values) > 0: + new_values = new_values / np.sum(new_values) + return new_values @@ -386,7 +366,7 @@ def override_defaults( return config -def _check_confidence_setting(component_config) -> None: +def _check_confidence_setting(component_config: Dict[Text, Any]) -> None: if component_config[MODEL_CONFIDENCE] == SOFTMAX: rasa.shared.utils.io.raise_warning( f"{MODEL_CONFIDENCE} is set to `softmax`. It is recommended " @@ -396,7 +376,7 @@ def _check_confidence_setting(component_config) -> None: ) -def _check_loss_setting(component_config) -> None: +def _check_loss_setting(component_config: Dict[Text, Any]) -> None: if not component_config[CONSTRAIN_SIMILARITIES] and component_config[LOSS_TYPE] in [ SOFTMAX, CROSS_ENTROPY, @@ -409,7 +389,7 @@ def _check_loss_setting(component_config) -> None: ) -def _check_similarity_loss_setting(component_config) -> None: +def _check_similarity_loss_setting(component_config: Dict[Text, Any]) -> None: if ( component_config[SIMILARITY_TYPE] == COSINE and component_config[LOSS_TYPE] == CROSS_ENTROPY From bdadebfd01ca3fd01ec337f6156768b57052053f Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 5 Feb 2021 17:47:11 +0100 Subject: [PATCH 22/44] fix tests --- changelog/7616.improvement.md | 4 ++-- rasa/core/policies/ted_policy.py | 4 ++-- rasa/nlu/classifiers/diet_classifier.py | 4 ++-- rasa/nlu/selectors/response_selector.py | 4 ++-- tests/utils/test_train_utils.py | 21 +++++++-------------- 5 files changed, 15 insertions(+), 22 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index 1c84680fad6a..63be53e61c5f 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -1,7 +1,7 @@ Added two new parameters `constrain_similarities` and `model_confidence` to machine learning (ML) components - [DIETClassifier](components.mdx#dietclassifier), [ResponseSelector](components.mdx#dietclassifier) and [TEDPolicy](policies.mdx#ted-policy). Setting `constrain_similarities=True` adds a sigmoid cross-entropy loss on all similarity values to restrict them to an approximate range in `DotProductLoss`. This should help the models to perform better on real world test sets. 
-By default, the parameter is set to `False` to preserve the old behaviour but users are encouraged to set it to `True` and re-train their assistants as it will be set to `True` by default, Rasa Open Source 3.0 onwards. +By default, the parameter is set to `False` to preserve the old behaviour, but users are encouraged to set it to `True` and re-train their assistants as it will be set to `True` by default, Rasa Open Source 3.0 onwards. Parameter `model_confidence` affects how model's confidence for each label is computed during inference. It can take three values - 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1. @@ -21,4 +21,4 @@ Once the assistant is re-trained with the above configuration, users should also Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead. -The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration. The config would look like this - \ No newline at end of file +The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration. \ No newline at end of file diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index f00d838852b3..e8081dd9ae0c 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -218,8 +218,8 @@ class TEDPolicy(Policy): SIMILARITY_TYPE: AUTO, # The type of the loss function, either 'cross_entropy' or 'margin'. LOSS_TYPE: CROSS_ENTROPY, - # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy'. - # Set to 0 to turn off normalization. + # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy' + # and 'softmax' confidences. Set to 0 to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors # for correct labels. diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 5e42078eaf5c..314d03fac680 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -181,8 +181,8 @@ def required_components(cls) -> List[Type[Component]]: SIMILARITY_TYPE: AUTO, # The type of the loss function, either 'cross_entropy' or 'margin'. LOSS_TYPE: CROSS_ENTROPY, - # Number of top intents to normalize scores for. Applicable with loss type 'cross_entropy'. - # Set to 0 to turn off normalization. + # Number of top intents to normalize scores for. Applicable with loss type 'cross_entropy' + # and 'softmax' confidences. Set to 0 to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors # for correct labels. diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index e769c371fc67..6f099f38df8d 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -176,8 +176,8 @@ def required_components(cls) -> List[Type[Component]]: SIMILARITY_TYPE: AUTO, # The type of the loss function, either 'cross_entropy' or 'margin'. LOSS_TYPE: CROSS_ENTROPY, - # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy'. 
- # Set to 0 to turn off normalization. + # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy' + # and 'softmax' confidences. Set to 0 to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors # for correct labels. diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 548922e0d0ba..74dccd2ad5df 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -36,23 +36,16 @@ def test_align_token_features(): assert np.all(actual_features[0][4] == np.mean(token_features[0][5:10], axis=0)) -def test_normalize(): - input_values = [0.7, 0.1, 0.1] - normalized_values = train_utils.normalize(np.array(input_values)) - assert np.allclose( - normalized_values, np.array([0.77777778, 0.11111111, 0.11111111]), atol=1e-5 - ) - - @pytest.mark.parametrize( "input_values, ranking_length, output_values", - [([0.5, 0.8, 0.1], 2, [0.5, 0.8, 0.0]), ([0.5, 0.3, 0.9], 5, [0.5, 0.3, 0.9]),], + [ + ([0.2, 0.7, 0.1], 2, [0.2222222, 0.77777778, 0.0]), + ([0.1, 0.7, 0.1], 5, [0.11111111, 0.77777778, 0.11111111]), + ], ) -def test_sort_and_rank( - input_values: List[float], ranking_length: int, output_values: List[float] -): - ranked_values = train_utils.filter_top_k(np.array(input_values), ranking_length) - assert np.array_equal(ranked_values, output_values) +def test_normalize(input_values, ranking_length, output_values): + normalized_values = train_utils.normalize(np.array(input_values), ranking_length) + assert np.allclose(normalized_values, np.array(output_values), atol=1e-5) @pytest.mark.parametrize( From 3a3b0f3cd0b7835435a8f805566aa8a8e96e7a94 Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 5 Feb 2021 19:10:33 +0100 Subject: [PATCH 23/44] add conditions --- rasa/core/policies/ted_policy.py | 16 +++++---------- rasa/nlu/classifiers/diet_classifier.py | 8 ++------ rasa/utils/train_utils.py | 26 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index e8081dd9ae0c..875fa79529f3 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -346,9 +346,8 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self.defaults, new_config ) - rasa.utils.train_utils._check_loss_setting(self.config) - rasa.utils.train_utils._check_confidence_setting(self.config) - rasa.utils.train_utils._check_similarity_loss_setting(self.config) + rasa.utils.train_utils.validate_configuration_settings(self.config) + self.config = rasa.utils.train_utils.update_loss_type(self.config) self.config = rasa.utils.train_utils.update_similarity_type(self.config) self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config) @@ -620,16 +619,11 @@ def predict_action_probabilities( # take correct prediction from batch confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) - if ( - self.config[LOSS_TYPE] == CROSS_ENTROPY - and self.config[RANKING_LENGTH] > 0 - and self.config[SIMILARITY_TYPE] == INNER - and self.config[MODEL_CONFIDENCE] == SOFTMAX - ): + if self.config[RANKING_LENGTH] > 0 and self.config[MODEL_CONFIDENCE] == SOFTMAX: # TODO: This should be removed in 3.0 when softmax as # model confidence and normalization is completely deprecated. 
From 3a3b0f3cd0b7835435a8f805566aa8a8e96e7a94 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Fri, 5 Feb 2021 19:10:33 +0100
Subject: [PATCH 23/44] add conditions

---
 rasa/core/policies/ted_policy.py        | 16 +++++----------
 rasa/nlu/classifiers/diet_classifier.py |  8 ++------
 rasa/utils/train_utils.py               | 26 +++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index e8081dd9ae0c..875fa79529f3 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -346,9 +346,8 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
             self.defaults, new_config
         )
 
-        rasa.utils.train_utils._check_loss_setting(self.config)
-        rasa.utils.train_utils._check_confidence_setting(self.config)
-        rasa.utils.train_utils._check_similarity_loss_setting(self.config)
+        rasa.utils.train_utils.validate_configuration_settings(self.config)
+
         self.config = rasa.utils.train_utils.update_loss_type(self.config)
         self.config = rasa.utils.train_utils.update_similarity_type(self.config)
         self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config)
@@ -620,16 +619,11 @@ def predict_action_probabilities(
         # take correct prediction from batch
         confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities)
 
-        if (
-            self.config[LOSS_TYPE] == CROSS_ENTROPY
-            and self.config[RANKING_LENGTH] > 0
-            and self.config[SIMILARITY_TYPE] == INNER
-            and self.config[MODEL_CONFIDENCE] == SOFTMAX
-        ):
+        if self.config[RANKING_LENGTH] > 0 and self.config[MODEL_CONFIDENCE] == SOFTMAX:
             # TODO: This should be removed in 3.0 when softmax as
             # model confidence and normalization is completely deprecated.
-            confidences = rasa.utils.train_utils.normalize(
-                confidences, self.config[RANKING_LENGTH]
+            confidence = rasa.utils.train_utils.normalize(
+                confidence, self.config[RANKING_LENGTH]
             )
 
         optional_events = self._create_optional_event_for_entities(
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 314d03fac680..20f1f2348731 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -293,9 +293,7 @@ def _check_config_parameters(self) -> None:
         self._check_masked_lm()
         self._check_share_hidden_layers_sizes()
 
-        train_utils._check_loss_setting(self.component_config)
-        train_utils._check_confidence_setting(self.component_config)
-        train_utils._check_similarity_loss_setting(self.component_config)
+        train_utils.validate_configuration_settings(self.component_config)
 
         self.component_config = train_utils.update_loss_type(self.component_config)
 
@@ -865,9 +863,7 @@ def _predict_label(
         label_ids = message_sim.argsort()[::-1]
 
         if (
-            self.component_config[LOSS_TYPE] == CROSS_ENTROPY
-            and self.component_config[RANKING_LENGTH] > 0
-            and self.component_config[SIMILARITY_TYPE] == INNER
+            self.component_config[RANKING_LENGTH] > 0
             and self.component_config[MODEL_CONFIDENCE] == SOFTMAX
         ):
             # TODO: This should be removed in 3.0 when softmax as
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 8728aeb2e363..eee26f9002d2 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -34,6 +34,7 @@
 )
 from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS
 from rasa.core.constants import DIALOGUE
+from rasa.shared.exceptions import InvalidConfigException
 
 if TYPE_CHECKING:
     from rasa.nlu.extractors.extractor import EntityTagSpec
@@ -366,6 +367,17 @@ def override_defaults(
     return config
 
 
+def validate_configuration_settings(component_config: Dict[Text, Any]) -> None:
+    """Validates that the combination of parameters in the configuration is correctly set.
+
+    Args:
+        component_config: Configuration to validate.
+    """
+    _check_loss_setting(component_config)
+    _check_confidence_setting(component_config)
+    _check_similarity_loss_setting(component_config)
+
+
 def _check_confidence_setting(component_config: Dict[Text, Any]) -> None:
     if component_config[MODEL_CONFIDENCE] == SOFTMAX:
         rasa.shared.utils.io.raise_warning(
@@ -374,6 +386,20 @@ def _check_confidence_setting(component_config: Dict[Text, Any]) -> None:
             f"Rasa Open Source 3.0 onwards.",
             category=FutureWarning,
         )
+        if component_config[LOSS_TYPE] not in [SOFTMAX, CROSS_ENTROPY]:
+            raise InvalidConfigException(
+                f"{LOSS_TYPE}={component_config[LOSS_TYPE]} and "
+                f"{MODEL_CONFIDENCE}={SOFTMAX} is not a valid "
+                f"combination. You can use {MODEL_CONFIDENCE}={SOFTMAX} "
+                f"only with {LOSS_TYPE}={CROSS_ENTROPY}."
+            )
+        if component_config[SIMILARITY_TYPE] != INNER:
+            raise InvalidConfigException(
+                f"{SIMILARITY_TYPE}={component_config[SIMILARITY_TYPE]} and "
+                f"{MODEL_CONFIDENCE}={SOFTMAX} is not a valid "
+                f"combination. You can use {MODEL_CONFIDENCE}={SOFTMAX} "
+                f"only with {SIMILARITY_TYPE}={INNER}."
+            )
 
 
 def _check_loss_setting(component_config: Dict[Text, Any]) -> None:
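The new guards reject option combinations that cannot work together as early as possible. A condensed, standalone sketch of the `_check_confidence_setting` logic added above, with plain string keys instead of the constants module and a stand-in exception class:

```python
from typing import Any, Dict

class InvalidConfigException(Exception):
    """Stand-in for rasa.shared.exceptions.InvalidConfigException."""

def check_confidence_setting(config: Dict[str, Any]) -> None:
    # 'softmax' confidences are only meaningful for a cross-entropy loss
    # over inner-product similarities; anything else is rejected early.
    if config["model_confidence"] == "softmax":
        if config["loss_type"] not in ("softmax", "cross_entropy"):
            raise InvalidConfigException(
                "model_confidence=softmax requires loss_type=cross_entropy."
            )
        if config["similarity_type"] != "inner":
            raise InvalidConfigException(
                "model_confidence=softmax requires similarity_type=inner."
            )

try:
    check_confidence_setting(
        {"model_confidence": "softmax", "loss_type": "margin", "similarity_type": "inner"}
    )
except InvalidConfigException as e:
    print(f"rejected: {e}")
```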
From cf27ec4d953b53ea8e200d8eb401152878ad4d66 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Sun, 7 Feb 2021 17:48:10 +0100
Subject: [PATCH 24/44] add tests for diet and ted

---
 rasa/core/policies/ted_policy.py        |   2 +
 rasa/nlu/classifiers/diet_classifier.py |   4 ++
 rasa/utils/tensorflow/layers.py         |   5 +-
 rasa/utils/train_utils.py               |  24 ++++-
 tests/nlu/selectors/test_selectors.py   | 126 ++++++++++++++++++++++++
 5 files changed, 158 insertions(+), 3 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 875fa79529f3..f75f489c1bf3 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -346,6 +346,8 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
             self.defaults, new_config
         )
 
+        self.config = rasa.utils.train_utils.update_confidence_type(self.config)
+
         rasa.utils.train_utils.validate_configuration_settings(self.config)
 
         self.config = rasa.utils.train_utils.update_loss_type(self.config)
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 20f1f2348731..de140e6565e8 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -293,6 +293,10 @@ def _check_config_parameters(self) -> None:
         self._check_masked_lm()
         self._check_share_hidden_layers_sizes()
 
+        self.component_config = train_utils.update_confidence_type(
+            self.component_config
+        )
+
         train_utils.validate_configuration_settings(self.component_config)
 
         self.component_config = train_utils.update_loss_type(self.component_config)
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 04683c7c96a8..354e7430eadd 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -12,6 +12,7 @@
     INNER,
     CROSS_ENTROPY,
 )
+from rasa.shared.exceptions import RasaException
 
 logger = logging.getLogger(__name__)
 
@@ -595,7 +596,7 @@ def __init__(
         self.model_confidence = model_confidence
         self.similarity_type = similarity_type
         if self.similarity_type and self.similarity_type not in {COSINE, INNER}:
-            raise ValueError(
+            raise RasaException(
                 f"Wrong similarity type '{self.similarity_type}', "
                 f"should be '{COSINE}' or '{INNER}'."
             )
@@ -918,7 +919,7 @@ def _chosen_loss(self) -> Callable:
         elif self.loss_type == CROSS_ENTROPY:
             return self._loss_cross_entropy
         else:
-            raise ValueError(
+            raise RasaException(
                 f"Wrong loss type '{self.loss_type}', "
                 f"should be '{MARGIN}' or '{CROSS_ENTROPY}'"
             )
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index eee26f9002d2..1a31b1eb69fd 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -367,6 +367,28 @@ def override_defaults(
     return config
 
 
+def update_confidence_type(component_config: Dict[Text, Any]) -> Dict[Text, Any]:
+    """Set model confidence to cosine if margin loss is used.
+
+    Args:
+        component_config: model configuration
+
+    Returns:
+        updated model configuration
+    """
+    # TODO: Remove this once model_confidence is set to cosine by default.
+    if (
+        component_config[LOSS_TYPE] == MARGIN
+        and component_config[MODEL_CONFIDENCE] == SOFTMAX
+    ):
+        rasa.shared.utils.io.raise_warning(
+            f"Overriding defaults by setting {MODEL_CONFIDENCE} to "
+            f"{COSINE} as {LOSS_TYPE} is set to {MARGIN} in the configuration."
+        )
+        component_config[MODEL_CONFIDENCE] = COSINE
+    return component_config
+
+
 def validate_configuration_settings(component_config: Dict[Text, Any]) -> None:
     """Validates that the combination of parameters in the configuration is correctly set.
 
@@ -393,7 +415,7 @@ def _check_confidence_setting(component_config: Dict[Text, Any]) -> None:
             f"combination. You can use {MODEL_CONFIDENCE}={SOFTMAX} "
             f"only with {LOSS_TYPE}={CROSS_ENTROPY}."
         )
-        if component_config[SIMILARITY_TYPE] != INNER:
+        if component_config[SIMILARITY_TYPE] not in [INNER, AUTO]:
             raise InvalidConfigException(
                 f"{SIMILARITY_TYPE}={component_config[SIMILARITY_TYPE]} and "
                 f"{MODEL_CONFIDENCE}={SOFTMAX} is not a valid "
diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index 6d5c4aabea9d..c7b1748a8b21 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -3,6 +3,8 @@
 import pytest
 import numpy as np
 from typing import List, Dict, Text, Any
+from mock import Mock
+from _pytest.monkeypatch import MonkeyPatch
 
 import rasa.model
 from rasa.nlu import train
@@ -19,12 +21,20 @@
     EVAL_NUM_EPOCHS,
     EVAL_NUM_EXAMPLES,
     CHECKPOINT_MODEL,
+    MODEL_CONFIDENCE,
+    RANDOM_SEED,
+    RANKING_LENGTH,
+    LOSS_TYPE,
 )
+from rasa.utils import train_utils
+from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.shared.nlu.constants import TEXT
 from rasa.shared.constants import DIAGNOSTIC_DATA
 from rasa.nlu.selectors.response_selector import ResponseSelector
 from rasa.shared.nlu.training_data.message import Message
 from rasa.shared.nlu.training_data.training_data import TrainingData
+from tests.nlu.classifiers.test_diet_classifier import as_pipeline
+from tests.conftest import DEFAULT_NLU_DATA
 
 
 @pytest.mark.parametrize(
@@ -315,3 +325,119 @@ async def test_process_gives_diagnostic_data(trained_response_selector_bot: Path
         assert "attention_weights" in diagnostic_data[name]
         # By default, ResponseSelector has `number_of_transformer_layers = 0`
         assert diagnostic_data[name].get("attention_weights") is None
+
+
+@pytest.mark.parametrize(
+    "classifier_params, prediction_min, prediction_max, output_length",
+    [
+        ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "cosine"}, -1, 1, 9,),
+        ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "inner"}, -1e9, 1e9, 9,),
+    ],
+)
+async def test_cross_entropy_without_normalization(
+    component_builder: ComponentBuilder,
+    tmp_path: Path,
+    classifier_params: Dict[Text, Any],
+    prediction_min: float,
+    prediction_max: float,
+    output_length: int,
+    monkeypatch: MonkeyPatch,
+):
+    pipeline = as_pipeline(
+        "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector"
+    )
+    assert pipeline[2]["name"] == "ResponseSelector"
+    pipeline[2].update(classifier_params)
+
+    _config = RasaNLUModelConfig({"pipeline": pipeline})
+    (trained_model, _, persisted_path) = await train(
+        _config,
+        path=str(tmp_path),
+        data="data/test_selectors",
+        component_builder=component_builder,
+    )
+    loaded = Interpreter.load(persisted_path, component_builder)
+
+    mock = Mock()
+    monkeypatch.setattr(train_utils, "normalize", mock.normalize)
+
+    parse_data = loaded.parse("hello")
+    response_ranking = parse_data.get("response_selector").get("default").get("ranking")
+
+    # check that the output was correctly truncated
+    assert len(response_ranking) == output_length
+
+    response_confidences = [response.get("confidence") for response in response_ranking]
+
+    # check each confidence is in range
+    confidence_in_range = [
+        prediction_min <= confidence <= prediction_max
+        for confidence in response_confidences
+    ]
+    assert all(confidence_in_range)
+
+    # normalize shouldn't have been called
+    mock.normalize.assert_not_called()
+
+
+@pytest.mark.parametrize(
+    "classifier_params", [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 1})],
+)
+async def test_margin_loss_is_not_normalized(
+    monkeypatch, component_builder, tmpdir, classifier_params
+):
+    pipeline = as_pipeline(
+        "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector"
+    )
+    assert pipeline[2]["name"] == "ResponseSelector"
+    pipeline[2].update(classifier_params)
+
+    mock = Mock()
+    monkeypatch.setattr(train_utils, "normalize", mock.normalize)
+
+    _config = RasaNLUModelConfig({"pipeline": pipeline})
+    (trained_model, _, persisted_path) = await train(
+        _config,
+        path=str(tmpdir),
+        data="data/test_selectors",
+        component_builder=component_builder,
+    )
+    loaded = Interpreter.load(persisted_path, component_builder)
+
+    parse_data = loaded.parse("hello")
+    response_ranking = parse_data.get("response_selector").get("default").get("ranking")
+
+    # check that the output was not normalized
+    mock.normalize.assert_not_called()
+
+    # check that the output was correctly truncated
+    assert len(response_ranking) == 9
+
+
+@pytest.mark.parametrize(
+    "classifier_params, data_path, output_length",
+    [
+        ({RANDOM_SEED: 42, EPOCHS: 2}, "data/test_selectors", 2),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 2}, "data/test_selectors", 2),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 1, EPOCHS: 2}, "data/test_selectors", 1),
+    ],
+)
+async def test_softmax_ranking(
+    component_builder, tmp_path, classifier_params, data_path, output_length,
+):
+    pipeline = as_pipeline(
+        "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector"
+    )
+    assert pipeline[2]["name"] == "ResponseSelector"
+    pipeline[2].update(classifier_params)
+
+    _config = RasaNLUModelConfig({"pipeline": pipeline})
+    (trained_model, _, persisted_path) = await train(
+        _config, path=str(tmp_path), data=data_path, component_builder=component_builder
+    )
+    loaded = Interpreter.load(persisted_path, component_builder)
+
+    parse_data = loaded.parse("hello")
+    response_ranking = parse_data.get("response_selector").get("default").get("ranking")
+    # check that the output was correctly truncated after normalization
+    assert len(response_ranking) == output_length

From 476e59820e36e7988865de909b7a67e78ad53e0c Mon Sep 17 00:00:00 2001
From: Daksh
Date: Sun, 7 Feb 2021 17:55:59 +0100
Subject: [PATCH 25/44] add types

---
 tests/nlu/selectors/test_selectors.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index c7b1748a8b21..0f0be42ab7e6 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -384,7 +384,10 @@
     "classifier_params", [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 1})],
 )
 async def test_margin_loss_is_not_normalized(
-    monkeypatch, component_builder, tmpdir, classifier_params
+    monkeypatch: MonkeyPatch,
+    component_builder: ComponentBuilder,
+    tmpdir: Path,
+    classifier_params: Dict[Text, int],
 ):
     pipeline = as_pipeline(
         "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector"
@@ -417,13 +420,17 @@ async def test_margin_loss_is_not_normalized(
 @pytest.mark.parametrize(
     "classifier_params, data_path, output_length",
     [
-        ({RANDOM_SEED: 42, EPOCHS: 2}, "data/test_selectors", 2),
-        ({RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 2}, "data/test_selectors", 2),
-        ({RANDOM_SEED: 42, RANKING_LENGTH: 1, EPOCHS: 2}, "data/test_selectors", 1),
+        ({RANDOM_SEED: 42, EPOCHS: 2}, "data/test_selectors", 9),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 2}, "data/test_selectors", 9),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 2, EPOCHS: 2}, "data/test_selectors", 2),
     ],
 )
 async def test_softmax_ranking(
-    component_builder, tmp_path, classifier_params, data_path, output_length,
+    component_builder: ComponentBuilder,
+    tmp_path: Path,
+    classifier_params: Dict[Text, int],
+    data_path: Text,
+    output_length: int,
 ):
     pipeline = as_pipeline(
         "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector"
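The tests above rely on one pattern throughout: swap `train_utils.normalize` for a `Mock` via pytest's `monkeypatch` fixture, run the code path, and then assert whether the mock ever ran. A self-contained illustration of the idiom, where `mymodule` and its functions are hypothetical stand-ins:

```python
from unittest.mock import Mock

import mymodule  # hypothetical module whose code calls mymodule.normalize

def test_normalize_is_not_called(monkeypatch):
    mock = Mock()
    # every call to mymodule.normalize now lands on the Mock instead
    monkeypatch.setattr(mymodule, "normalize", mock.normalize)

    mymodule.predict()  # hypothetical code path that must not normalize

    mock.normalize.assert_not_called()  # fails loudly if normalize ran
```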
From 3d554e390a492b993906ea192da59274e7194976 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Sun, 7 Feb 2021 18:07:20 +0100
Subject: [PATCH 26/44] added tests for TED

---
 tests/core/policies/test_ted_policy.py | 98 ++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/tests/core/policies/test_ted_policy.py b/tests/core/policies/test_ted_policy.py
index ea790d422127..bae7475be18c 100644
--- a/tests/core/policies/test_ted_policy.py
+++ b/tests/core/policies/test_ted_policy.py
@@ -32,8 +32,12 @@
     SCALE_LOSS,
     SIMILARITY_TYPE,
     VALUE_RELATIVE_ATTENTION,
+    MODEL_CONFIDENCE,
+    COSINE,
+    INNER,
 )
 from tests.core.test_policies import PolicyTestCollection
+from rasa.shared.constants import DEFAULT_SENDER_ID
 
 UTTER_GREET_ACTION = "utter_greet"
 GREET_INTENT_NAME = "greet"
@@ -330,6 +334,100 @@ def test_normalization(
         mock.normalize.assert_not_called()
 
 
+class TestTEDPolicyCosineConfidence(TestTEDPolicy):
+    def create_policy(
+        self, featurizer: Optional[TrackerFeaturizer], priority: int
+    ) -> Policy:
+        return TEDPolicy(
+            featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: COSINE}
+        )
+
+    def test_normalization(
+        self,
+        trained_policy: Policy,
+        tracker: DialogueStateTracker,
+        default_domain: Domain,
+        monkeypatch: MonkeyPatch,
+    ):
+        # first check the output is what we expect
+        predicted_probabilities = trained_policy.predict_action_probabilities(
+            tracker, default_domain, RegexInterpreter()
+        ).probabilities
+        # there should be no normalization
+        confidence_in_range = [
+            -1 <= confidence <= 1 for confidence in predicted_probabilities
+        ]
+        assert all(confidence_in_range)
+
+        # also check our function is not called
+        mock = Mock()
+        monkeypatch.setattr(train_utils, "normalize", mock.normalize)
+        trained_policy.predict_action_probabilities(
+            tracker, default_domain, RegexInterpreter()
+        )
+
+        mock.normalize.assert_not_called()
+
+    def test_prediction_on_empty_tracker(
+        self, trained_policy: Policy, default_domain: Domain
+    ):
+        tracker = DialogueStateTracker(DEFAULT_SENDER_ID, default_domain.slots)
+        prediction = trained_policy.predict_action_probabilities(
+            tracker, default_domain, RegexInterpreter()
+        )
+        assert not prediction.is_end_to_end_prediction
+        assert len(prediction.probabilities) == default_domain.num_actions
+        assert max(prediction.probabilities) <= 1.0
+        assert min(prediction.probabilities) >= -1.0
+
+
+class TestTEDPolicyInnerConfidence(TestTEDPolicy):
+    def create_policy(
+        self, featurizer: Optional[TrackerFeaturizer], priority: int
+    ) -> Policy:
+        return TEDPolicy(
+            featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: INNER}
+        )
+
+    def test_normalization(
+        self,
+        trained_policy: Policy,
+        tracker: DialogueStateTracker,
+        default_domain: Domain,
+        monkeypatch: MonkeyPatch,
+    ):
+        # first check the output is what we expect
+        predicted_probabilities = trained_policy.predict_action_probabilities(
+            tracker, default_domain, RegexInterpreter()
+        ).probabilities
+        # there should be no normalization
+        confidence_in_range = [
+            -1e9 <= confidence <= 1e9 for confidence in predicted_probabilities
+        ]
+        assert all(confidence_in_range)
+
+        # also check our function is not called
+        mock = Mock()
+        monkeypatch.setattr(train_utils, "normalize", mock.normalize)
+        trained_policy.predict_action_probabilities(
+            tracker, default_domain, RegexInterpreter()
+        )
+
+        mock.normalize.assert_not_called()
+
+    def test_prediction_on_empty_tracker(
+        self, trained_policy: Policy, default_domain: Domain
+    ):
+        tracker = DialogueStateTracker(DEFAULT_SENDER_ID, default_domain.slots)
+        prediction = trained_policy.predict_action_probabilities(
+            tracker, default_domain, RegexInterpreter()
+        )
+        assert not prediction.is_end_to_end_prediction
+        assert len(prediction.probabilities) == default_domain.num_actions
+        assert max(prediction.probabilities) <= 1e9
+        assert min(prediction.probabilities) >= -1e9
+
+
 class TestTEDPolicyLowRankingLength(TestTEDPolicy):
     def create_policy(
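Each new test class above reruns the entire inherited TED suite under a different configuration by overriding only `create_policy`. The pattern in miniature, with hypothetical names:

```python
class TestDefault:
    def create_policy(self):
        return {"model_confidence": "softmax"}  # stand-in for a real policy object

    def test_has_confidence(self):
        assert "model_confidence" in self.create_policy()

class TestCosine(TestDefault):
    # every test defined in TestDefault runs again against this configuration
    def create_policy(self):
        return {"model_confidence": "cosine"}
```

Overriding a single factory method keeps the configuration matrix in the class hierarchy instead of copy-pasting test bodies, which is why only the range assertions differ between the cosine and inner variants.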
""" new_values = values.copy() # prevent mutation of the input diff --git a/tests/core/policies/test_ted_policy.py b/tests/core/policies/test_ted_policy.py index bae7475be18c..d1a5c55e3b36 100644 --- a/tests/core/policies/test_ted_policy.py +++ b/tests/core/policies/test_ted_policy.py @@ -287,6 +287,18 @@ def test_normalization( # function should not get called for margin loss_type mock.normalize.assert_not_called() + def test_prediction_on_empty_tracker( + self, trained_policy: Policy, default_domain: Domain + ): + tracker = DialogueStateTracker(DEFAULT_SENDER_ID, default_domain.slots) + prediction = trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ) + assert not prediction.is_end_to_end_prediction + assert len(prediction.probabilities) == default_domain.num_actions + assert max(prediction.probabilities) <= 1.0 + assert min(prediction.probabilities) >= -1.0 + class TestTEDPolicyWithEval(TestTEDPolicy): def create_policy( From 5734612c9a84195fc85a9df9f58a14fc46eba70d Mon Sep 17 00:00:00 2001 From: Daksh Date: Sun, 7 Feb 2021 19:00:10 +0100 Subject: [PATCH 28/44] change function call --- rasa/utils/plotting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py index cf4661fd7f9b..f391f84a5717 100644 --- a/rasa/utils/plotting.py +++ b/rasa/utils/plotting.py @@ -128,7 +128,7 @@ def plot_histogram( """ import matplotlib.pyplot as plt - def get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[float]: + def _get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[float]: total_values = len(data[0]) + len(data[1]) bin_max_size = int(total_values * bin_size_frac) @@ -158,7 +158,7 @@ def get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[float [min(hist_data[0], default=0), min(hist_data[1], default=0)], default=0 ) - bins = get_bins(hist_data) + bins = _get_bins(hist_data) binned_data_sets = [np.histogram(d, bins=bins)[0] for d in hist_data] From 1dc6930661e534daf22b2244268036810d6bff77 Mon Sep 17 00:00:00 2001 From: Daksh Date: Sun, 7 Feb 2021 20:05:54 +0100 Subject: [PATCH 29/44] self review, add types, docformats --- rasa/core/policies/ted_policy.py | 1 + rasa/nlu/classifiers/diet_classifier.py | 3 ++- rasa/nlu/selectors/response_selector.py | 2 +- rasa/utils/plotting.py | 15 +++++++++++---- tests/core/policies/test_ted_policy.py | 6 ++++++ tests/nlu/selectors/test_selectors.py | 6 ++---- 6 files changed, 23 insertions(+), 10 deletions(-) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index f75f489c1bf3..8a86095ed2b1 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -807,6 +807,7 @@ def load( model_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data ) + meta = rasa.utils.train_utils.update_confidence_type(meta) meta = rasa.utils.train_utils.update_similarity_type(meta) meta = rasa.utils.train_utils.update_loss_type(meta) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index de140e6565e8..673315ec35f0 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -252,7 +252,7 @@ def required_components(cls) -> List[Type[Component]]: # If 'True' applies sigmoid on all similarity terms and adds it to the loss function to # ensure that similarity values are approximately bounded. Used inside softmax loss only. 
From 1dc6930661e534daf22b2244268036810d6bff77 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Sun, 7 Feb 2021 20:05:54 +0100
Subject: [PATCH 29/44] self review, add types, docformats

---
 rasa/core/policies/ted_policy.py        |  1 +
 rasa/nlu/classifiers/diet_classifier.py |  3 ++-
 rasa/nlu/selectors/response_selector.py |  2 +-
 rasa/utils/plotting.py                  | 15 +++++++++++----
 tests/core/policies/test_ted_policy.py  |  6 ++++++
 tests/nlu/selectors/test_selectors.py   |  6 ++----
 6 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index f75f489c1bf3..8a86095ed2b1 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -807,6 +807,7 @@ def load(
         model_data_example = RasaModelData(
             label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data
         )
+        meta = rasa.utils.train_utils.update_confidence_type(meta)
         meta = rasa.utils.train_utils.update_similarity_type(meta)
         meta = rasa.utils.train_utils.update_loss_type(meta)
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index de140e6565e8..673315ec35f0 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -252,7 +252,7 @@ def required_components(cls) -> List[Type[Component]]:
         # If 'True' applies sigmoid on all similarity terms and adds it to the loss function to
         # ensure that similarity values are approximately bounded. Used inside softmax loss only.
         CONSTRAIN_SIMILARITIES: False,
-        # Model confidence to be returned during inference. Possible values - softmax, cosine, inner.
+        # Model confidence to be returned during inference. Possible values - 'softmax', 'cosine', 'inner'.
         MODEL_CONFIDENCE: SOFTMAX,
     }
 
@@ -1019,6 +1019,7 @@ def load(
             data_example,
         ) = cls._load_from_files(meta, model_dir)
 
+        meta = train_utils.update_confidence_type(meta)
         meta = train_utils.update_similarity_type(meta)
         meta = train_utils.update_loss_type(meta)
 
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index 6f099f38df8d..b66426fef78b 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -238,7 +238,7 @@ def required_components(cls) -> List[Type[Component]]:
         # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to
         # ensure that similarity values are approximately bounded. Used inside softmax loss only.
         CONSTRAIN_SIMILARITIES: False,
-        # Model confidence to be returned during inference. Possible values - softmax, cosine, inner.
+        # Model confidence to be returned during inference. Possible values - 'softmax', 'cosine', 'inner'.
         MODEL_CONFIDENCE: SOFTMAX,
     }
 
diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py
index f391f84a5717..999826c50246 100644
--- a/rasa/utils/plotting.py
+++ b/rasa/utils/plotting.py
@@ -5,6 +5,7 @@
 import numpy as np
 from typing import List, Text, Optional, Union, Any
 import matplotlib
+from matplotlib.ticker import FormatStrFormatter
 
 import rasa.shared.utils.io
 from rasa.constants import RESULTS_FILE
@@ -158,7 +159,10 @@ def _get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[floa
         [min(hist_data[0], default=0), min(hist_data[1], default=0)], default=0
     )
 
-    bins = _get_bins(hist_data)
+    bin_width = (max_value - min_value) / n_bins
+    bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)]
+
+    # bins = _get_bins(hist_data)
 
     binned_data_sets = [np.histogram(d, bins=bins)[0] for d in hist_data]
 
@@ -174,11 +178,11 @@ def _get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[floa
             ]
         )
     ]
-        # - bins[0]
+        - bins[0]
     )
 
-    # max_ylim = max(bins) + bin_width
-    max_ylim = max(bins)
+    max_ylim = max(bins) + bin_width
+    # max_ylim = max(bins)
 
     yticks = [float("{:.2f}".format(x)) for x in bins]
 
@@ -203,11 +207,14 @@ def _get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[floa
         color=colors[1],
         label="misses",
     )
+    axes[0].set_yscale("log")
 
     axes[1].set(title="Wrong")
     axes[0].set(yticks=yticks, xlim=(0, max_xlims[0]), ylim=(min_ylim, max_ylim))
     axes[1].set(yticks=yticks, xlim=(0, max_xlims[1]), ylim=(min_ylim, max_ylim))
 
+    axes[0].yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
+
     axes[0].invert_xaxis()
     axes[0].yaxis.tick_right()
 
diff --git a/tests/core/policies/test_ted_policy.py b/tests/core/policies/test_ted_policy.py
index d1a5c55e3b36..b6bed59d98ae 100644
--- a/tests/core/policies/test_ted_policy.py
+++ b/tests/core/policies/test_ted_policy.py
@@ -354,6 +354,9 @@ def create_policy(
             featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: COSINE}
         )
 
+    def test_similarity_type(self, trained_policy: TEDPolicy):
+        assert trained_policy.config[SIMILARITY_TYPE] == COSINE
+
     def test_normalization(
         self,
         trained_policy: Policy,
@@ -404,6 +407,9 @@ def create_policy(
             featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: INNER}
         )
 
+    def test_similarity_type(self, trained_policy: TEDPolicy):
+        assert trained_policy.config[SIMILARITY_TYPE] == INNER
+
     def test_normalization(
         self,
         trained_policy: Policy,
diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index 0f0be42ab7e6..e21c4dde4d49 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -27,14 +27,12 @@
     LOSS_TYPE,
 )
 from rasa.utils import train_utils
-from rasa.nlu.classifiers import LABEL_RANKING_LENGTH
 from rasa.shared.nlu.constants import TEXT
 from rasa.shared.constants import DIAGNOSTIC_DATA
 from rasa.nlu.selectors.response_selector import ResponseSelector
 from rasa.shared.nlu.training_data.message import Message
 from rasa.shared.nlu.training_data.training_data import TrainingData
 from tests.nlu.classifiers.test_diet_classifier import as_pipeline
-from tests.conftest import DEFAULT_NLU_DATA
 
 
 @pytest.mark.parametrize(
@@ -330,8 +328,8 @@
     "classifier_params, prediction_min, prediction_max, output_length",
     [
-        ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "cosine"}, -1, 1, 9,),
-        ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "inner"}, -1e9, 1e9, 9,),
+        ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "cosine"}, -1, 1, 9),
+        ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "inner"}, -1e9, 1e9, 9),
     ],
 )

From ab1e7b36dcdba59876fc7fbe9bcb746c1d9b6e7c Mon Sep 17 00:00:00 2001
From: Daksh
Date: Sun, 7 Feb 2021 20:07:18 +0100
Subject: [PATCH 30/44] revert back plotting changes

---
 rasa/utils/plotting.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py
index 999826c50246..3d4963b2e348 100644
--- a/rasa/utils/plotting.py
+++ b/rasa/utils/plotting.py
@@ -159,10 +159,10 @@ def _get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[floa
         [min(hist_data[0], default=0), min(hist_data[1], default=0)], default=0
     )
 
-    bin_width = (max_value - min_value) / n_bins
-    bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)]
+    # bin_width = (max_value - min_value) / n_bins
+    # bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)]
 
-    # bins = _get_bins(hist_data)
+    bins = _get_bins(hist_data)
 
     binned_data_sets = [np.histogram(d, bins=bins)[0] for d in hist_data]
 
@@ -178,11 +178,11 @@ def _get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[floa
             ]
         )
     ]
-        - bins[0]
+        # - bins[0]
     )
 
-    max_ylim = max(bins) + bin_width
-    # max_ylim = max(bins)
+    # max_ylim = max(bins) + bin_width
+    max_ylim = max(bins)
 
     yticks = [float("{:.2f}".format(x)) for x in bins]

From f2da6bb61459af06bcc105ff1a9d31f9137663cf Mon Sep 17 00:00:00 2001
From: Daksh
Date: Mon, 8 Feb 2021 13:05:45 +0100
Subject: [PATCH 31/44] final plotting style

---
 rasa/nlu/test.py                       |  2 +-
 rasa/utils/plotting.py                 | 31 +++++---------------------
 tests/core/policies/test_ted_policy.py | 13 ++++++-----
 3 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index 837aec238855..e9f819d9e243 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -927,7 +927,7 @@ def evaluate_entities(
         merged_targets,
         merged_predictions,
         merged_confidences,
-        title="Entity Confusion matrix",
+        title="Entity Prediction Confidence Distribution",
         hist_filename=histogram_filename,
     )
 
diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py
index 3d4963b2e348..c816f26f77a9 100644
--- a/rasa/utils/plotting.py
+++ b/rasa/utils/plotting.py
@@ -129,23 +129,6 @@ def plot_histogram(
     """
     import matplotlib.pyplot as plt
 
-    def _get_bins(data: List[List[float]], bin_size_frac: float = 0.04) -> List[float]:
-        total_values = len(data[0]) + len(data[1])
-        bin_max_size = int(total_values * bin_size_frac)
-
-        all_values = sorted(data[0] + data[1])
-        bins = []
-        bin_count = 0
-        for value in all_values:
-            bin_count += 1
-            if bin_count == bin_max_size:
-                bins.append(value)
-                bin_count = 0
-        if bin_count:
-            bins.append(all_values[-1])
-
-        return bins
-
     plt.gcf().clear()
 
     # Wine-ish colour for the confidences of hits.
@@ -159,10 +142,8 @@ def plot_histogram(
         [min(hist_data[0], default=0), min(hist_data[1], default=0)], default=0
     )
 
-    # bin_width = (max_value - min_value) / n_bins
-    # bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)]
-
-    bins = _get_bins(hist_data)
+    bin_width = (max_value - min_value) / n_bins
+    bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)]
 
     binned_data_sets = [np.histogram(d, bins=bins)[0] for d in hist_data]
 
@@ -178,11 +159,10 @@ def plot_histogram(
             ]
         )
     ]
-        # - bins[0]
+        - bin_width
     )
 
-    # max_ylim = max(bins) + bin_width
-    max_ylim = max(bins)
+    max_ylim = max(bins) + bin_width
 
     yticks = [float("{:.2f}".format(x)) for x in bins]
 
@@ -207,13 +187,14 @@ def plot_histogram(
         color=colors[1],
         label="misses",
     )
-    axes[0].set_yscale("log")
 
     axes[1].set(title="Wrong")
     axes[0].set(yticks=yticks, xlim=(0, max_xlims[0]), ylim=(min_ylim, max_ylim))
     axes[1].set(yticks=yticks, xlim=(0, max_xlims[1]), ylim=(min_ylim, max_ylim))
 
     axes[0].yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
+    axes[0].yaxis.set_minor_formatter(FormatStrFormatter("%.2f"))
 
     axes[0].invert_xaxis()
     axes[0].yaxis.tick_right()
 
diff --git a/tests/core/policies/test_ted_policy.py b/tests/core/policies/test_ted_policy.py
index b6bed59d98ae..de3c7668e008 100644
--- a/tests/core/policies/test_ted_policy.py
+++ b/tests/core/policies/test_ted_policy.py
@@ -268,7 +268,10 @@ def create_policy(
         )
 
     def test_similarity_type(self, trained_policy: TEDPolicy):
-        assert trained_policy.config[SIMILARITY_TYPE] == "cosine"
+        assert trained_policy.config[SIMILARITY_TYPE] == COSINE
+
+    def test_confidence_type(self, trained_policy: TEDPolicy):
+        assert trained_policy.config[MODEL_CONFIDENCE] == COSINE
 
     def test_normalization(
@@ -354,8 +357,8 @@ def create_policy(
             featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: COSINE}
         )
 
-    def test_similarity_type(self, trained_policy: TEDPolicy):
-        assert trained_policy.config[SIMILARITY_TYPE] == COSINE
+    def test_confidence_type(self, trained_policy: TEDPolicy):
+        assert trained_policy.config[MODEL_CONFIDENCE] == COSINE
 
     def test_normalization(
@@ -404,8 +407,8 @@ def create_policy(
             featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: INNER}
         )
 
-    def test_similarity_type(self, trained_policy: TEDPolicy):
-        assert trained_policy.config[SIMILARITY_TYPE] == INNER
+    def test_confidence_type(self, trained_policy: TEDPolicy):
+        assert trained_policy.config[MODEL_CONFIDENCE] == INNER
 
     def test_normalization(
From 8fb7ea2aa0d568a45046c7121a21cd64f3e70117 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Mon, 8 Feb 2021 13:08:53 +0100
Subject: [PATCH 32/44] change epochs to 1

---
 tests/nlu/selectors/test_selectors.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py
index e21c4dde4d49..02dd94394cea 100644
--- a/tests/nlu/selectors/test_selectors.py
+++ b/tests/nlu/selectors/test_selectors.py
@@ -418,9 +418,9 @@ async def test_margin_loss_is_not_normalized(
 @pytest.mark.parametrize(
     "classifier_params, data_path, output_length",
     [
-        ({RANDOM_SEED: 42, EPOCHS: 2}, "data/test_selectors", 9),
-        ({RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 2}, "data/test_selectors", 9),
-        ({RANDOM_SEED: 42, RANKING_LENGTH: 2, EPOCHS: 2}, "data/test_selectors", 2),
+        ({RANDOM_SEED: 42, EPOCHS: 1}, "data/test_selectors", 9),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 1}, "data/test_selectors", 9),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 2, EPOCHS: 1}, "data/test_selectors", 2),
     ],
 )
 async def test_softmax_ranking(

From 2724b1951c286b75ff8b226c6f1bec4e73985499 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya
Date: Mon, 8 Feb 2021 14:36:13 +0100
Subject: [PATCH 33/44] Partial suggestions from code review

Co-authored-by: Tobias Wochinger
---
 changelog/7616.improvement.md           | 8 ++++----
 docs/docs/components.mdx                | 4 ++--
 docs/docs/migration-guide.mdx           | 1 -
 docs/docs/policies.mdx                  | 4 ++--
 rasa/core/policies/ted_policy.py        | 1 -
 rasa/nlu/classifiers/diet_classifier.py | 1 -
 6 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md
index 63be53e61c5f..cc31ff1dcc73 100644
--- a/changelog/7616.improvement.md
+++ b/changelog/7616.improvement.md
@@ -3,15 +3,15 @@ Added two new parameters `constrain_similarities` and `model_confidence` to mach
 
 Setting `constrain_similarities=True` adds a sigmoid cross-entropy loss on all similarity values to restrict them to an approximate range in `DotProductLoss`. This should help the models to perform better on real world test sets. By default, the parameter is set to `False` to preserve the old behaviour, but users are encouraged to set it to `True` and re-train their assistants as it will be set to `True` by default, Rasa Open Source 3.0 onwards.
 
-Parameter `model_confidence` affects how model's confidence for each label is computed during inference. It can take three values -
+Parameter `model_confidence` affects how the model's confidence for each label is computed during inference. It can take three values:
 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1.
 2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label will be in the range `[-1,1]`.
 3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range.
 
 Setting `model_confidence=cosine` should help users tune the fallback thresholds of their assistant better. The default value is `softmax` to preserve the old behaviour, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
 
-With both the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as -
-```
+With both the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as
+```yaml
 - name: DIETClassifier
   model_confidence: cosine
   constrain_similarities: True
@@ -21,4 +21,4 @@ Once the assistant is re-trained with the above configuration, users should also
 
 Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead.
 
-The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration.
\ No newline at end of file
+The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration.
diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx
index 2025f126c127..ef5ac83db5a1 100644
--- a/docs/docs/components.mdx
+++ b/docs/docs/components.mdx
@@ -1623,7 +1623,7 @@ However, additional parameters exist that can be adapted.
 |                                 |                  | approximately bounded. Used only if `loss_type=cross_entropy`|
 +---------------------------------+------------------+--------------------------------------------------------------+
 | model_confidence                | "softmax"        | Affects how model's confidence for each intent               |
-|                                 |                  | is computed. It can take three values -                      |
+|                                 |                  | is computed. It can take three values                        |
 |                                 |                  | 1. `softmax` - Similarities between input and intent         |
 |                                 |                  | embeddings are post-processed with a softmax function,       |
 |                                 |                  | as a result of which confidence for all intents sum up to 1. |
@@ -2841,7 +2841,7 @@ However, additional parameters exist that can be adapted.
 |                                 |                   | approximately bounded. Used only if `loss_type=cross_entropy`|
 +---------------------------------+-------------------+--------------------------------------------------------------+
 | model_confidence                | "softmax"         | Affects how model's confidence for each response label       |
-|                                 |                   | is computed. It can take three values -                      |
+|                                 |                   | is computed. It can take three values                        |
 |                                 |                   | 1. `softmax` - Similarities between input and response label |
 |                                 |                   | embeddings are post-processed with a softmax function,       |
 |                                 |                   | as a result of which confidence for all labels sum up to 1.  |
diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx
index 9d5551e10410..38d0d2fb6854 100644
--- a/docs/docs/migration-guide.mdx
+++ b/docs/docs/migration-guide.mdx
@@ -36,7 +36,6 @@ With both the above recommendations, users should configure their ML component,
 ```
 Once the assistant is re-trained with the above configuration, users should also tune fallback confidence thresholds.
 
-Configuration option `loss_type=softmax` is also deprecatedin all ML components. Use `loss_type=cross_entropy` instead.
 
 ## Rasa 2.1 to Rasa 2.2
 
diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx
index 2b57bdaeb1b6..bc2d4c1c4e85 100644
--- a/docs/docs/policies.mdx
+++ b/docs/docs/policies.mdx
@@ -348,10 +348,10 @@ However, additional parameters exist that can be adapted.
 +---------------------------------------+------------------------+--------------------------------------------------------------+
 | constrain_similarities                | False                  | If `True`, applies sigmoid on all similarity terms and adds  |
 |                                       |                        | it to the loss function to ensure that similarity values are |
-|                                       |                        | approximately bounded. Used only when `loss_type=softmax`    |
+|                                       |                        | approximately bounded. Used only when `loss_type=softmax`.   |
 +---------------------------------------+------------------------+--------------------------------------------------------------+
 | model_confidence                      | "softmax"              | Affects how model's confidence for each action               |
-|                                       |                        | is computed. It can take three values -                      |
+|                                       |                        | is computed. It can take three values                        |
 |                                       |                        | 1. `softmax` - Similarities between input and action         |
 |                                       |                        | embeddings are post-processed with a softmax function,       |
 |                                       |                        | as a result of which confidence for all labels sum up to 1.  |
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 8a86095ed2b1..4d99e218b143 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -83,7 +83,6 @@
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
     CROSS_ENTROPY,
-    INNER,
     AUTO,
     BALANCED,
     TENSORBOARD_LOG_DIR,
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 673315ec35f0..2a2d46db7343 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -86,7 +86,6 @@
     VALUE_RELATIVE_ATTENTION,
     MAX_RELATIVE_POSITION,
     AUTO,
-    INNER,
     BALANCED,
     CROSS_ENTROPY,
     TENSORBOARD_LOG_LEVEL,
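The three `model_confidence` options documented above reduce to three post-processings of the same similarity scores. A standalone numpy sketch of the distinction (not the Rasa implementation):

```python
import numpy as np

def confidence(dialogue: np.ndarray, labels: np.ndarray, kind: str) -> np.ndarray:
    inner = labels @ dialogue  # raw dot products, unbounded range
    if kind == "inner":
        return inner
    if kind == "cosine":
        # normalizing by both norms bounds every value to [-1, 1]
        return inner / (np.linalg.norm(labels, axis=1) * np.linalg.norm(dialogue))
    if kind == "softmax":
        exp = np.exp(inner - inner.max())  # numerically stable softmax
        return exp / exp.sum()             # confidences sum to 1 across labels
    raise ValueError(f"unknown model_confidence: {kind}")

rng = np.random.default_rng(42)
d, ls = rng.normal(size=20), rng.normal(size=(5, 20))
assert np.all(np.abs(confidence(d, ls, "cosine")) <= 1)
assert np.isclose(confidence(d, ls, "softmax").sum(), 1.0)
```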
From 8c66bd8393b3cf126cc3565974702ff5b1a0048e Mon Sep 17 00:00:00 2001
From: Daksh Varshneya
Date: Mon, 8 Feb 2021 14:45:16 +0100
Subject: [PATCH 34/44] Apply doc suggestions from code review

Co-authored-by: Melinda Loubser <32034278+melindaloubser1@users.noreply.github.com>
Co-authored-by: Vladimir Vlasov
---
 changelog/7616.improvement.md |  2 +-
 docs/docs/migration-guide.mdx | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md
index cc31ff1dcc73..bce9f7e27d17 100644
--- a/changelog/7616.improvement.md
+++ b/changelog/7616.improvement.md
@@ -8,7 +8,7 @@ Parameter `model_confidence` affects how the model's confidence for each label is co
 2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label will be in the range `[-1,1]`.
 3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range.
 
-Setting `model_confidence=cosine` should help users tune the fallback thresholds of their assistant better. The default value is `softmax` to preserve the old behaviour, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
+Setting `model_confidence=cosine` should help users tune the fallback thresholds of their assistant better. The default value is `softmax` to preserve the old behaviour, but we recommend using `cosine` as that will be the new default value from Rasa Open Source 3.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
 
 With both the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as
 ```yaml
diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx
index 38d0d2fb6854..b6bc792bd169 100644
--- a/docs/docs/migration-guide.mdx
+++ b/docs/docs/migration-guide.mdx
@@ -14,20 +14,20 @@ how you can migrate from one version to another.
 
 ### Machine Learning Components
 
-Few changes have been made to the loss function inside machine learning (ML)
-components - `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include:
+A few changes have been made to the loss function inside machine learning (ML)
+components `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include:
 1. Configuration option `loss_type=softmax` is now deprecated and will be removed in Rasa Open Source 3.0. Use `loss_type=cross_entropy` instead.
-2. The default loss function (`loss_type=cross_entropy`) adds an optional sigmoid cross-entropy loss of all similarity values to constrain
+2. The default loss function (`loss_type=cross_entropy`) can add an optional sigmoid cross-entropy loss of all similarity values to constrain
 them to an approximate range. You can turn on this option by setting `constrain_similarities=True`. This should help the models to perform better on real world test sets.
 
-Also, a new option `model_confidence` has been added to each ML component. It affects how model's confidence for each label is computed during inference. It can take three values -
+Also, a new option `model_confidence` has been added to each ML component. It affects how the model's confidence for each label is computed during inference. It can take one of three values:
 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1.
 2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label will be in the range `[-1,1]`.
 3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range.
 
-The default value is `softmax`, but we recommend using `cosine` as that will be the new default value, Rasa Open Source 3.0 onwards.
+The default value is `softmax`, but we recommend using `cosine` as that will be the new default value from Rasa Open Source 3.0 onwards.
 The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
 
-With both the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as -
+With both the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as:
 ```
 - name: DIETClassifier
   model_confidence: cosine
From 06a70eeb8b498d8d425a8cedbe252d327afcb523 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Mon, 8 Feb 2021 15:11:06 +0100
Subject: [PATCH 35/44] refactor loss, add docstrings

---
 changelog/7616.improvement.md           |  2 +-
 rasa/core/policies/ted_policy.py        | 13 ++--
 rasa/nlu/classifiers/diet_classifier.py | 13 ++--
 rasa/nlu/selectors/response_selector.py | 26 +++++--
 rasa/utils/tensorflow/layers.py         | 98 +++++++++++++++----------
 5 files changed, 96 insertions(+), 56 deletions(-)

diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md
index 63be53e61c5f..ed757edf664f 100644
--- a/changelog/7616.improvement.md
+++ b/changelog/7616.improvement.md
@@ -21,4 +21,4 @@ Once the assistant is re-trained with the above configuration, users should also
 
 Configuration option `loss_type=softmax` is now deprecated. Use `loss_type=cross_entropy` instead.
 
-The default auto-configuration is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration.
\ No newline at end of file
+The default [auto-configuration](https://rasa.com/docs/rasa/model-configuration#suggested-config) is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration.
\ No newline at end of file
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 8a86095ed2b1..ecf5672736fc 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -218,8 +218,9 @@ class TEDPolicy(Policy):
         SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'cross_entropy' or 'margin'.
         LOSS_TYPE: CROSS_ENTROPY,
-        # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy'
-        # and 'softmax' confidences. Set to 0 to turn off normalization.
+        # Number of top actions to normalize scores for. Applicable with
+        # loss type 'cross_entropy' and 'softmax' confidences. Set to 0
+        # to turn off normalization.
         RANKING_LENGTH: 10,
         # Indicates how similar the algorithm should try to make embedding vectors
         # for correct labels.
@@ -281,10 +282,12 @@ class TEDPolicy(Policy):
         FEATURIZERS: [],
         # If set to true, entities are predicted in user utterances.
         ENTITY_RECOGNITION: True,
-        # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to
-        # ensure that similarity values are approximately bounded. Used inside softmax loss only.
+        # if 'True' applies sigmoid on all similarity terms and adds
+        # it to the loss function to ensure that similarity values are
+        # approximately bounded. Used inside softmax loss only.
         CONSTRAIN_SIMILARITIES: False,
-        # Model confidence to be returned during inference. Possible values - 'softmax', 'cosine' and 'inner'.
+        # Model confidence to be returned during inference. Possible values -
+        # 'softmax', 'cosine' and 'inner'.
         MODEL_CONFIDENCE: SOFTMAX,
         # 'BILOU_flag' determines whether to use BILOU tagging or not.
         # If set to 'True' labelling is more rigorous, however more
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 673315ec35f0..5b85a5568c26 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -181,8 +181,9 @@ def required_components(cls) -> List[Type[Component]]:
         SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'cross_entropy' or 'margin'.
         LOSS_TYPE: CROSS_ENTROPY,
-        # Number of top intents to normalize scores for. Applicable with loss type 'cross_entropy'
-        # and 'softmax' confidences. Set to 0 to turn off normalization.
+        # Number of top intents to normalize scores for. Applicable with
+        # loss type 'cross_entropy' and 'softmax' confidences. Set to 0
+        # to turn off normalization.
         RANKING_LENGTH: 10,
         # Indicates how similar the algorithm should try to make embedding vectors
         # for correct labels.
@@ -249,10 +250,12 @@ def required_components(cls) -> List[Type[Component]]:
         # Split entities by comma, this makes sense e.g. for a list of ingredients
         # in a recipe, but it doesn't make sense for the parts of an address
         SPLIT_ENTITIES_BY_COMMA: True,
-        # If 'True' applies sigmoid on all similarity terms and adds it to the loss function to
-        # ensure that similarity values are approximately bounded. Used inside softmax loss only.
+        # If 'True' applies sigmoid on all similarity terms and adds
+        # it to the loss function to ensure that similarity values are
+        # approximately bounded. Used inside softmax loss only.
         CONSTRAIN_SIMILARITIES: False,
-        # Model confidence to be returned during inference. Possible values - 'softmax', 'cosine', 'inner'.
+        # Model confidence to be returned during inference. Possible values -
+        # 'softmax', 'cosine', 'inner'.
         MODEL_CONFIDENCE: SOFTMAX,
     }
 
diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py
index b66426fef78b..f6aa535f6298 100644
--- a/rasa/nlu/selectors/response_selector.py
+++ b/rasa/nlu/selectors/response_selector.py
@@ -176,8 +176,9 @@ def required_components(cls) -> List[Type[Component]]:
         SIMILARITY_TYPE: AUTO,
         # The type of the loss function, either 'cross_entropy' or 'margin'.
         LOSS_TYPE: CROSS_ENTROPY,
-        # Number of top actions to normalize scores for. Applicable with loss type 'cross_entropy'
-        # and 'softmax' confidences. Set to 0 to turn off normalization.
+        # Number of top actions to normalize scores for. Applicable with
+        # loss type 'cross_entropy' and 'softmax' confidences. Set to 0
+        # to turn off normalization.
         RANKING_LENGTH: 10,
         # Indicates how similar the algorithm should try to make embedding vectors
         # for correct labels.
@@ -235,10 +236,12 @@ def required_components(cls) -> List[Type[Component]]:
         FEATURIZERS: [],
         # Perform model checkpointing
         CHECKPOINT_MODEL: False,
-        # if 'True' applies sigmoid on all similarity terms and adds it to the loss function to
-        # ensure that similarity values are approximately bounded. Used inside softmax loss only.
+        # if 'True' applies sigmoid on all similarity terms and adds it
+        # to the loss function to ensure that similarity values are
+        # approximately bounded. Used inside softmax loss only.
         CONSTRAIN_SIMILARITIES: False,
-        # Model confidence to be returned during inference. Possible values - 'softmax', 'cosine', 'inner'.
+        # Model confidence to be returned during inference. Possible values -
+        # 'softmax', 'cosine', 'inner'.
         MODEL_CONFIDENCE: SOFTMAX,
     }
 
@@ -252,7 +255,18 @@ def __init__(
         responses: Optional[Dict[Text, List[Dict[Text, Any]]]] = None,
         finetune_mode: bool = False,
     ) -> None:
-        """Declare instance variables with default values."""
+        """Declare instance variables with default values.
+
+        Args:
+            component_config: Configuration for the component.
+            index_label_id_mapping: Mapping between label and index used for encoding.
+            entity_tag_specs: Format specification of all entity tags.
+            model: Model architecture.
+            all_retrieval_intents: All retrieval intents defined in the data.
+            responses: All responses defined in the data.
+            finetune_mode: If `True` loads the model with pre-trained weights,
+                otherwise initializes it with random weights.
+        """
         component_config = component_config or {}
 
         # the following properties cannot be adapted for the ResponseSelector
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 354e7430eadd..30281b851f57 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -582,6 +582,9 @@ def __init__(
                 Used inside _loss_cross_entropy() only.
             model_confidence: Model confidence to be returned during inference.
                 Possible values - softmax, cosine, inner.
+
+        Raises:
+            RasaException: When `similarity_type` is not one of 'cosine' or 'inner'.
         """
         super().__init__(name=name)
         self.num_neg = num_neg
@@ -595,7 +598,7 @@ def __init__(
         self.constrain_similarities = constrain_similarities
         self.model_confidence = model_confidence
         self.similarity_type = similarity_type
-        if self.similarity_type and self.similarity_type not in {COSINE, INNER}:
+        if not self.similarity_type or self.similarity_type not in {COSINE, INNER}:
             raise RasaException(
                 f"Wrong similarity type '{self.similarity_type}', "
                 f"should be '{COSINE}' or '{INNER}'."
@@ -850,49 +853,15 @@ def _loss_cross_entropy(
         mask: Optional[tf.Tensor],
     ) -> tf.Tensor:
         """Defines cross entropy loss."""
-        # Similarity terms between input and label should be optimized relative
-        # to each other and hence use them as logits for softmax term
-        softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1)
-
-        if not self.constrain_similarities:
-            # Concatenate other similarity terms as well. Due to this,
-            # similarity values between input and label may not be
-            # approximately bounded in a defined range.
-            softmax_logits = tf.concat(
-                [softmax_logits, sim_neg_ii, sim_neg_ll], axis=-1
-            )
-
-        # create label_ids for softmax
-        softmax_label_ids = tf.zeros_like(softmax_logits[..., 0], tf.int32)
-
-        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
-            labels=softmax_label_ids, logits=softmax_logits
+        loss = self._compute_softmax_loss(
+            sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li
         )
 
-        loss = softmax_loss
-
         if self.constrain_similarities:
-            # Constrain similarity values in a range by applying sigmoid
-            # on them individually so that they saturate at extreme values.
-            sigmoid_logits = tf.concat(
-                [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1
-            )
-
-            sigmoid_labels = tf.concat(
-                [
-                    tf.ones_like(sigmoid_logits[..., :1]),
-                    tf.zeros_like(sigmoid_logits[..., 1:]),
-                ],
-                axis=-1,
+            loss += self._compute_sigmoid_loss(
+                sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li
             )
 
-            sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits(
-                labels=sigmoid_labels, logits=sigmoid_logits
-            )
-
-            # average over logits axis
-            loss += tf.reduce_mean(sigmoid_loss, axis=-1)
-
         if self.scale_loss:
             # in case of cross entropy log_likelihood = -loss
             loss *= _scale_loss(-loss)
@@ -910,6 +879,57 @@ def _loss_cross_entropy(
         # average the loss over the batch
         return tf.reduce_mean(loss)
 
+    def _compute_sigmoid_loss(
+        self,
+        sim_pos: tf.Tensor,
+        sim_neg_il: tf.Tensor,
+        sim_neg_ll: tf.Tensor,
+        sim_neg_ii: tf.Tensor,
+        sim_neg_li: tf.Tensor,
+    ) -> tf.Tensor:
+        # Constrain similarity values in a range by applying sigmoid
+        # on them individually so that they saturate at extreme values.
+        sigmoid_logits = tf.concat(
+            [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1
+        )
+        sigmoid_labels = tf.concat(
+            [
+                tf.ones_like(sigmoid_logits[..., :1]),
+                tf.zeros_like(sigmoid_logits[..., 1:]),
+            ],
+            axis=-1,
+        )
+        sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits(
+            labels=sigmoid_labels, logits=sigmoid_logits
+        )
+        # average over logits axis
+        return tf.reduce_mean(sigmoid_loss, axis=-1)
+
+    def _compute_softmax_loss(
+        self,
+        sim_pos: tf.Tensor,
+        sim_neg_il: tf.Tensor,
+        sim_neg_ll: tf.Tensor,
+        sim_neg_ii: tf.Tensor,
+        sim_neg_li: tf.Tensor,
+    ) -> tf.Tensor:
+        # Similarity terms between input and label should be optimized relative
+        # to each other and hence use them as logits for softmax term
+        softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1)
+        if not self.constrain_similarities:
+            # Concatenate other similarity terms as well. Due to this,
+            # similarity values between input and label may not be
+            # approximately bounded in a defined range.
+            softmax_logits = tf.concat(
+                [softmax_logits, sim_neg_ii, sim_neg_ll], axis=-1
+            )
+        # create label_ids for softmax
+        softmax_label_ids = tf.zeros_like(softmax_logits[..., 0], tf.int32)
+        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+            labels=softmax_label_ids, logits=softmax_logits
+        )
+        return softmax_loss
+
     @property
     def _chosen_loss(self) -> Callable:
         """Use loss depending on given option."""

From 6f9cd90c2a5e9e71d661155917641d0b5907af14 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Mon, 8 Feb 2021 15:22:10 +0100
Subject: [PATCH 36/44] remove none for similarity_type

---
 rasa/utils/tensorflow/layers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 30281b851f57..49d46acb2152 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -547,9 +547,9 @@ def __init__(
         use_max_sim_neg: bool,
         neg_lambda: float,
         scale_loss: bool,
+        similarity_type: Text,
         name: Optional[Text] = None,
         same_sampling: bool = False,
-        similarity_type: Optional[Text] = None,
         constrain_similarities: bool = True,
         model_confidence: Text = SOFTMAX,
     ) -> None:
@@ -572,10 +572,10 @@ def __init__(
                 used only if 'loss_type' is set to 'margin'.
             scale_loss: Boolean, if 'True' scale loss inverse proportionally to
                 the confidence of the correct prediction.
+            similarity_type: Similarity measure to use, either 'cosine' or 'inner'.
             name: Optional name of the layer.
             same_sampling: Boolean, if 'True' sample same negative labels for
                 the whole batch.
-            similarity_type: Similarity measure to use, either 'cosine' or 'inner'.
             constrain_similarities: Boolean, if 'True' applies sigmoid on all
                 similarity terms and adds to the loss function to
                 ensure that similarity values are approximately bounded.
@@ -598,7 +598,7 @@ def __init__(
         self.constrain_similarities = constrain_similarities
         self.model_confidence = model_confidence
         self.similarity_type = similarity_type
-        if not self.similarity_type or self.similarity_type not in {COSINE, INNER}:
+        if self.similarity_type not in {COSINE, INNER}:
             raise RasaException(
                 f"Wrong similarity type '{self.similarity_type}', "
                 f"should be '{COSINE}' or '{INNER}'."
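The refactor above splits the cross-entropy into two pieces: a softmax term that only ranks the positive similarity above the negatives, and an optional sigmoid term that additionally judges each similarity on its own, keeping the raw values in an approximately bounded range. The combination in a minimal, self-contained TensorFlow sketch, with one similarity matrix (positive pair in column 0) standing in for the five `sim_*` tensors:

```python
import tensorflow as tf

def constrained_cross_entropy(sim: tf.Tensor) -> tf.Tensor:
    """`sim` has shape (batch, 1 + num_neg); column 0 is the positive pair."""
    # softmax term: the positive must outscore the negatives (relative).
    softmax_labels = tf.zeros_like(sim[..., 0], tf.int32)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=softmax_labels, logits=sim
    )
    # sigmoid term: each similarity is also pushed towards 1 (positive)
    # or 0 (negative) individually, which bounds the raw values.
    sigmoid_labels = tf.concat(
        [tf.ones_like(sim[..., :1]), tf.zeros_like(sim[..., 1:])], axis=-1
    )
    loss += tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=sigmoid_labels, logits=sim),
        axis=-1,
    )
    return tf.reduce_mean(loss)  # average over the batch

print(constrained_cross_entropy(tf.constant([[4.0, -2.0, 0.5], [3.0, 1.0, -1.0]])))
```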
From 6697c0dd6ef1a8f703037211df676c78fe491309 Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 8 Feb 2021 16:45:29 +0100 Subject: [PATCH 37/44] override defaults during load so that new parameters are filled in before model is initialized --- rasa/core/policies/ted_policy.py | 1 + rasa/nlu/classifiers/diet_classifier.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 62e6a50e3d11..251305f93adf 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -809,6 +809,7 @@ def load( model_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data ) + meta = rasa.utils.train_utils.override_defaults(cls.defaults, meta) meta = rasa.utils.train_utils.update_confidence_type(meta) meta = rasa.utils.train_utils.update_similarity_type(meta) meta = rasa.utils.train_utils.update_loss_type(meta) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index f11096b85363..a40de9a50a07 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -332,6 +332,11 @@ def __init__( super().__init__(component_config) + print( + self.component_config[CONSTRAIN_SIMILARITIES], + self.component_config[MODEL_CONFIDENCE], + ) + self._check_config_parameters() # transform numbers to labels @@ -1021,6 +1026,7 @@ def load( data_example, ) = cls._load_from_files(meta, model_dir) + meta = train_utils.override_defaults(cls.defaults, meta) meta = train_utils.update_confidence_type(meta) meta = train_utils.update_similarity_type(meta) meta = train_utils.update_loss_type(meta) From 13d8aa85f06ef2a6f807845eb9ace2eff30eef1c Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 8 Feb 2021 17:23:35 +0100 Subject: [PATCH 38/44] change call to deprecated function check --- changelog/7616.improvement.md | 2 +- rasa/core/policies/ted_policy.py | 4 ++-- rasa/nlu/classifiers/diet_classifier.py | 6 ++++-- rasa/utils/tensorflow/layers.py | 4 ++-- rasa/utils/train_utils.py | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md index d687089711f5..6eb78ea7c073 100644 --- a/changelog/7616.improvement.md +++ b/changelog/7616.improvement.md @@ -19,6 +19,6 @@ With both the above recommendations, users should configure their ML component, ``` Once the assistant is re-trained with the above configuration, users should also tune fallback confidence thresholds. -Configuration option `loss_type=softmax` is now deprecated and will be in Rasa Open Source 3.0.0 . Use `loss_type=cross_entropy` instead. +Configuration option `loss_type=softmax` is now deprecated and will be removed in Rasa Open Source 3.0.0 . Use `loss_type=cross_entropy` instead. The default [auto-configuration](model-configuration.mdx#suggested-config) is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration. 
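[Editorial aside: patch 37 applies `override_defaults` to the persisted metadata at load time, so configuration keys introduced after a model was trained (such as `constrain_similarities`) exist before the model is initialized; the hunks below then rename `update_loss_type` to `update_deprecated_loss_type`, which rewrites the deprecated `loss_type=softmax` to `cross_entropy`. A rough, self-contained illustration of that load-time flow follows. These are simplified stand-ins, not the real helpers in `rasa/utils/train_utils.py`, which among other things also emit deprecation warnings.]

```python
from typing import Any, Dict, Optional, Text


def override_defaults(
    defaults: Dict[Text, Any], custom: Optional[Dict[Text, Any]]
) -> Dict[Text, Any]:
    # Start from the component's current defaults and overlay whatever the
    # persisted metadata explicitly set. Keys the old model never knew about
    # (e.g. `constrain_similarities`) keep their new default values.
    config = dict(defaults)
    config.update(custom or {})
    return config


def update_deprecated_loss_type(config: Dict[Text, Any]) -> Dict[Text, Any]:
    # `loss_type=softmax` is deprecated; map it to `cross_entropy` here
    # (the real helper additionally warns about the deprecation).
    if config.get("loss_type") == "softmax":
        config["loss_type"] = "cross_entropy"
    return config


# Metadata persisted by an older model: no `constrain_similarities` key yet.
meta = {"loss_type": "softmax", "epochs": 100}
defaults = {"loss_type": "cross_entropy", "constrain_similarities": True, "epochs": 300}
meta = update_deprecated_loss_type(override_defaults(defaults, meta))
print(meta)
# {'loss_type': 'cross_entropy', 'constrain_similarities': True, 'epochs': 100}
```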
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 251305f93adf..8eaa404ebd5d 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -352,7 +352,7 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: rasa.utils.train_utils.validate_configuration_settings(self.config) - self.config = rasa.utils.train_utils.update_loss_type(self.config) + self.config = rasa.utils.train_utils.update_deprecated_loss_type(self.config) self.config = rasa.utils.train_utils.update_similarity_type(self.config) self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config) @@ -812,7 +812,7 @@ def load( meta = rasa.utils.train_utils.override_defaults(cls.defaults, meta) meta = rasa.utils.train_utils.update_confidence_type(meta) meta = rasa.utils.train_utils.update_similarity_type(meta) - meta = rasa.utils.train_utils.update_loss_type(meta) + meta = rasa.utils.train_utils.update_deprecated_loss_type(meta) meta[EPOCHS] = epoch_override diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index a40de9a50a07..ec7cf506a435 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -301,7 +301,9 @@ def _check_config_parameters(self) -> None: train_utils.validate_configuration_settings(self.component_config) - self.component_config = train_utils.update_loss_type(self.component_config) + self.component_config = train_utils.update_deprecated_loss_type( + self.component_config + ) self.component_config = train_utils.update_similarity_type( self.component_config @@ -1029,7 +1031,7 @@ def load( meta = train_utils.override_defaults(cls.defaults, meta) meta = train_utils.update_confidence_type(meta) meta = train_utils.update_similarity_type(meta) - meta = train_utils.update_loss_type(meta) + meta = train_utils.update_deprecated_loss_type(meta) model = cls._load_model( entity_tag_specs, diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 49d46acb2152..44823059671e 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -581,7 +581,7 @@ def __init__( ensure that similarity values are approximately bounded. Used inside _loss_cross_entropy() only. model_confidence: Model confidence to be returned during inference. - Possible values - softmax, cosine, inner. + Possible values - 'softmax', 'cosine' and 'inner'. Raises: RasaException: When `similarity_type` is not one of 'cosine' or 'inner'. @@ -879,8 +879,8 @@ def _loss_cross_entropy( # average the loss over the batch return tf.reduce_mean(loss) + @staticmethod def _compute_sigmoid_loss( - self, sim_pos: tf.Tensor, sim_neg_il: tf.Tensor, sim_neg_ll: tf.Tensor, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 84f0fe09e7ff..e0a3d3d09532 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -75,7 +75,7 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: return config -def update_loss_type(config: Dict[Text, Any]) -> Dict[Text, Any]: +def update_deprecated_loss_type(config: Dict[Text, Any]) -> Dict[Text, Any]: """If LOSS_TYPE is set to 'softmax', update it to 'cross_entropy' since former is deprecated. 
Args: From 9ea25dd93409f2e42b7d72afbf42a87a0f70af0d Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 8 Feb 2021 17:56:56 +0100 Subject: [PATCH 39/44] more comments --- rasa/utils/train_utils.py | 10 +++++----- tests/nlu/selectors/test_selectors.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e0a3d3d09532..ecf0910729ea 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -406,8 +406,8 @@ def _check_confidence_setting(component_config: Dict[Text, Any]) -> None: rasa.shared.utils.io.raise_warning( f"{MODEL_CONFIDENCE} is set to `softmax`. It is recommended " f"to set it to `cosine`. It will be set to `cosine` by default, " - f"Rasa Open Source 3.0 onwards.", - category=FutureWarning, + f"Rasa Open Source 3.0.0 onwards.", + category=UserWarning, ) if component_config[LOSS_TYPE] not in [SOFTMAX, CROSS_ENTROPY]: raise InvalidConfigException( @@ -433,8 +433,8 @@ def _check_loss_setting(component_config: Dict[Text, Any]) -> None: rasa.shared.utils.io.raise_warning( f"{CONSTRAIN_SIMILARITIES} is set to `False`. It is recommended " f"to set it to `True` when using cross-entropy loss. It will be set to `True` by default, " - f"Rasa Open Source 3.0 onwards.", - category=FutureWarning, + f"Rasa Open Source 3.0.0 onwards.", + category=UserWarning, ) @@ -452,7 +452,7 @@ def _check_similarity_loss_setting(component_config: Dict[Text, Any]) -> None: f"Ideally use `{SIMILARITY_TYPE}={INNER}`" f" and `{LOSS_TYPE}={CROSS_ENTROPY}` or" f"`{SIMILARITY_TYPE}={COSINE}` and `{LOSS_TYPE}={MARGIN}`.", - category=FutureWarning, + category=UserWarning, ) diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py index 02dd94394cea..610ef3304efb 100644 --- a/tests/nlu/selectors/test_selectors.py +++ b/tests/nlu/selectors/test_selectors.py @@ -384,7 +384,7 @@ async def test_cross_entropy_without_normalization( async def test_margin_loss_is_not_normalized( monkeypatch: MonkeyPatch, component_builder: ComponentBuilder, - tmpdir: Path, + tmp_path: Path, classifier_params: Dict[Text, int], ): pipeline = as_pipeline( @@ -399,7 +399,7 @@ async def test_margin_loss_is_not_normalized( _config = RasaNLUModelConfig({"pipeline": pipeline}) (trained_model, _, persisted_path) = await train( _config, - path=str(tmpdir), + path=str(tmp_path), data="data/test_selectors", component_builder=component_builder, ) From ffcfdd154354c5b101725892ad0ef6066426ffbc Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 8 Feb 2021 19:01:15 +0100 Subject: [PATCH 40/44] add tests for config checks --- rasa/utils/train_utils.py | 1 + tests/utils/test_train_utils.py | 59 ++++++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index ecf0910729ea..2c643506356b 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -430,6 +430,7 @@ def _check_loss_setting(component_config: Dict[Text, Any]) -> None: SOFTMAX, CROSS_ENTROPY, ]: + print("raising") rasa.shared.utils.io.raise_warning( f"{CONSTRAIN_SIMILARITIES} is set to `False`. It is recommended " f"to set it to `True` when using cross-entropy loss. 
It will be set to `True` by default, "
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
index 74dccd2ad5df..ccddfd15be7b 100644
--- a/tests/utils/test_train_utils.py
+++ b/tests/utils/test_train_utils.py
@@ -2,7 +2,9 @@
 import numpy as np
 import pytest
-from typing import List
+from typing import Any, Dict, Text
+from _pytest.logging import LogCaptureFixture
+import logging

 import rasa.utils.train_utils as train_utils
 from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS
@@ -11,6 +13,18 @@
     SPLIT_ENTITIES_BY_COMMA_DEFAULT_VALUE,
     SPLIT_ENTITIES_BY_COMMA,
 )
+from rasa.utils.tensorflow.constants import (
+    MODEL_CONFIDENCE,
+    SIMILARITY_TYPE,
+    LOSS_TYPE,
+    COSINE,
+    SOFTMAX,
+    INNER,
+    CROSS_ENTROPY,
+    MARGIN,
+    CONSTRAIN_SIMILARITIES,
+)
+from rasa.shared.exceptions import RasaException, InvalidConfigException


 def test_align_token_features():
@@ -74,3 +88,46 @@ def test_init_split_entities_config(
         )
         == expected_initialized_config
     )
+
+
+@pytest.mark.parametrize(
+    "component_config, raises_exception",
+    [
+        ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: MARGIN}, True),
+        ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: SOFTMAX}, False),
+        ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: CROSS_ENTROPY}, False),
+        ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: MARGIN}, False),
+        ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: SOFTMAX}, False),
+        ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: CROSS_ENTROPY}, False),
+        ({MODEL_CONFIDENCE: INNER, LOSS_TYPE: MARGIN}, False),
+        ({MODEL_CONFIDENCE: INNER, LOSS_TYPE: SOFTMAX}, False),
+        ({MODEL_CONFIDENCE: INNER, LOSS_TYPE: CROSS_ENTROPY}, False),
+    ],
+)
+def test_confidence_loss_settings(
+    component_config: Dict[Text, Any], raises_exception: bool
+):
+    component_config[SIMILARITY_TYPE] = INNER
+    if raises_exception:
+        with pytest.raises(InvalidConfigException):
+            train_utils._check_confidence_setting(component_config)
+
+
+@pytest.mark.parametrize(
+    "component_config, raises_exception",
+    [
+        ({MODEL_CONFIDENCE: SOFTMAX, SIMILARITY_TYPE: INNER}, False),
+        ({MODEL_CONFIDENCE: SOFTMAX, SIMILARITY_TYPE: COSINE}, True),
+        ({MODEL_CONFIDENCE: COSINE, SIMILARITY_TYPE: INNER}, False),
+        ({MODEL_CONFIDENCE: COSINE, SIMILARITY_TYPE: COSINE}, False),
+        ({MODEL_CONFIDENCE: INNER, SIMILARITY_TYPE: INNER}, False),
+        ({MODEL_CONFIDENCE: INNER, SIMILARITY_TYPE: COSINE}, False),
+    ],
+)
+def test_confidence_similarity_settings(
+    component_config: Dict[Text, Any], raises_exception: bool
+):
+    component_config[LOSS_TYPE] = SOFTMAX
+    if raises_exception:
+        with pytest.raises(InvalidConfigException):
+            train_utils._check_confidence_setting(component_config)

From 25abb8d780d1c96dc89c9f0d2735dd86a9eb2184 Mon Sep 17 00:00:00 2001
From: Daksh
Date: Mon, 8 Feb 2021 19:04:23 +0100
Subject: [PATCH 41/44] remove prints

---
 rasa/nlu/classifiers/diet_classifier.py | 5 -----
 rasa/utils/train_utils.py               | 1 -
 tests/utils/test_train_utils.py         | 5 +----
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index ec7cf506a435..3292f9361e09 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -334,11 +334,6 @@ def __init__(

         super().__init__(component_config)

-        print(
-            self.component_config[CONSTRAIN_SIMILARITIES],
-            self.component_config[MODEL_CONFIDENCE],
-        )
-
         self._check_config_parameters()

         # transform numbers to labels
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index 2c643506356b..ecf0910729ea 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -430,7 +430,6 @@ def _check_loss_setting(component_config: Dict[Text, Any]) -> None:
         SOFTMAX,
         CROSS_ENTROPY,
     ]:
-        print("raising")
         rasa.shared.utils.io.raise_warning(
             f"{CONSTRAIN_SIMILARITIES} is set to `False`. It is recommended "
             f"to set it to `True` when using cross-entropy loss. It will be set to `True` by default, "
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
index ccddfd15be7b..33952b15a393 100644
--- a/tests/utils/test_train_utils.py
+++ b/tests/utils/test_train_utils.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 from typing import Any, Dict, Text
-from _pytest.logging import LogCaptureFixture
-import logging

 import rasa.utils.train_utils as train_utils
 from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS
@@ -22,9 +20,8 @@
     INNER,
     CROSS_ENTROPY,
     MARGIN,
-    CONSTRAIN_SIMILARITIES,
 )
-from rasa.shared.exceptions import RasaException, InvalidConfigException
+from rasa.shared.exceptions import InvalidConfigException


 def test_align_token_features():

From c2e74e108d208251d87429fba4d3c5dc776a5428 Mon Sep 17 00:00:00 2001
From: Daksh Varshneya
Date: Mon, 8 Feb 2021 19:25:34 +0100
Subject: [PATCH 42/44] Update docs/docs/migration-guide.mdx

---
 docs/docs/migration-guide.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx
index c66866edd102..342ba47f1c4e 100644
--- a/docs/docs/migration-guide.mdx
+++ b/docs/docs/migration-guide.mdx
@@ -22,7 +22,7 @@ them to an approximate range. You can turn on this option by setting `constrain_
 Also, a new option `model_confidence` has been added to each ML component. It affects how model's confidence for each label is computed during inference. It can take one of three values:
 1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1.
-2. `cosine` - Cosine similarity between input label embeddings. Confidence for each label will be in the range `[-1,1]`.
+2. `cosine` - Cosine similarity between input and label embeddings. Confidence for each label will be in the range `[-1,1]`.
 3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range.
 The default value is `softmax`, but we recommend using `cosine` as that will be the new default value from Rasa Open Source 3.0.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
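[Editorial aside: the migration-guide entry fixed above defines the three `model_confidence` options. For reference, here is a small NumPy sketch of those three computations on made-up embeddings; it illustrates the definitions from the text and is not Rasa's actual inference code.]

```python
import numpy as np


def label_confidences(
    input_emb: np.ndarray, label_embs: np.ndarray, model_confidence: str
) -> np.ndarray:
    sim = label_embs @ input_emb  # dot-product similarity, one value per label
    if model_confidence == "softmax":
        # Post-process with softmax: confidences over all labels sum to 1.
        exp = np.exp(sim - sim.max())
        return exp / exp.sum()
    if model_confidence == "cosine":
        # Cosine similarity: each confidence lies in [-1, 1].
        norms = np.linalg.norm(label_embs, axis=1) * np.linalg.norm(input_emb)
        return sim / norms
    if model_confidence == "inner":
        # Raw dot product: unbounded range.
        return sim
    raise ValueError(f"Unknown model_confidence '{model_confidence}'")


rng = np.random.default_rng(0)
input_emb = rng.normal(size=3)        # embedding of the user input
label_embs = rng.normal(size=(4, 3))  # embeddings of four candidate labels
for option in ("softmax", "cosine", "inner"):
    print(option, np.round(label_confidences(input_emb, label_embs, option), 3))
```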
From c2b9b9371e5875eaf6e10038708d1547f1a6c98c Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 8 Feb 2021 19:26:14 +0100 Subject: [PATCH 43/44] fix test --- tests/utils/test_train_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 33952b15a393..9b9d15f2cc93 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -108,6 +108,8 @@ def test_confidence_loss_settings( if raises_exception: with pytest.raises(InvalidConfigException): train_utils._check_confidence_setting(component_config) + else: + train_utils._check_confidence_setting(component_config) @pytest.mark.parametrize( @@ -128,3 +130,5 @@ def test_confidence_similarity_settings( if raises_exception: with pytest.raises(InvalidConfigException): train_utils._check_confidence_setting(component_config) + else: + train_utils._check_confidence_setting(component_config) From c69fdbb496d4623ac4b72694b176d2836c1c8269 Mon Sep 17 00:00:00 2001 From: Daksh Varshneya Date: Tue, 9 Feb 2021 09:42:38 +0100 Subject: [PATCH 44/44] Update rasa/utils/tensorflow/layers.py --- rasa/utils/tensorflow/layers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 44823059671e..bd3945eced1b 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -933,7 +933,6 @@ def _compute_softmax_loss( @property def _chosen_loss(self) -> Callable: """Use loss depending on given option.""" - if self.loss_type == MARGIN: return self._loss_margin elif self.loss_type == CROSS_ENTROPY:
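[Editorial aside: taken together, patches 40 and 43 pin down the behaviour of the configuration checks: `_check_confidence_setting` must raise `InvalidConfigException` when `model_confidence=softmax` is combined with margin loss or cosine similarity, and must pass silently for the remaining combinations. The sketch below is inferred from those parametrized tests and uses a stand-in exception class; the real check in `rasa/utils/train_utils.py` covers more cases and also emits the warnings shown earlier in the series.]

```python
from typing import Any, Dict, Text


class InvalidConfigException(ValueError):
    """Stand-in for rasa.shared.exceptions.InvalidConfigException."""


def check_confidence_setting(config: Dict[Text, Any]) -> None:
    # `model_confidence=softmax` only makes sense when similarities are
    # trained as softmax logits: cross-entropy (or the deprecated `softmax`)
    # loss together with inner-product similarity.
    if config["model_confidence"] == "softmax":
        if config["loss_type"] not in ("softmax", "cross_entropy"):
            raise InvalidConfigException(
                "model_confidence=softmax requires loss_type=cross_entropy."
            )
        if config["similarity_type"] != "inner":
            raise InvalidConfigException(
                "model_confidence=softmax requires similarity_type=inner."
            )


# Valid combination: passes silently.
check_confidence_setting(
    {"model_confidence": "softmax", "loss_type": "cross_entropy", "similarity_type": "inner"}
)
# Invalid combination: raises, mirroring the `True` cases in the tests above.
try:
    check_confidence_setting(
        {"model_confidence": "softmax", "loss_type": "margin", "similarity_type": "inner"}
    )
except InvalidConfigException as error:
    print(error)
```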