diff --git a/changelog/7616.improvement.md b/changelog/7616.improvement.md
new file mode 100644
index 000000000000..6eb78ea7c073
--- /dev/null
+++ b/changelog/7616.improvement.md
@@ -0,0 +1,24 @@
+Added two new parameters `constrain_similarities` and `model_confidence` to machine learning (ML) components - [DIETClassifier](components.mdx#dietclassifier), [ResponseSelector](components.mdx#responseselector) and [TEDPolicy](policies.mdx#ted-policy).
+
+Setting `constrain_similarities=True` adds a sigmoid cross-entropy loss on all similarity values to restrict them to an approximate range in `DotProductLoss`. This should help the models perform better on real-world test sets.
+By default, the parameter is set to `False` to preserve the old behaviour, but users are encouraged to set it to `True` and re-train their assistants, as it will be set to `True` by default from Rasa Open Source 3.0.0 onwards.
+
+The parameter `model_confidence` affects how the model's confidence for each label is computed during inference. It can take three values:
+1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which the confidences for all labels sum up to 1.
+2. `cosine` - Cosine similarity between input and label embeddings. Confidence for each label will be in the range `[-1,1]`.
+3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range.
+
+Setting `model_confidence=cosine` should help users tune the fallback thresholds of their assistant better. The default value is `softmax` to preserve the old behaviour, but we recommend using `cosine` as that will be the new default value from Rasa Open Source 3.0.0 onwards. The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
+
+With both of the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as:
+```yaml
+- name: DIETClassifier
+  model_confidence: cosine
+  constrain_similarities: True
+  ...
+```
+Once the assistant is re-trained with the above configuration, users should also tune fallback confidence thresholds.
+
+Configuration option `loss_type=softmax` is now deprecated and will be removed in Rasa Open Source 3.0.0. Use `loss_type=cross_entropy` instead.
+
+The default [auto-configuration](model-configuration.mdx#suggested-config) is changed to use `constrain_similarities=True` and `model_confidence=cosine` in ML components so that new users start with the recommended configuration.
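To make the three `model_confidence` modes described above concrete, here is a minimal NumPy sketch of how a confidence score could be derived from input and label embeddings. This is illustrative only, not the actual Rasa implementation: the function name `compute_confidences` and the shapes are hypothetical, but the softmax/cosine/inner behaviour mirrors the changelog text.

```python
import numpy as np

def compute_confidences(
    input_embedding: np.ndarray,   # shape: (embedding_dim,)
    label_embeddings: np.ndarray,  # shape: (num_labels, embedding_dim)
    model_confidence: str = "softmax",
) -> np.ndarray:
    """Sketch of the three `model_confidence` options (hypothetical helper)."""
    if model_confidence == "cosine":
        # Normalize both sides to unit length so dot products become
        # cosine similarities bounded in [-1, 1].
        input_embedding = input_embedding / np.linalg.norm(input_embedding)
        label_embeddings = label_embeddings / np.linalg.norm(
            label_embeddings, axis=-1, keepdims=True
        )
    # Dot product similarity between the input and every label.
    similarities = label_embeddings @ input_embedding
    if model_confidence == "softmax":
        # Post-process with softmax so confidences are positive and sum to 1.
        exps = np.exp(similarities - similarities.max())
        return exps / exps.sum()
    # For "cosine" and "inner", similarities are returned unchanged.
    return similarities

# Example: three labels in a four-dimensional embedding space.
rng = np.random.default_rng(0)
labels = rng.normal(size=(3, 4))
query = rng.normal(size=4)
print(compute_confidences(query, labels, "softmax"))  # sums to 1
print(compute_confidences(query, labels, "cosine"))   # each value in [-1, 1]
```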
diff --git a/data/test_config/config_empty_en_after_dumping.yml b/data/test_config/config_empty_en_after_dumping.yml index 20507a3944af..79c21d70c4a7 100644 --- a/data/test_config/config_empty_en_after_dumping.yml +++ b/data/test_config/config_empty_en_after_dumping.yml @@ -13,9 +13,13 @@ pipeline: # max_ngram: 4 # - name: DIETClassifier # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: EntitySynonymMapper # - name: ResponseSelector # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: FallbackClassifier # threshold: 0.3 # ambiguity_threshold: 0.1 @@ -27,4 +31,6 @@ policies: # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy diff --git a/data/test_config/config_empty_en_after_dumping_core.yml b/data/test_config/config_empty_en_after_dumping_core.yml index 1488270ddf39..adb3c2a0af55 100644 --- a/data/test_config/config_empty_en_after_dumping_core.yml +++ b/data/test_config/config_empty_en_after_dumping_core.yml @@ -8,4 +8,6 @@ policies: # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy diff --git a/data/test_config/config_empty_en_after_dumping_nlu.yml b/data/test_config/config_empty_en_after_dumping_nlu.yml index a4cb5077bf58..8249b17a0e11 100644 --- a/data/test_config/config_empty_en_after_dumping_nlu.yml +++ b/data/test_config/config_empty_en_after_dumping_nlu.yml @@ -13,9 +13,13 @@ pipeline: # max_ngram: 4 # - name: DIETClassifier # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: EntitySynonymMapper # - name: ResponseSelector # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: FallbackClassifier # threshold: 0.3 # ambiguity_threshold: 0.1 diff --git a/data/test_config/config_empty_fr_after_dumping.yml b/data/test_config/config_empty_fr_after_dumping.yml index 8148c3ebee68..a2ea89f4bf0a 100644 --- a/data/test_config/config_empty_fr_after_dumping.yml +++ b/data/test_config/config_empty_fr_after_dumping.yml @@ -13,9 +13,13 @@ pipeline: # max_ngram: 4 # - name: DIETClassifier # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: EntitySynonymMapper # - name: ResponseSelector # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: FallbackClassifier # threshold: 0.3 # ambiguity_threshold: 0.1 @@ -27,4 +31,6 @@ policies: # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy diff --git a/data/test_config/config_with_comments_after_dumping.yml b/data/test_config/config_with_comments_after_dumping.yml index 16b6129d18f9..ef0743f894de 100644 --- a/data/test_config/config_with_comments_after_dumping.yml +++ b/data/test_config/config_with_comments_after_dumping.yml @@ -27,6 +27,8 @@ policies: # even here # - name: TEDPolicy # max_history: 5 # epochs: 100 +# constrain_similarities: true +# model_confidence: cosine # - name: RulePolicy # comments everywhere diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 490e108ff358..ef5ac83db5a1 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -1531,10 +1531,12 @@ However, additional parameters exist that can be adapted. | similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | | | | or 'inner'. 
| +---------------------------------+------------------+--------------------------------------------------------------+ -| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | +| loss_type | "cross_entropy" | The type of the loss function, either 'cross_entropy' | +| | | or 'margin'. | +---------------------------------+------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top actions to normalize scores for loss type | -| | | 'softmax'. Set to 0 to turn off normalization. | +| ranking_length | 10 | Number of top intents to normalize scores for. Applicable | +| | | only with loss type 'cross_entropy' and 'softmax' | +| | | confidences. Set to 0 to disable normalization. | +---------------------------------+------------------+--------------------------------------------------------------+ | maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | | | | embedding vectors for correct labels. | @@ -1616,6 +1618,24 @@ However, additional parameters exist that can be adapted. | | | ... | | | | ``` | +---------------------------------+------------------+--------------------------------------------------------------+ +| constrain_similarities | False | If `True`, applies sigmoid on all similarity terms and adds | +| | | it to the loss function to ensure that similarity values are | +| | | approximately bounded. Used only if `loss_type=cross_entropy`| ++---------------------------------+------------------+--------------------------------------------------------------+ +| model_confidence | "softmax" | Affects how model's confidence for each intent | +| | | is computed. It can take three values | +| | | 1. `softmax` - Similarities between input and intent | +| | | embeddings are post-processed with a softmax function, | +| | | as a result of which confidence for all intents sum up to 1. | +| | | 2. `cosine` - Cosine similarity between input and intent | +| | | embeddings. Confidence for each intent is in the | +| | | range `[-1,1]`. | +| | | 3. `inner` - Dot product similarity between input and intent | +| | | embeddings. Confidence for each intent is in an unbounded | +| | | range. | +| | | This parameter does not affect the confidence for entity | +| | | prediction. | ++---------------------------------+------------------+--------------------------------------------------------------+ ``` :::note @@ -2742,10 +2762,12 @@ However, additional parameters exist that can be adapted. | similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | | | | or 'inner'. | +---------------------------------+-------------------+--------------------------------------------------------------+ -| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | +| loss_type | "cross_entropy" | The type of the loss function, either 'cross_entropy' | +| | | or 'margin'. | +---------------------------------+-------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top actions to normalize scores for loss type | -| | | 'softmax'. Set to 0 to turn off normalization. | +| ranking_length | 10 | Number of top responses to normalize scores for. Applicable | +| | | only with loss type 'cross_entropy' and 'softmax' | +| | | confidences. Set to 0 to disable normalization. 
| +---------------------------------+-------------------+--------------------------------------------------------------+ | maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | | | | embedding vectors for correct labels. | @@ -2814,6 +2836,22 @@ However, additional parameters exist that can be adapted. | | | Requires `evaluate_on_number_of_examples > 0` and | | | | `evaluate_every_number_of_epochs > 0` | +---------------------------------+-------------------+--------------------------------------------------------------+ +| constrain_similarities | False | If `True`, applies sigmoid on all similarity terms and adds | +| | | it to the loss function to ensure that similarity values are | +| | | approximately bounded. Used only if `loss_type=cross_entropy`| ++---------------------------------+-------------------+--------------------------------------------------------------+ +| model_confidence | "softmax" | Affects how model's confidence for each response label | +| | | is computed. It can take three values | +| | | 1. `softmax` - Similarities between input and response label | +| | | embeddings are post-processed with a softmax function, | +| | | as a result of which confidence for all labels sum up to 1. | +| | | 2. `cosine` - Cosine similarity between input and response | +| | | label embeddings. Confidence for each label is in the | +| | | range `[-1,1]`. | +| | | 3. `inner` - Dot product similarity between input and | +| | | response label embeddings. Confidence for each label is in an| +| | | unbounded range. | ++---------------------------------+-------------------+--------------------------------------------------------------+ ``` :::note diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 203b3d7bcc3b..5dd3c9e7dbf0 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -10,6 +10,33 @@ description: | This page contains information about changes between major versions and how you can migrate from one version to another. +## Rasa 2.2 to Rasa 2.3 + +### Machine Learning Components + +A few changes have been made to the loss function inside machine learning (ML) +components `DIETClassifier`, `ResponseSelector` and `TEDPolicy`. These include: +1. Configuration option `loss_type=softmax` is now deprecated and will be removed in Rasa Open Source 3.0.0. Use `loss_type=cross_entropy` instead. +2. The default loss function (`loss_type=cross_entropy`) can add an optional sigmoid cross-entropy loss of all similarity values to constrain +them to an approximate range. You can turn on this option by setting `constrain_similarities=True`. This should help the models to perform better on real world test sets. + +Also, a new option `model_confidence` has been added to each ML component. It affects how a model's confidence for each label is computed during inference. It can take one of three values: +1. `softmax` - Similarities between input and label embeddings are post-processed with a softmax function, as a result of which confidence for all labels sum up to 1. +2. `cosine` - Cosine similarity between input and label embeddings. Confidence for each label will be in the range `[-1,1]`. +3. `inner` - Dot product similarity between input and label embeddings. Confidence for each label will be in an unbounded range. +The default value is `softmax`, but we recommend using `cosine` as that will be the new default value from Rasa Open Source 3.0.0 onwards. 
+The value of this option does not affect how confidences are computed for entity predictions in `DIETClassifier` and `TEDPolicy`.
+
+With both of the above recommendations, users should configure their ML component, e.g. `DIETClassifier`, as:
+```yaml
+- name: DIETClassifier
+  model_confidence: cosine
+  constrain_similarities: True
+  ...
+```
+Once the assistant is re-trained with the above configuration, users should also tune fallback confidence thresholds.
+
+
 ## Rasa 2.1 to Rasa 2.2
 
 ### General
diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx
index 9292c5fb05fb..bc2d4c1c4e85 100644
--- a/docs/docs/policies.mdx
+++ b/docs/docs/policies.mdx
@@ -268,10 +268,12 @@ However, additional parameters exist that can be adapted.
 | similarity_type                       | "auto"                 | Type of similarity measure to use, either 'auto' or 'cosine' |
 |                                       |                        | or 'inner'.                                                  |
 +---------------------------------------+------------------------+--------------------------------------------------------------+
-| loss_type                             | "softmax"              | The type of the loss function, either 'softmax' or 'margin'. |
+| loss_type                             | "cross_entropy"        | The type of the loss function, either 'cross_entropy'        |
+|                                       |                        | or 'margin'.                                                 |
 +---------------------------------------+------------------------+--------------------------------------------------------------+
-| ranking_length                        | 10                     | Number of top actions to normalize scores for loss type      |
-|                                       |                        | 'softmax'. Set to 0 to turn off normalization.               |
+| ranking_length                        | 10                     | Number of top actions to normalize scores for. Applicable    |
+|                                       |                        | only with loss type 'cross_entropy' and 'softmax'            |
+|                                       |                        | confidences. Set to 0 to disable normalization.              |
 +---------------------------------------+------------------------+--------------------------------------------------------------+
 | maximum_positive_similarity           | 0.8                    | Indicates how similar the algorithm should try to make       |
 |                                       |                        | embedding vectors for correct labels.                        |
@@ -344,6 +346,22 @@ However, additional parameters exist that can be adapted.
 | entity_recognition                    | True                   | If 'True' entity recognition is trained and entities are     |
 |                                       |                        | extracted.                                                   |
 +---------------------------------------+------------------------+--------------------------------------------------------------+
+| constrain_similarities                | False                  | If `True`, applies sigmoid on all similarity terms and adds  |
+|                                       |                        | it to the loss function to ensure that similarity values are |
+|                                       |                        | approximately bounded. Used only if `loss_type=cross_entropy`|
++---------------------------------------+------------------------+--------------------------------------------------------------+
+| model_confidence                      | "softmax"              | Affects how the model's confidence for each action           |
+|                                       |                        | is computed. It can take three values:                       |
+|                                       |                        | 1. `softmax` - Similarities between input and action         |
+|                                       |                        | embeddings are post-processed with a softmax function,       |
+|                                       |                        | as a result of which confidences for all labels sum up to 1. |
+|                                       |                        | 2. `cosine` - Cosine similarity between input and action     |
+|                                       |                        | embeddings. Confidence for each label is in the              |
+|                                       |                        | range `[-1,1]`.                                              |
+|                                       |                        | 3. `inner` - Dot product similarity between input and action |
+|                                       |                        | embeddings. Confidence for each label is in an               |
+|                                       |                        | unbounded range.                                             |
++---------------------------------------+------------------------+--------------------------------------------------------------+
 | BILOU_flag                            | True                   | If 'True', additional BILOU tags are added to entity labels.
| +---------------------------------------+------------------------+--------------------------------------------------------------+ | split_entities_by_comma | True | Splits a list of extracted entities by comma to treat each | diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 09b3a7a7cdb4..8eaa404ebd5d 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -82,7 +82,7 @@ KEY_RELATIVE_ATTENTION, VALUE_RELATIVE_ATTENTION, MAX_RELATIVE_POSITION, - SOFTMAX, + CROSS_ENTROPY, AUTO, BALANCED, TENSORBOARD_LOG_DIR, @@ -102,6 +102,9 @@ HIDDEN_LAYERS_SIZES, FEATURIZERS, ENTITY_RECOGNITION, + CONSTRAIN_SIMILARITIES, + MODEL_CONFIDENCE, + SOFTMAX, BILOU_FLAG, ) from rasa.shared.core.events import EntitiesAdded, Event @@ -212,10 +215,11 @@ class TEDPolicy(Policy): NUM_NEG: 20, # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. SIMILARITY_TYPE: AUTO, - # The type of the loss function, either 'softmax' or 'margin'. - LOSS_TYPE: SOFTMAX, - # Number of top actions to normalize scores for loss type 'softmax'. - # Set to 0 to turn off normalization. + # The type of the loss function, either 'cross_entropy' or 'margin'. + LOSS_TYPE: CROSS_ENTROPY, + # Number of top actions to normalize scores for. Applicable with + # loss type 'cross_entropy' and 'softmax' confidences. Set to 0 + # to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors # for correct labels. @@ -277,6 +281,13 @@ class TEDPolicy(Policy): FEATURIZERS: [], # If set to true, entities are predicted in user utterances. ENTITY_RECOGNITION: True, + # if 'True' applies sigmoid on all similarity terms and adds + # it to the loss function to ensure that similarity values are + # approximately bounded. Used inside softmax loss only. + CONSTRAIN_SIMILARITIES: False, + # Model confidence to be returned during inference. Possible values - + # 'softmax', 'cosine' and 'inner'. + MODEL_CONFIDENCE: SOFTMAX, # 'BILOU_flag' determines whether to use BILOU tagging or not. # If set to 'True' labelling is more rigorous, however more # examples per entity are required. @@ -336,6 +347,12 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self.config = rasa.utils.train_utils.override_defaults( self.defaults, new_config ) + + self.config = rasa.utils.train_utils.update_confidence_type(self.config) + + rasa.utils.train_utils.validate_configuration_settings(self.config) + + self.config = rasa.utils.train_utils.update_deprecated_loss_type(self.config) self.config = rasa.utils.train_utils.update_similarity_type(self.config) self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config) @@ -606,7 +623,9 @@ def predict_action_probabilities( # take correct prediction from batch confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) - if self.config[LOSS_TYPE] == SOFTMAX and self.config[RANKING_LENGTH] > 0: + if self.config[RANKING_LENGTH] > 0 and self.config[MODEL_CONFIDENCE] == SOFTMAX: + # TODO: This should be removed in 3.0 when softmax as + # model confidence and normalization is completely deprecated. 
confidence = rasa.utils.train_utils.normalize( confidence, self.config[RANKING_LENGTH] ) @@ -790,7 +809,10 @@ def load( model_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data ) + meta = rasa.utils.train_utils.override_defaults(cls.defaults, meta) + meta = rasa.utils.train_utils.update_confidence_type(meta) meta = rasa.utils.train_utils.update_similarity_type(meta) + meta = rasa.utils.train_utils.update_deprecated_loss_type(meta) meta[EPOCHS] = epoch_override @@ -1710,15 +1732,14 @@ def batch_predict( ) = self._embed_dialogue(dialogue_in, tf_batch_data) dialogue_mask = tf.squeeze(dialogue_mask, axis=-1) - sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + sim_all, scores = self._tf_layers[ + f"loss.{LABEL}" + ]._similarity_confidence_from_embeddings( dialogue_embed[:, :, tf.newaxis, :], self.all_labels_embed[tf.newaxis, tf.newaxis, :, :], dialogue_mask, ) - scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( - sim_all, self.config[SIMILARITY_TYPE] - ) predictions = { "action_scores": scores, "similarities": sim_all, diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index c50204fd1912..3292f9361e09 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -85,9 +85,9 @@ KEY_RELATIVE_ATTENTION, VALUE_RELATIVE_ATTENTION, MAX_RELATIVE_POSITION, - SOFTMAX, AUTO, BALANCED, + CROSS_ENTROPY, TENSORBOARD_LOG_LEVEL, CONCAT_DIMENSION, FEATURIZERS, @@ -97,6 +97,9 @@ SEQUENCE_LENGTH, DENSE_DIMENSION, MASK, + CONSTRAIN_SIMILARITIES, + MODEL_CONFIDENCE, + SOFTMAX, ) logger = logging.getLogger(__name__) @@ -175,10 +178,11 @@ def required_components(cls) -> List[Type[Component]]: NUM_NEG: 20, # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. SIMILARITY_TYPE: AUTO, - # The type of the loss function, either 'softmax' or 'margin'. - LOSS_TYPE: SOFTMAX, - # Number of top actions to normalize scores for loss type 'softmax'. - # Set to 0 to turn off normalization. + # The type of the loss function, either 'cross_entropy' or 'margin'. + LOSS_TYPE: CROSS_ENTROPY, + # Number of top intents to normalize scores for. Applicable with + # loss type 'cross_entropy' and 'softmax' confidences. Set to 0 + # to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors # for correct labels. @@ -245,6 +249,13 @@ def required_components(cls) -> List[Type[Component]]: # Split entities by comma, this makes sense e.g. for a list of ingredients # in a recipie, but it doesn't make sense for the parts of an address SPLIT_ENTITIES_BY_COMMA: True, + # If 'True' applies sigmoid on all similarity terms and adds + # it to the loss function to ensure that similarity values are + # approximately bounded. Used inside softmax loss only. + CONSTRAIN_SIMILARITIES: False, + # Model confidence to be returned during inference. Possible values - + # 'softmax', 'cosine', 'inner'. 
+ MODEL_CONFIDENCE: SOFTMAX, } # init helpers @@ -284,6 +295,16 @@ def _check_config_parameters(self) -> None: self._check_masked_lm() self._check_share_hidden_layers_sizes() + self.component_config = train_utils.update_confidence_type( + self.component_config + ) + + train_utils.validate_configuration_settings(self.component_config) + + self.component_config = train_utils.update_deprecated_loss_type( + self.component_config + ) + self.component_config = train_utils.update_similarity_type( self.component_config ) @@ -850,9 +871,11 @@ def _predict_label( label_ids = message_sim.argsort()[::-1] if ( - self.component_config[LOSS_TYPE] == SOFTMAX - and self.component_config[RANKING_LENGTH] > 0 + self.component_config[RANKING_LENGTH] > 0 + and self.component_config[MODEL_CONFIDENCE] == SOFTMAX ): + # TODO: This should be removed in 3.0 when softmax as + # model confidence and normalization is completely deprecated. message_sim = train_utils.normalize( message_sim, self.component_config[RANKING_LENGTH] ) @@ -1000,7 +1023,10 @@ def load( data_example, ) = cls._load_from_files(meta, model_dir) + meta = train_utils.override_defaults(cls.defaults, meta) + meta = train_utils.update_confidence_type(meta) meta = train_utils.update_similarity_type(meta) + meta = train_utils.update_deprecated_loss_type(meta) model = cls._load_model( entity_tag_specs, @@ -1651,12 +1677,11 @@ def _batch_predict_intents( sentence_vector = self._last_token(text_transformed, sequence_lengths) sentence_vector_embed = self._tf_layers[f"embed.{TEXT}"](sentence_vector) - sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + _, scores = self._tf_layers[ + f"loss.{LABEL}" + ]._similarity_confidence_from_embeddings( sentence_vector_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], ) - scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( - sim_all, self.config[SIMILARITY_TYPE] - ) return {"i_scores": scores} diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index d1c4626f647e..f6aa535f6298 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -66,7 +66,7 @@ MAX_RELATIVE_POSITION, RETRIEVAL_INTENT, USE_TEXT_AS_LABEL, - SOFTMAX, + CROSS_ENTROPY, AUTO, BALANCED, TENSORBOARD_LOG_DIR, @@ -75,6 +75,9 @@ FEATURIZERS, CHECKPOINT_MODEL, DENSE_DIMENSION, + CONSTRAIN_SIMILARITIES, + MODEL_CONFIDENCE, + SOFTMAX, ) from rasa.nlu.constants import ( RESPONSE_SELECTOR_PROPERTY_NAME, @@ -171,10 +174,11 @@ def required_components(cls) -> List[Type[Component]]: NUM_NEG: 20, # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. SIMILARITY_TYPE: AUTO, - # The type of the loss function, either 'softmax' or 'margin'. - LOSS_TYPE: SOFTMAX, - # Number of top actions to normalize scores for loss type 'softmax'. - # Set to 0 to turn off normalization. + # The type of the loss function, either 'cross_entropy' or 'margin'. + LOSS_TYPE: CROSS_ENTROPY, + # Number of top actions to normalize scores for. Applicable with + # loss type 'cross_entropy' and 'softmax' confidences. Set to 0 + # to turn off normalization. RANKING_LENGTH: 10, # Indicates how similar the algorithm should try to make embedding vectors # for correct labels. @@ -232,6 +236,13 @@ def required_components(cls) -> List[Type[Component]]: FEATURIZERS: [], # Perform model checkpointing CHECKPOINT_MODEL: False, + # if 'True' applies sigmoid on all similarity terms and adds it + # to the loss function to ensure that similarity values are + # approximately bounded. 
Used inside softmax loss only. + CONSTRAIN_SIMILARITIES: False, + # Model confidence to be returned during inference. Possible values - + # 'softmax', 'cosine', 'inner'. + MODEL_CONFIDENCE: SOFTMAX, } def __init__( @@ -244,7 +255,18 @@ def __init__( responses: Optional[Dict[Text, List[Dict[Text, Any]]]] = None, finetune_mode: bool = False, ) -> None: + """Declare instance variables with default values. + Args: + component_config: Configuration for the component. + index_label_id_mapping: Mapping between label and index used for encoding. + entity_tag_specs: Format specification all entity tags. + model: Model architecture. + all_retrieval_intents: All retrieval intents defined in the data. + responses: All responses defined in the data. + finetune_mode: If `True` loads the model with pre-trained weights, + otherwise initializes it with random weights. + """ component_config = component_config or {} # the following properties cannot be adapted for the ResponseSelector @@ -755,13 +777,12 @@ def batch_predict( sentence_vector = self._last_token(text_transformed, sequence_lengths_text) sentence_vector_embed = self._tf_layers[f"embed.{TEXT}"](sentence_vector) - sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + _, scores = self._tf_layers[ + f"loss.{LABEL}" + ]._similarity_confidence_from_embeddings( sentence_vector_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], ) - scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( - sim_all, self.config[SIMILARITY_TYPE] - ) out["i_scores"] = scores return out diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 837aec238855..e9f819d9e243 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -927,7 +927,7 @@ def evaluate_entities( merged_targets, merged_predictions, merged_confidences, - title="Entity Confusion matrix", + title="Entity Prediction Confidence Distribution", hist_filename=histogram_filename, ) diff --git a/rasa/shared/importers/default_config.yml b/rasa/shared/importers/default_config.yml index 95c9716b0d4e..63d10d9249ab 100644 --- a/rasa/shared/importers/default_config.yml +++ b/rasa/shared/importers/default_config.yml @@ -13,9 +13,13 @@ pipeline: max_ngram: 4 - name: DIETClassifier epochs: 100 + constrain_similarities: true + model_confidence: cosine - name: EntitySynonymMapper - name: ResponseSelector epochs: 100 + constrain_similarities: true + model_confidence: cosine - name: FallbackClassifier threshold: 0.3 ambiguity_threshold: 0.1 @@ -27,4 +31,6 @@ policies: - name: TEDPolicy max_history: 5 epochs: 100 + constrain_similarities: true + model_confidence: cosine - name: RulePolicy diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py index 7eba3d6d0f7f..c816f26f77a9 100644 --- a/rasa/utils/plotting.py +++ b/rasa/utils/plotting.py @@ -5,6 +5,7 @@ import numpy as np from typing import List, Text, Optional, Union, Any import matplotlib +from matplotlib.ticker import FormatStrFormatter import rasa.shared.utils.io from rasa.constants import RESULTS_FILE @@ -133,21 +134,37 @@ def plot_histogram( # Wine-ish colour for the confidences of hits. # Blue-ish colour for the confidences of misses. 
colors = ["#009292", "#920000"] - bins = [0.05 * i for i in range(1, 21)] + n_bins = 25 + max_value = max( + [max(hist_data[0], default=0), max(hist_data[1], default=0)], default=0 + ) + min_value = min( + [min(hist_data[0], default=0), min(hist_data[1], default=0)], default=0 + ) + + bin_width = (max_value - min_value) / n_bins + bins = [min_value + (i * bin_width) for i in range(1, n_bins + 1)] binned_data_sets = [np.histogram(d, bins=bins)[0] for d in hist_data] max_xlims = [max(binned_data_set) for binned_data_set in binned_data_sets] max_xlims = [xlim + np.ceil(0.25 * xlim) for xlim in max_xlims] # padding - min_ylim = bins[ - min( - [ - (binned_data_set != 0).argmax(axis=0) - for binned_data_set in binned_data_sets - ] - ) - ] + min_ylim = ( + bins[ + min( + [ + (binned_data_set != 0).argmax(axis=0) + for binned_data_set in binned_data_sets + ] + ) + ] + - bin_width + ) + + max_ylim = max(bins) + bin_width + + yticks = [float("{:.2f}".format(x)) for x in bins] centers = 0.5 * (0.05 + (bins + np.roll(bins, 0))[:-1]) heights = 0.75 * np.diff(bins) @@ -170,16 +187,20 @@ def plot_histogram( color=colors[1], label="misses", ) + axes[1].set(title="Wrong") - axes[0].set(yticks=bins, xlim=(0, max_xlims[0]), ylim=(min_ylim, 1.0)) - axes[1].set(yticks=bins, xlim=(0, max_xlims[1]), ylim=(min_ylim, 1.0)) + axes[0].set(yticks=yticks, xlim=(0, max_xlims[0]), ylim=(min_ylim, max_ylim)) + axes[1].set(yticks=yticks, xlim=(0, max_xlims[1]), ylim=(min_ylim, max_ylim)) + + axes[0].yaxis.set_major_formatter(FormatStrFormatter("%.2f")) + axes[0].yaxis.set_minor_formatter(FormatStrFormatter("%.2f")) axes[0].invert_xaxis() axes[0].yaxis.tick_right() fig.subplots_adjust( - wspace=0.14 + wspace=0.17 ) # get the graphs exactly far enough apart for yaxis labels fig.suptitle(title, fontsize="x-large", fontweight="bold") diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index 29c046258dac..d43c85066b9e 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -38,6 +38,7 @@ DROP_RATE_ATTENTION = "drop_rate_attention" DROP_RATE_DIALOGUE = "drop_rate_dialogue" DROP_RATE_LABEL = "drop_rate_label" +CONSTRAIN_SIMILARITIES = "constrain_similarities" WEIGHT_SPARSITY = "weight_sparsity" @@ -52,6 +53,7 @@ DENSE_INPUT_DROPOUT = "use_dense_input_dropout" RANKING_LENGTH = "ranking_length" +MODEL_CONFIDENCE = "model_confidence" BILOU_FLAG = "BILOU_flag" @@ -64,6 +66,7 @@ AUTO = "auto" INNER = "inner" COSINE = "cosine" +CROSS_ENTROPY = "cross_entropy" BALANCED = "balanced" diff --git a/rasa/utils/tensorflow/exceptions.py b/rasa/utils/tensorflow/exceptions.py new file mode 100644 index 000000000000..53e1cd4703c1 --- /dev/null +++ b/rasa/utils/tensorflow/exceptions.py @@ -0,0 +1,5 @@ +from rasa.shared.exceptions import RasaException + + +class TFLayerConfigException(RasaException): + """Raised when wrong parameters are passed to tensorflow layers.""" diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 3e9007d90af7..4b1266d20b3e 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -5,7 +5,14 @@ import rasa.utils.tensorflow.crf from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras import backend as K -from rasa.utils.tensorflow.constants import SOFTMAX, MARGIN, COSINE, INNER +from rasa.utils.tensorflow.constants import ( + SOFTMAX, + MARGIN, + COSINE, + INNER, + CROSS_ENTROPY, +) +from rasa.utils.tensorflow.exceptions import TFLayerConfigException logger = 
logging.getLogger(__name__) @@ -269,13 +276,6 @@ def call( class Embed(tf.keras.layers.Layer): """Dense embedding layer. - Arguments: - embed_dim: Positive integer, dimensionality of the output space. - reg_lambda: Float; regularization factor. - layer_name_suffix: Text added to the name of the layers. - similarity_type: Optional type of similarity measure to use, - either 'cosine' or 'inner'. - Input shape: N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common situation would be @@ -288,20 +288,16 @@ class Embed(tf.keras.layers.Layer): """ def __init__( - self, - embed_dim: int, - reg_lambda: float, - layer_name_suffix: Text, - similarity_type: Optional[Text] = None, + self, embed_dim: int, reg_lambda: float, layer_name_suffix: Text ) -> None: - super().__init__(name=f"embed_{layer_name_suffix}") + """Initialize layer. - self.similarity_type = similarity_type - if self.similarity_type and self.similarity_type not in {COSINE, INNER}: - raise ValueError( - f"Wrong similarity type '{self.similarity_type}', " - f"should be '{COSINE}' or '{INNER}'." - ) + Args: + embed_dim: Dimensionality of the output space. + reg_lambda: Regularization factor. + layer_name_suffix: Text added to the name of the layers. + """ + super().__init__(name=f"embed_{layer_name_suffix}") regularizer = tf.keras.regularizers.l2(reg_lambda) self._dense = tf.keras.layers.Dense( @@ -313,10 +309,8 @@ def __init__( # noinspection PyMethodOverriding def call(self, x: tf.Tensor) -> tf.Tensor: + """Apply dense layer.""" x = self._dense(x) - if self.similarity_type == COSINE: - x = tf.nn.l2_normalize(x, axis=-1) - return x @@ -542,31 +536,7 @@ def f1_score( class DotProductLoss(tf.keras.layers.Layer): - """Dot-product loss layer. - - Arguments: - num_neg: Positive integer, the number of incorrect labels; - the algorithm will minimize their similarity to the input. - loss_type: The type of the loss function, either 'softmax' or 'margin'. - mu_pos: Float, indicates how similar the algorithm should - try to make embedding vectors for correct labels; - should be 0.0 < ... < 1.0 for 'cosine' similarity type. - mu_neg: Float, maximum negative similarity for incorrect labels, - should be -1.0 < ... < 1.0 for 'cosine' similarity type. - use_max_sim_neg: Boolean, if 'True' the algorithm only minimizes - maximum similarity over incorrect intent labels, - used only if 'loss_type' is set to 'margin'. - neg_lambda: Float, the scale of how important is to minimize - the maximum similarity between embeddings of different labels, - used only if 'loss_type' is set to 'margin'. - scale_loss: Boolean, if 'True' scale loss inverse proportionally to - the confidence of the correct prediction. - name: Optional name of the layer. - parallel_iterations: Positive integer, the number of iterations allowed - to run in parallel. - same_sampling: Boolean, if 'True' sample same negative labels - for the whole batch. - """ + """Dot-product loss layer.""" def __init__( self, @@ -577,10 +547,45 @@ def __init__( use_max_sim_neg: bool, neg_lambda: float, scale_loss: bool, + similarity_type: Text, name: Optional[Text] = None, - parallel_iterations: int = 1000, same_sampling: bool = False, + constrain_similarities: bool = True, + model_confidence: Text = SOFTMAX, ) -> None: + """Declare instance variables with default values. + + Args: + num_neg: Positive integer, the number of incorrect labels; + the algorithm will minimize their similarity to the input. + loss_type: The type of the loss function, either 'cross_entropy' or 'margin'. 
+            mu_pos: Float, indicates how similar the algorithm should
+                try to make embedding vectors for correct labels;
+                should be 0.0 < ... < 1.0 for 'cosine' similarity type.
+            mu_neg: Float, maximum negative similarity for incorrect labels,
+                should be -1.0 < ... < 1.0 for 'cosine' similarity type.
+            use_max_sim_neg: Boolean, if 'True' the algorithm only minimizes
+                maximum similarity over incorrect intent labels,
+                used only if 'loss_type' is set to 'margin'.
+            neg_lambda: Float, the scale of how important it is to minimize
+                the maximum similarity between embeddings of different labels,
+                used only if 'loss_type' is set to 'margin'.
+            scale_loss: Boolean, if 'True' scale loss inverse proportionally to
+                the confidence of the correct prediction.
+            similarity_type: Similarity measure to use, either 'cosine' or 'inner'.
+            name: Optional name of the layer.
+            same_sampling: Boolean, if 'True' sample same negative labels
+                for the whole batch.
+            constrain_similarities: Boolean, if 'True' applies sigmoid on all
+                similarity terms and adds to the loss function to
+                ensure that similarity values are approximately bounded.
+                Used inside _loss_cross_entropy() only.
+            model_confidence: Model confidence to be returned during inference.
+                Possible values - 'softmax', 'cosine' and 'inner'.
+
+        Raises:
+            TFLayerConfigException: When `similarity_type` is not one of 'cosine'
+                or 'inner'.
+        """
         super().__init__(name=name)
         self.num_neg = num_neg
         self.loss_type = loss_type
@@ -589,8 +594,15 @@
         self.use_max_sim_neg = use_max_sim_neg
         self.neg_lambda = neg_lambda
         self.scale_loss = scale_loss
-        self.parallel_iterations = parallel_iterations
         self.same_sampling = same_sampling
+        self.constrain_similarities = constrain_similarities
+        self.model_confidence = model_confidence
+        self.similarity_type = similarity_type
+        if self.similarity_type not in {COSINE, INNER}:
+            raise TFLayerConfigException(
+                f"Wrong similarity type '{self.similarity_type}', "
+                f"should be '{COSINE}' or '{INNER}'."
+            )
 
     @staticmethod
     def _make_flat(x: tf.Tensor) -> tf.Tensor:
@@ -685,24 +697,49 @@
             labels_bad_negs,
         )
 
-    @staticmethod
-    def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
+    def sim(
+        self, a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None
+    ) -> tf.Tensor:
         """Calculate similarity between given tensors."""
-
+        if self.similarity_type == COSINE:
+            a = tf.nn.l2_normalize(a, axis=-1)
+            b = tf.nn.l2_normalize(b, axis=-1)
         sim = tf.reduce_sum(a * b, axis=-1)
         if mask is not None:
             sim *= tf.expand_dims(mask, 2)
 
         return sim
 
-    @staticmethod
-    def confidence_from_sim(sim: tf.Tensor, similarity_type: Text) -> tf.Tensor:
-        if similarity_type == COSINE:
-            # clip negative values to zero
-            return tf.nn.relu(sim)
-        else:
-            # normalize result to [0, 1] with softmax
-            return tf.nn.softmax(sim)
+    def _similarity_confidence_from_embeddings(
+        self,
+        input_embeddings: tf.Tensor,
+        label_embeddings: tf.Tensor,
+        mask: Optional[tf.Tensor] = None,
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
+        """Computes similarity between input and label embeddings and model's confidence.
+
+        First compute the similarity from embeddings and then apply an activation
+        function if needed to get the confidence.
+
+        Args:
+            input_embeddings: Embeddings of input.
+            label_embeddings: Embeddings of labels.
+            mask: Mask over input and output sequence.
+
+        Returns:
+            Similarity between input and label embeddings and the model's
+            prediction confidence for each label.
+ """ + # If model's prediction confidence is configured to be cosine similarity, + # then normalize embeddings to unit vectors. + if self.model_confidence == COSINE: + input_embeddings = tf.nn.l2_normalize(input_embeddings, axis=-1) + label_embeddings = tf.nn.l2_normalize(label_embeddings, axis=-1) + + similarities = self.sim(input_embeddings, label_embeddings, mask) + confidences = similarities + if self.model_confidence == SOFTMAX: + confidences = tf.nn.softmax(similarities) + return similarities, confidences def _train_sim( self, @@ -806,7 +843,7 @@ def _loss_margin( return loss - def _loss_softmax( + def _loss_cross_entropy( self, sim_pos: tf.Tensor, sim_neg_il: tf.Tensor, @@ -815,18 +852,15 @@ def _loss_softmax( sim_neg_li: tf.Tensor, mask: Optional[tf.Tensor], ) -> tf.Tensor: - """Define softmax loss.""" - - logits = tf.concat( - [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 + """Defines cross entropy loss.""" + loss = self._compute_softmax_loss( + sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li ) - # create label_ids for softmax - label_ids = tf.zeros_like(logits[..., 0], tf.int32) - - loss = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=label_ids, logits=logits - ) + if self.constrain_similarities: + loss += self._compute_sigmoid_loss( + sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li + ) if self.scale_loss: # in case of cross entropy log_likelihood = -loss @@ -845,18 +879,68 @@ def _loss_softmax( # average the loss over the batch return tf.reduce_mean(loss) + @staticmethod + def _compute_sigmoid_loss( + sim_pos: tf.Tensor, + sim_neg_il: tf.Tensor, + sim_neg_ll: tf.Tensor, + sim_neg_ii: tf.Tensor, + sim_neg_li: tf.Tensor, + ) -> tf.Tensor: + # Constrain similarity values in a range by applying sigmoid + # on them individually so that they saturate at extreme values. + sigmoid_logits = tf.concat( + [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 + ) + sigmoid_labels = tf.concat( + [ + tf.ones_like(sigmoid_logits[..., :1]), + tf.zeros_like(sigmoid_logits[..., 1:]), + ], + axis=-1, + ) + sigmoid_loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=sigmoid_labels, logits=sigmoid_logits + ) + # average over logits axis + return tf.reduce_mean(sigmoid_loss, axis=-1) + + def _compute_softmax_loss( + self, + sim_pos: tf.Tensor, + sim_neg_il: tf.Tensor, + sim_neg_ll: tf.Tensor, + sim_neg_ii: tf.Tensor, + sim_neg_li: tf.Tensor, + ) -> tf.Tensor: + # Similarity terms between input and label should be optimized relative + # to each other and hence use them as logits for softmax term + softmax_logits = tf.concat([sim_pos, sim_neg_il, sim_neg_li], axis=-1) + if not self.constrain_similarities: + # Concatenate other similarity terms as well. Due to this, + # similarity values between input and label may not be + # approximately bounded in a defined range. 
+ softmax_logits = tf.concat( + [softmax_logits, sim_neg_ii, sim_neg_ll], axis=-1 + ) + # create label_ids for softmax + softmax_label_ids = tf.zeros_like(softmax_logits[..., 0], tf.int32) + softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=softmax_label_ids, logits=softmax_logits + ) + return softmax_loss + @property def _chosen_loss(self) -> Callable: """Use loss depending on given option.""" - if self.loss_type == MARGIN: return self._loss_margin - elif self.loss_type == SOFTMAX: - return self._loss_softmax + elif self.loss_type == CROSS_ENTROPY: + return self._loss_cross_entropy else: - raise ValueError( + raise TFLayerConfigException( f"Wrong loss type '{self.loss_type}', " - f"should be '{MARGIN}' or '{SOFTMAX}'" + f"should be '{MARGIN}' or '{CROSS_ENTROPY}'" ) # noinspection PyMethodOverriding diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index 6aaf465556f0..697076abef84 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -55,6 +55,8 @@ CONCAT_DIMENSION, DROP_RATE_ATTENTION, SCALE_LOSS, + CONSTRAIN_SIMILARITIES, + MODEL_CONFIDENCE, ) from rasa.utils.tensorflow import layers from rasa.utils.tensorflow.transformer import TransformerEncoder @@ -730,7 +732,6 @@ def _prepare_embed_layers(self, name: Text, prefix: Text = "embed") -> None: self.config[EMBEDDING_DIMENSION], self.config[REGULARIZATION_CONSTANT], name, - self.config[SIMILARITY_TYPE], ) def _prepare_ffnn_layer( @@ -789,8 +790,9 @@ def _prepare_dot_product_loss( self.config[USE_MAX_NEG_SIM], self.config[NEGATIVE_MARGIN_SCALE], scale_loss, - # set to 1 to get deterministic behaviour - parallel_iterations=1 if self.random_seed is not None else 1000, + similarity_type=self.config[SIMILARITY_TYPE], + constrain_similarities=self.config[CONSTRAIN_SIMILARITIES], + model_confidence=self.config[MODEL_CONFIDENCE], ) def _prepare_sparse_dense_dropout_layers( diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index b620a87b8a46..ecf0910729ea 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -19,9 +19,12 @@ AUTO, INNER, COSINE, + CROSS_ENTROPY, TRANSFORMER_SIZE, NUM_TRANSFORMER_LAYERS, DENSE_DIMENSION, + CONSTRAIN_SIMILARITIES, + MODEL_CONFIDENCE, ) from rasa.shared.nlu.constants import ( ACTION_NAME, @@ -31,6 +34,7 @@ ) from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS from rasa.core.constants import DIALOGUE +from rasa.shared.exceptions import InvalidConfigException if TYPE_CHECKING: from rasa.nlu.extractors.extractor import EntityTagSpec @@ -63,7 +67,7 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: Returns: updated model configuration """ if config.get(SIMILARITY_TYPE) == AUTO: - if config[LOSS_TYPE] == SOFTMAX: + if config[LOSS_TYPE] == CROSS_ENTROPY: config[SIMILARITY_TYPE] = INNER elif config[LOSS_TYPE] == MARGIN: config[SIMILARITY_TYPE] = COSINE @@ -71,6 +75,28 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: return config +def update_deprecated_loss_type(config: Dict[Text, Any]) -> Dict[Text, Any]: + """If LOSS_TYPE is set to 'softmax', update it to 'cross_entropy' since former is deprecated. + + Args: + config: model configuration + + Returns: + updated model configuration + """ + # TODO: Completely deprecate this with 3.0 + if config.get(LOSS_TYPE) == SOFTMAX: + rasa.shared.utils.io.raise_deprecation_warning( + f"`{LOSS_TYPE}={SOFTMAX}` is deprecated. 
" + f"Please update your configuration file to use" + f"`{LOSS_TYPE}={CROSS_ENTROPY}` instead.", + warn_until_version=NEXT_MAJOR_VERSION_FOR_DEPRECATIONS, + ) + config[LOSS_TYPE] = CROSS_ENTROPY + + return config + + def align_token_features( list_of_tokens: List[List["Token"]], in_token_features: np.ndarray, @@ -342,6 +368,94 @@ def override_defaults( return config +def update_confidence_type(component_config: Dict[Text, Any]) -> Dict[Text, Any]: + """Set model confidence to cosine if margin loss is used. + + Args: + component_config: model configuration + + Returns: + updated model configuration + """ + # TODO: Remove this once model_confidence is set to cosine by default. + if ( + component_config[LOSS_TYPE] == MARGIN + and component_config[MODEL_CONFIDENCE] == SOFTMAX + ): + rasa.shared.utils.io.raise_warning( + f"Overriding defaults by setting {MODEL_CONFIDENCE} to " + f"{COSINE} as {LOSS_TYPE} is set to {MARGIN} in the configuration." + ) + component_config[MODEL_CONFIDENCE] = COSINE + return component_config + + +def validate_configuration_settings(component_config: Dict[Text, Any]) -> None: + """Performs checks to validate that combination of parameters in the configuration are correctly set. + + Args: + component_config: Configuration to validate. + """ + _check_loss_setting(component_config) + _check_confidence_setting(component_config) + _check_similarity_loss_setting(component_config) + + +def _check_confidence_setting(component_config: Dict[Text, Any]) -> None: + if component_config[MODEL_CONFIDENCE] == SOFTMAX: + rasa.shared.utils.io.raise_warning( + f"{MODEL_CONFIDENCE} is set to `softmax`. It is recommended " + f"to set it to `cosine`. It will be set to `cosine` by default, " + f"Rasa Open Source 3.0.0 onwards.", + category=UserWarning, + ) + if component_config[LOSS_TYPE] not in [SOFTMAX, CROSS_ENTROPY]: + raise InvalidConfigException( + f"{LOSS_TYPE}={component_config[LOSS_TYPE]} and " + f"{MODEL_CONFIDENCE}={SOFTMAX} is not a valid " + f"combination. You can use {MODEL_CONFIDENCE}={SOFTMAX} " + f"only with {LOSS_TYPE}={CROSS_ENTROPY}." + ) + if component_config[SIMILARITY_TYPE] not in [INNER, AUTO]: + raise InvalidConfigException( + f"{SIMILARITY_TYPE}={component_config[SIMILARITY_TYPE]} and " + f"{MODEL_CONFIDENCE}={SOFTMAX} is not a valid " + f"combination. You can use {MODEL_CONFIDENCE}={SOFTMAX} " + f"only with {SIMILARITY_TYPE}={INNER}." + ) + + +def _check_loss_setting(component_config: Dict[Text, Any]) -> None: + if not component_config[CONSTRAIN_SIMILARITIES] and component_config[LOSS_TYPE] in [ + SOFTMAX, + CROSS_ENTROPY, + ]: + rasa.shared.utils.io.raise_warning( + f"{CONSTRAIN_SIMILARITIES} is set to `False`. It is recommended " + f"to set it to `True` when using cross-entropy loss. It will be set to `True` by default, " + f"Rasa Open Source 3.0.0 onwards.", + category=UserWarning, + ) + + +def _check_similarity_loss_setting(component_config: Dict[Text, Any]) -> None: + if ( + component_config[SIMILARITY_TYPE] == COSINE + and component_config[LOSS_TYPE] == CROSS_ENTROPY + or component_config[SIMILARITY_TYPE] == INNER + and component_config[LOSS_TYPE] == MARGIN + ): + rasa.shared.utils.io.raise_warning( + f"`{SIMILARITY_TYPE}={component_config[SIMILARITY_TYPE]}`" + f" and `{LOSS_TYPE}={component_config[LOSS_TYPE]}` " + f"is not a recommended setting as it may not lead to best results." 
+ f"Ideally use `{SIMILARITY_TYPE}={INNER}`" + f" and `{LOSS_TYPE}={CROSS_ENTROPY}` or" + f"`{SIMILARITY_TYPE}={COSINE}` and `{LOSS_TYPE}={MARGIN}`.", + category=UserWarning, + ) + + def init_split_entities( split_entities_config, default_split_entity ) -> Dict[Text, bool]: diff --git a/tests/core/policies/test_ted_policy.py b/tests/core/policies/test_ted_policy.py index ea790d422127..de3c7668e008 100644 --- a/tests/core/policies/test_ted_policy.py +++ b/tests/core/policies/test_ted_policy.py @@ -32,8 +32,12 @@ SCALE_LOSS, SIMILARITY_TYPE, VALUE_RELATIVE_ATTENTION, + MODEL_CONFIDENCE, + COSINE, + INNER, ) from tests.core.test_policies import PolicyTestCollection +from rasa.shared.constants import DEFAULT_SENDER_ID UTTER_GREET_ACTION = "utter_greet" GREET_INTENT_NAME = "greet" @@ -264,7 +268,10 @@ def create_policy( ) def test_similarity_type(self, trained_policy: TEDPolicy): - assert trained_policy.config[SIMILARITY_TYPE] == "cosine" + assert trained_policy.config[SIMILARITY_TYPE] == COSINE + + def test_confidence_type(self, trained_policy: TEDPolicy): + assert trained_policy.config[MODEL_CONFIDENCE] == COSINE def test_normalization( self, @@ -283,6 +290,18 @@ def test_normalization( # function should not get called for margin loss_type mock.normalize.assert_not_called() + def test_prediction_on_empty_tracker( + self, trained_policy: Policy, default_domain: Domain + ): + tracker = DialogueStateTracker(DEFAULT_SENDER_ID, default_domain.slots) + prediction = trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ) + assert not prediction.is_end_to_end_prediction + assert len(prediction.probabilities) == default_domain.num_actions + assert max(prediction.probabilities) <= 1.0 + assert min(prediction.probabilities) >= -1.0 + class TestTEDPolicyWithEval(TestTEDPolicy): def create_policy( @@ -330,6 +349,106 @@ def test_normalization( mock.normalize.assert_not_called() +class TestTEDPolicyCosineConfidence(TestTEDPolicy): + def create_policy( + self, featurizer: Optional[TrackerFeaturizer], priority: int + ) -> Policy: + return TEDPolicy( + featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: COSINE} + ) + + def test_confidence_type(self, trained_policy: TEDPolicy): + assert trained_policy.config[MODEL_CONFIDENCE] == COSINE + + def test_normalization( + self, + trained_policy: Policy, + tracker: DialogueStateTracker, + default_domain: Domain, + monkeypatch: MonkeyPatch, + ): + # first check the output is what we expect + predicted_probabilities = trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ).probabilities + # there should be no normalization + confidence_in_range = [ + -1 <= confidence <= 1 for confidence in predicted_probabilities + ] + assert all(confidence_in_range) + + # also check our function is not called + mock = Mock() + monkeypatch.setattr(train_utils, "normalize", mock.normalize) + trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ) + + mock.normalize.assert_not_called() + + def test_prediction_on_empty_tracker( + self, trained_policy: Policy, default_domain: Domain + ): + tracker = DialogueStateTracker(DEFAULT_SENDER_ID, default_domain.slots) + prediction = trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ) + assert not prediction.is_end_to_end_prediction + assert len(prediction.probabilities) == default_domain.num_actions + assert max(prediction.probabilities) <= 1.0 + assert 
min(prediction.probabilities) >= -1.0 + + +class TestTEDPolicyInnerConfidence(TestTEDPolicy): + def create_policy( + self, featurizer: Optional[TrackerFeaturizer], priority: int + ) -> Policy: + return TEDPolicy( + featurizer=featurizer, priority=priority, **{MODEL_CONFIDENCE: INNER} + ) + + def test_confidence_type(self, trained_policy: TEDPolicy): + assert trained_policy.config[MODEL_CONFIDENCE] == INNER + + def test_normalization( + self, + trained_policy: Policy, + tracker: DialogueStateTracker, + default_domain: Domain, + monkeypatch: MonkeyPatch, + ): + # first check the output is what we expect + predicted_probabilities = trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ).probabilities + # there should be no normalization + confidence_in_range = [ + -1e9 <= confidence <= 1e9 for confidence in predicted_probabilities + ] + assert all(confidence_in_range) + + # also check our function is not called + mock = Mock() + monkeypatch.setattr(train_utils, "normalize", mock.normalize) + trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ) + + mock.normalize.assert_not_called() + + def test_prediction_on_empty_tracker( + self, trained_policy: Policy, default_domain: Domain + ): + tracker = DialogueStateTracker(DEFAULT_SENDER_ID, default_domain.slots) + prediction = trained_policy.predict_action_probabilities( + tracker, default_domain, RegexInterpreter() + ) + assert not prediction.is_end_to_end_prediction + assert len(prediction.probabilities) == default_domain.num_actions + assert max(prediction.probabilities) <= 1e9 + assert min(prediction.probabilities) >= -1e9 + + class TestTEDPolicyLowRankingLength(TestTEDPolicy): def create_policy( self, featurizer: Optional[TrackerFeaturizer], priority: int diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index bb56ee931347..bb0c2aa13cb0 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -4,6 +4,7 @@ import pytest from unittest.mock import Mock from typing import List, Text, Dict, Any +from _pytest.monkeypatch import MonkeyPatch import rasa.model from rasa.shared.nlu.training_data.features import Features @@ -31,6 +32,7 @@ BILOU_FLAG, ENTITY_RECOGNITION, INTENT_CLASSIFICATION, + MODEL_CONFIDENCE, ) from rasa.nlu.components import ComponentBuilder from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer @@ -367,6 +369,72 @@ async def test_softmax_normalization( assert parse_data.get("intent") == intent_ranking[0] +@pytest.mark.parametrize( + "classifier_params, prediction_min, prediction_max, output_length", + [ + ( + {RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "cosine"}, + -1, + 1, + LABEL_RANKING_LENGTH, + ), + ( + {RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "inner"}, + -1e9, + 1e9, + LABEL_RANKING_LENGTH, + ), + ], +) +async def test_cross_entropy_without_normalization( + component_builder: ComponentBuilder, + tmp_path: Path, + classifier_params: Dict[Text, Any], + prediction_min: float, + prediction_max: float, + output_length: int, + monkeypatch: MonkeyPatch, +): + pipeline = as_pipeline( + "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier" + ) + assert pipeline[2]["name"] == "DIETClassifier" + pipeline[2].update(classifier_params) + + _config = RasaNLUModelConfig({"pipeline": pipeline}) + (trained_model, _, persisted_path) = await train( + _config, + path=str(tmp_path), + data="data/test/many_intents.md", + 
component_builder=component_builder, + ) + loaded = Interpreter.load(persisted_path, component_builder) + + mock = Mock() + monkeypatch.setattr(train_utils, "normalize", mock.normalize) + + parse_data = loaded.parse("hello") + intent_ranking = parse_data.get("intent_ranking") + + # check that the output was correctly truncated + assert len(intent_ranking) == output_length + + intent_confidences = [intent.get("confidence") for intent in intent_ranking] + + # check each confidence is in range + confidence_in_range = [ + prediction_min <= confidence <= prediction_max + for confidence in intent_confidences + ] + assert all(confidence_in_range) + + # normalize shouldn't have been called + mock.normalize.assert_not_called() + + # check whether the normalization of rankings is reflected in intent prediction + assert parse_data.get("intent") == intent_ranking[0] + + @pytest.mark.parametrize( "classifier_params, output_length", [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 1}, LABEL_RANKING_LENGTH)], diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py index 6d5c4aabea9d..610ef3304efb 100644 --- a/tests/nlu/selectors/test_selectors.py +++ b/tests/nlu/selectors/test_selectors.py @@ -3,6 +3,8 @@ import pytest import numpy as np from typing import List, Dict, Text, Any +from mock import Mock +from _pytest.monkeypatch import MonkeyPatch import rasa.model from rasa.nlu import train @@ -19,12 +21,18 @@ EVAL_NUM_EPOCHS, EVAL_NUM_EXAMPLES, CHECKPOINT_MODEL, + MODEL_CONFIDENCE, + RANDOM_SEED, + RANKING_LENGTH, + LOSS_TYPE, ) +from rasa.utils import train_utils from rasa.shared.nlu.constants import TEXT from rasa.shared.constants import DIAGNOSTIC_DATA from rasa.nlu.selectors.response_selector import ResponseSelector from rasa.shared.nlu.training_data.message import Message from rasa.shared.nlu.training_data.training_data import TrainingData +from tests.nlu.classifiers.test_diet_classifier import as_pipeline @pytest.mark.parametrize( @@ -315,3 +323,126 @@ async def test_process_gives_diagnostic_data(trained_response_selector_bot: Path assert "attention_weights" in diagnostic_data[name] # By default, ResponseSelector has `number_of_transformer_layers = 0` assert diagnostic_data[name].get("attention_weights") is None + + +@pytest.mark.parametrize( + "classifier_params, prediction_min, prediction_max, output_length", + [ + ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "cosine"}, -1, 1, 9), + ({RANDOM_SEED: 42, EPOCHS: 1, MODEL_CONFIDENCE: "inner"}, -1e9, 1e9, 9), + ], +) +async def test_cross_entropy_without_normalization( + component_builder: ComponentBuilder, + tmp_path: Path, + classifier_params: Dict[Text, Any], + prediction_min: float, + prediction_max: float, + output_length: int, + monkeypatch: MonkeyPatch, +): + pipeline = as_pipeline( + "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector" + ) + assert pipeline[2]["name"] == "ResponseSelector" + pipeline[2].update(classifier_params) + + _config = RasaNLUModelConfig({"pipeline": pipeline}) + (trained_model, _, persisted_path) = await train( + _config, + path=str(tmp_path), + data="data/test_selectors", + component_builder=component_builder, + ) + loaded = Interpreter.load(persisted_path, component_builder) + + mock = Mock() + monkeypatch.setattr(train_utils, "normalize", mock.normalize) + + parse_data = loaded.parse("hello") + response_ranking = parse_data.get("response_selector").get("default").get("ranking") + + # check that the output was correctly truncated + assert len(response_ranking) == 
+
+    response_confidences = [
+        response.get("confidence") for response in response_ranking
+    ]
+
+    # check that each confidence is in the expected range
+    confidence_in_range = [
+        prediction_min <= confidence <= prediction_max
+        for confidence in response_confidences
+    ]
+    assert all(confidence_in_range)
+
+    # normalize shouldn't have been called
+    mock.normalize.assert_not_called()
+
+
+@pytest.mark.parametrize(
+    "classifier_params", [{LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 1}],
+)
+async def test_margin_loss_is_not_normalized(
+    monkeypatch: MonkeyPatch,
+    component_builder: ComponentBuilder,
+    tmp_path: Path,
+    classifier_params: Dict[Text, int],
+):
+    pipeline = as_pipeline(
+        "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector"
+    )
+    assert pipeline[2]["name"] == "ResponseSelector"
+    pipeline[2].update(classifier_params)
+
+    mock = Mock()
+    monkeypatch.setattr(train_utils, "normalize", mock.normalize)
+
+    _config = RasaNLUModelConfig({"pipeline": pipeline})
+    (trained_model, _, persisted_path) = await train(
+        _config,
+        path=str(tmp_path),
+        data="data/test_selectors",
+        component_builder=component_builder,
+    )
+    loaded = Interpreter.load(persisted_path, component_builder)
+
+    parse_data = loaded.parse("hello")
+    response_ranking = (
+        parse_data.get("response_selector").get("default").get("ranking")
+    )
+
+    # check that the output was not normalized
+    mock.normalize.assert_not_called()
+
+    # check that the output was correctly truncated
+    assert len(response_ranking) == 9
+
+
+@pytest.mark.parametrize(
+    "classifier_params, data_path, output_length",
+    [
+        ({RANDOM_SEED: 42, EPOCHS: 1}, "data/test_selectors", 9),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 1}, "data/test_selectors", 9),
+        ({RANDOM_SEED: 42, RANKING_LENGTH: 2, EPOCHS: 1}, "data/test_selectors", 2),
+    ],
+)
+async def test_softmax_ranking(
+    component_builder: ComponentBuilder,
+    tmp_path: Path,
+    classifier_params: Dict[Text, int],
+    data_path: Text,
+    output_length: int,
+):
+    pipeline = as_pipeline(
+        "WhitespaceTokenizer", "CountVectorsFeaturizer", "ResponseSelector"
+    )
+    assert pipeline[2]["name"] == "ResponseSelector"
+    pipeline[2].update(classifier_params)
+
+    _config = RasaNLUModelConfig({"pipeline": pipeline})
+    (trained_model, _, persisted_path) = await train(
+        _config, path=str(tmp_path), data=data_path, component_builder=component_builder
+    )
+    loaded = Interpreter.load(persisted_path, component_builder)
+
+    parse_data = loaded.parse("hello")
+    response_ranking = (
+        parse_data.get("response_selector").get("default").get("ranking")
+    )
+    # check that the output was correctly truncated after normalization
+    assert len(response_ranking) == output_length
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
index 9ec906d9606e..e3e02d7ca31a 100644
--- a/tests/utils/test_train_utils.py
+++ b/tests/utils/test_train_utils.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pytest
+from typing import Any, Dict, Text
 
 import rasa.utils.train_utils as train_utils
 from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS
@@ -10,6 +11,17 @@
     SPLIT_ENTITIES_BY_COMMA_DEFAULT_VALUE,
     SPLIT_ENTITIES_BY_COMMA,
 )
+from rasa.utils.tensorflow.constants import (
+    MODEL_CONFIDENCE,
+    SIMILARITY_TYPE,
+    LOSS_TYPE,
+    COSINE,
+    SOFTMAX,
+    INNER,
+    CROSS_ENTROPY,
+    MARGIN,
+)
+from rasa.shared.exceptions import InvalidConfigException
 
 
 def test_align_token_features():
@@ -35,6 +47,18 @@ def test_align_token_features():
     assert np.all(actual_features[0][4] == np.mean(token_features[0][5:10], axis=0))
 
 
+@pytest.mark.parametrize(
"input_values, ranking_length, output_values", + [ + ([0.2, 0.7, 0.1], 2, [0.2222222, 0.77777778, 0.0]), + ([0.1, 0.7, 0.1], 5, [0.11111111, 0.77777778, 0.11111111]), + ], +) +def test_normalize(input_values, ranking_length, output_values): + normalized_values = train_utils.normalize(np.array(input_values), ranking_length) + assert np.allclose(normalized_values, np.array(output_values), atol=1e-5) + + @pytest.mark.parametrize( "split_entities_config, expected_initialized_config", [ @@ -61,3 +85,66 @@ def test_init_split_entities_config( ) == expected_initialized_config ) + + +@pytest.mark.parametrize( + "component_config, raises_exception", + [ + ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: MARGIN}, True), + ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: SOFTMAX}, False), + ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: CROSS_ENTROPY}, False), + ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: MARGIN}, False), + ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: SOFTMAX}, False), + ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: CROSS_ENTROPY}, False), + ({MODEL_CONFIDENCE: INNER, LOSS_TYPE: MARGIN}, False), + ({MODEL_CONFIDENCE: INNER, LOSS_TYPE: SOFTMAX}, False), + ({MODEL_CONFIDENCE: INNER, LOSS_TYPE: CROSS_ENTROPY}, False), + ], +) +def test_confidence_loss_settings( + component_config: Dict[Text, Any], raises_exception: bool +): + component_config[SIMILARITY_TYPE] = INNER + if raises_exception: + with pytest.raises(InvalidConfigException): + train_utils._check_confidence_setting(component_config) + else: + train_utils._check_confidence_setting(component_config) + + +@pytest.mark.parametrize( + "component_config, raises_exception", + [ + ({MODEL_CONFIDENCE: SOFTMAX, SIMILARITY_TYPE: INNER}, False), + ({MODEL_CONFIDENCE: SOFTMAX, SIMILARITY_TYPE: COSINE}, True), + ({MODEL_CONFIDENCE: COSINE, SIMILARITY_TYPE: INNER}, False), + ({MODEL_CONFIDENCE: COSINE, SIMILARITY_TYPE: COSINE}, False), + ({MODEL_CONFIDENCE: INNER, SIMILARITY_TYPE: INNER}, False), + ({MODEL_CONFIDENCE: INNER, SIMILARITY_TYPE: COSINE}, False), + ], +) +def test_confidence_similarity_settings( + component_config: Dict[Text, Any], raises_exception: bool +): + component_config[LOSS_TYPE] = SOFTMAX + if raises_exception: + with pytest.raises(InvalidConfigException): + train_utils._check_confidence_setting(component_config) + else: + train_utils._check_confidence_setting(component_config) + + +@pytest.mark.parametrize( + "component_config, model_confidence", + [ + ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: MARGIN}, COSINE), + ({MODEL_CONFIDENCE: SOFTMAX, LOSS_TYPE: CROSS_ENTROPY}, SOFTMAX), + ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: CROSS_ENTROPY}, COSINE), + ({MODEL_CONFIDENCE: COSINE, LOSS_TYPE: MARGIN}, COSINE), + ], +) +def test_update_confidence_type( + component_config: Dict[Text, Text], model_confidence: Text +): + component_config = train_utils.update_confidence_type(component_config) + assert component_config[MODEL_CONFIDENCE] == model_confidence