diff --git a/changelog/7496.feature.md b/changelog/7496.feature.md new file mode 100644 index 000000000000..ddb20b9bbbd9 --- /dev/null +++ b/changelog/7496.feature.md @@ -0,0 +1,36 @@ +Make [TED Policy](./policies.mdx#ted-policy) an end-to-end policy. Namely, make it possible to train TED on stories that contain +intent and entities or user text and bot actions or bot text. +If you don't have text in your stories, TED will behave the same way as before. +Add possibility to predict entities using TED. + +Here's an example of a dialogue in the Rasa story format: + +```rasa-yaml +stories: +- story: collect restaurant booking info # name of the story - just for debugging + steps: + - intent: greet # user message with no entities + - action: utter_ask_howcanhelp # action that the bot should execute + - intent: inform # user message with entities + entities: + - location: "rome" + - price: "cheap" + - bot: On it # actual text that bot can output + - action: utter_ask_cuisine + - user: I would like [spanish](cuisine). # actual text that user input + - action: utter_ask_num_people +``` + +Some model options for `TEDPolicy` got renamed. +Please update your configuration files using the following mapping: + +| Old model option | New model option | +|-----------------------------|--------------------------------------------------------| +|transformer_size |dictionary “transformer_size” with keys | +| |“text”, “action_text”, “label_action_text”, “dialogue” | +|number_of_transformer_layers |dictionary “number_of_transformer_layers” with keys | +| |“text”, “action_text”, “label_action_text”, “dialogue” | +|dense_dimension |dictionary “dense_dimension” with keys | +| |“text”, “action_text”, “label_action_text”, “intent”, | +| |“action_name”, “label_action_name”, “entities”, “slots”,| +| |“active_loop” | diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index e6764790ba90..23c41bd17c94 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -1485,7 +1485,7 @@ However, additional parameters exist that can be adapted. +=================================+==================+==============================================================+ | hidden_layers_sizes | text: [] | Hidden layer sizes for layers before the embedding layers | | | label: [] | for user messages and labels. The number of hidden layers is | -| | | equal to the length of the corresponding. | +| | | equal to the length of the corresponding list. | +---------------------------------+------------------+--------------------------------------------------------------+ | share_hidden_layers | False | Whether to share the hidden layer weights between user | | | | messages and labels. | @@ -1519,8 +1519,8 @@ However, additional parameters exist that can be adapted. +---------------------------------+------------------+--------------------------------------------------------------+ | embedding_dimension | 20 | Dimension size of embedding vectors. | +---------------------------------+------------------+--------------------------------------------------------------+ -| dense_dimension | text: 128 | Dense dimension for sparse features to use if no dense | -| | label: 20 | features are present. | +| dense_dimension | text: 128 | Dense dimension for sparse features to use. | +| | label: 20 | | +---------------------------------+------------------+--------------------------------------------------------------+ | concat_dimension | text: 128 | Concat dimension for sequence and sentence features. 
| | | label: 20 | | diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx index 56eced375460..1d51261010ae 100644 --- a/docs/docs/migration-guide.mdx +++ b/docs/docs/migration-guide.mdx @@ -12,8 +12,54 @@ how you can migrate from one version to another. ## Rasa 2.1 to Rasa 2.2 +### General + +`TEDPolicy`'s `transformer_size`, `number_of_transformer_layers`, +and `dense_dimensions` parameters have been renamed. +Please update your configuration files using the following mapping: + +| Old Model Parameter | New Model Parameter | +|-----------------------------|--------------------------------------------------------| +|`transformer_size` |dictionary `transformer_size` with keys | +| |`text`, `action_text`, `label_action_text`, `dialogue` | +|`number_of_transformer_layers`|dictionary `number_of_transformer_layers` with keys | +| |`text`, `action_text`, `label_action_text`, `dialogue` | +|`dense_dimension` |dictionary `dense_dimension` with keys | +| |`text`, `action_text`, `label_action_text`, `intent`, | +| |`action_name`, `label_action_name`, `entities`, `slots`,| +| |`active_loop` | + +For example: + +```yaml-rasa title="config.yml" +policies: + - name: TEDPolicy + transformer_size: + text: 128 + action_text: 128 + label_action_text: 128 + dialogue: 128 + number_of_transformer_layers: + text: 1 + action_text: 1 + label_action_text: 1 + dialogue: 1 + dense_dimension: + text: 128 + action_text: 128 + label_action_text: 128 + intent: 20 + action_name: 20 + label_action_name: 20 + entities: 20 + slots: 20 + active_loop: 20 +``` + + ### Deprecations +#### Markdown Data Training and test data in Markdown format is now deprecated. This includes: - reading and writing of story files in Markdown format - reading and writing of NLU data in Markdown format @@ -24,6 +70,7 @@ Support for Markdown data will be removed entirely in Rasa Open Source 3.0.0. Please convert your existing Markdown data by using the commands described [here](./migration-guide.mdx#training-data-files). + ### Policies [Policies](./policies.mdx) now require a `**kwargs` argument in their constructor and `load` method. @@ -31,6 +78,15 @@ Policies without `**kwargs` will be supported until Rasa version `3.0.0`. However when using [incremental training](./command-line-interface.mdx#incremental-training) `**kwargs` **must** be included. + +#### Other + +* `Domain.random_template_for` is deprecated and will be removed in Rasa Open Source + 3.0.0. You can alternatively use the `TemplatedNaturalLanguageGenerator`. +* `Domain.action_names` is deprecated and will be removed in Rasa Open Source + 3.0.0. Please use `Domain.action_names_or_texts` instead. + + ## Rasa 2.0 to Rasa 2.1 ### Deprecations diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx index 136b74327e60..6dcd139907cf 100644 --- a/docs/docs/policies.mdx +++ b/docs/docs/policies.mdx @@ -80,25 +80,44 @@ Doing so can lead to unexpected and undesired bot behavior. ### TED Policy -The Transformer Embedding Dialogue (TED) Policy is described in +The Transformer Embedding Dialogue (TED) Policy is +a multi-task architecture for next action prediction and entity +recognition. The architecture several transformer encoders which are shared for both tasks. +A sequence of entity labels is predicted through a Conditional Random Field (CRF) tagging layer on top of the +user sequence transformer encoder output corresponding to the input sequence of tokens. 
+For the next action prediction the dialogue transformer encoder output and system action labels are embedded into a +single semantic vector space. We use the dot-product loss to maximize the similarity with the target label and +minimize similarities with negative samples. + +If you want to learn more about the model, check out [our paper](https://arxiv.org/abs/1910.00486) and on our [youtube channel](https://www.youtube.com/watch?v=j90NvurJI4I&list=PL75e0qA87dlG-za8eLI6t0_Pbxafk-cxb&index=14&ab_channel=Rasa). +where we explain the model architecture in detail. -This policy has a pre-defined architecture, which comprises the -following steps: +TED Policy architecture comprises the following steps: -1. Concatenate user input (user intent and entities), previous system actions, slots and active forms for each time - step into an input vector to pre-transformer embedding layer. +1. Concatenate features for + - user input (user intent and entities) or user text processed through a user sequence transformer encoder, + - previous system actions or bot utterances processed through a bot sequence transformer encoder, + - slots and active forms -2. Feed the input vector into a transformer. + for each time step into an input vector to the embedding layer that precedes the + dialogue transformer. -3. Apply a dense layer to the output of the transformer to get embeddings of a dialogue for each time step. +2. Feed the embedding of the input vector into the dialogue transformer encoder. + +3. Apply a dense layer to the output of the dialogue transformer to get embeddings of the dialogue for each time step. 4. Apply a dense layer to create embeddings for system actions for each time step. 5. Calculate the similarity between the dialogue embedding and embedded system actions. This step is based on the [StarSpace](https://arxiv.org/abs/1709.03856) idea. +6. Concatenate the token-level output of the user sequence transformer encoder + with the output of the dialogue transformer encoder for each time step. + +7. Apply CRF algorithm to predict contextual entities for each user text input. + **Configuration:** You can pass configuration parameters to the `TEDPolicy` using the `config.yml` file. @@ -135,35 +154,20 @@ If you want to fine-tune your model, start by modifying the following parameters max_history: 8 ``` -* `hidden_layers_sizes`: - This parameter allows you to define the number of feed forward layers and their output - dimensions for dialogues and intents (it defaults to: `dialogue: [], label: []`). - Every entry in the list corresponds to a feed forward layer. - For example, if you use the following configuration: - - ```yaml-rasa title="config.yml" - policies: - - name: TEDPolicy - hidden_layers_sizes: - dialogue: [256, 128] - ``` - - Rasa Open Source will add two feed forward layers in front of the transformer. - The vectors of the input tokens (coming from the dialogue) will be passed on to those - layers. The first layer will have an output dimension of 256 and the second layer will have an output - dimension of 128. If an empty list is used (default behavior), no feed forward layer will be - added. - Make sure to use only positive integer values. Usually, numbers of power of two are used. - Also, it is usual practice to have decreasing values in the list: next value is smaller or equal to the - value before. - * `number_of_transformer_layers`: - This parameter sets the number of transformer layers to use (default: `1`). 
- The number of transformer layers corresponds to the transformer blocks to use for the model. + This parameter sets the number of sequence transformer encoder layers to use for + sequential transformer encoders for user, action and action label texts and for + dialogue transformer encoder. + (defaults: `text: 1, action_text: 1, label_action_text: 1, dialogue: 1`). + The number of sequence transformer encoder layers corresponds + to the transformer blocks to use for the model. * `transformer_size`: - This parameter sets the number of units in the transformer (default: `128`). - The vectors coming out of the transformers will have the given `transformer_size`. + This parameter sets the number of units in the sequence transformer encoder layers to use for + sequential transformer encoders for user, action and action label texts and for + dialogue transformer encoder. + (defaults: `text: 128, action_text: 128, label_action_text: 128, dialogue: 128`). + The vectors coming out of the transformer encoders will have the given `transformer_size`. * `weight_sparsity`: This parameter defines the fraction of kernel weights that are set to 0 for all feed forward layers @@ -178,109 +182,144 @@ However, additional parameters exist that can be adapted.
More configurable parameters ``` -+---------------------------------+------------------+--------------------------------------------------------------+ -| Parameter | Default Value | Description | -+=================================+==================+==============================================================+ -| hidden_layers_sizes | dialogue: [] | Hidden layer sizes for layers before the embedding layers | -| | label: [] | for dialogue and labels. The number of hidden layers is | -| | | equal to the length of the corresponding. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| transformer_size | 128 | Number of units in transformer. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| number_of_transformer_layers | 1 | Number of transformer layers. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| number_of_attention_heads | 4 | Number of attention heads in transformer. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| use_key_relative_attention | False | If 'True' use key relative embeddings in attention. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| use_value_relative_attention | False | If 'True' use value relative embeddings in attention. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| max_relative_position | None | Maximum position for relative embeddings. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| batch_size | [64, 256] | Initial and final value for batch sizes. | -| | | Batch size will be linearly increased for each epoch. | -| | | If constant `batch_size` is required, pass an int, e.g. `8`. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| batch_strategy | "balanced" | Strategy used when creating batches. | -| | | Can be either 'sequence' or 'balanced'. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| epochs | 1 | Number of epochs to train. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| random_seed | None | Set random seed to any 'int' to get reproducible results. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| embedding_dimension | 20 | Dimension size of embedding vectors. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| number_of_negative_examples | 20 | The number of incorrect labels. The algorithm will minimize | -| | | their similarity to the user input during training. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | -| | | or 'inner'. 
| -+---------------------------------+------------------+--------------------------------------------------------------+ -| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top actions to normalize scores for loss type | -| | | 'softmax'. Set to 0 to turn off normalization. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | -| | | embedding vectors for correct labels. | -| | | Should be 0.0 < ... < 1.0 for 'cosine' similarity type. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| maximum_negative_similarity | -0.2 | Maximum negative similarity for incorrect labels. | -| | | Should be -1.0 < ... < 1.0 for 'cosine' similarity type. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| use_maximum_negative_similarity | True | If 'True' the algorithm only minimizes maximum similarity | -| | | over incorrect intent labels, used only if 'loss_type' is | -| | | set to 'margin'. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| scale_loss | True | Scale loss inverse proportionally to confidence of correct | -| | | prediction. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| regularization_constant | 0.001 | The scale of regularization. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| negative_margin_scale | 0.8 | The scale of how important it is to minimize the maximum | -| | | similarity between embeddings of different labels. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| drop_rate_dialogue | 0.1 | Dropout rate for embedding layers of dialogue features. | -| | | Value should be between 0 and 1. | -| | | The higher the value the higher the regularization effect. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| drop_rate_label | 0.0 | Dropout rate for embedding layers of label features. | -| | | Value should be between 0 and 1. | -| | | The higher the value the higher the regularization effect. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| drop_rate_attention | 0.0 | Dropout rate for attention. Value should be between 0 and 1. | -| | | The higher the value the higher the regularization effect. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| weight_sparsity | 0.8 | Sparsity of the weights in dense layers. | -| | | Value should be between 0 and 1. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| evaluate_every_number_of_epochs | 20 | How often to calculate validation accuracy. | -| | | Set to '-1' to evaluate just once at the end of training. 
| -+---------------------------------+------------------+--------------------------------------------------------------+ -| evaluate_on_number_of_examples | 0 | How many examples to use for hold out validation set. | -| | | Large values may hurt performance, e.g. model accuracy. | -| | | Keep at 0 if your data set contains a lot of unique examples | -| | | of dialogue turns. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| tensorboard_log_directory | None | If you want to use tensorboard to visualize training | -| | | metrics, set this option to a valid output directory. You | -| | | can view the training metrics after training in tensorboard | -| | | via 'tensorboard --logdir '. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| tensorboard_log_level | "epoch" | Define when training metrics for tensorboard should be | -| | | logged. Either after every epoch ('epoch') or for every | -| | | training step ('minibatch'). | -+---------------------------------+------------------+--------------------------------------------------------------+ -| checkpoint_model | False | Save the best performing model during training. Models are | -| | | stored to the location specified by `--out`. Only the one | -| | | best model will be saved. | -| | | Requires `evaluate_on_number_of_examples > 0` and | -| | | `evaluate_every_number_of_epochs > 0` | -+---------------------------------+------------------+--------------------------------------------------------------+ -| featurizers | [] | List of featurizer names (alias names). Only features | -| | | coming from the listed names are used. If list is empty | -| | | all available features are used. | -+---------------------------------+------------------+--------------------------------------------------------------+ ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| Parameter | Default Value | Description | ++=======================================+========================+==============================================================+ +| hidden_layers_sizes | text: [] | Hidden layer sizes for layers before the embedding layers | +| | action_text: [] | for user messages and bot messages in previous actions | +| | label_action_text: [] | and labels. The number of hidden layers is | +| | | equal to the length of the corresponding list. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| dense_dimension | text: 128 | Dense dimension for sparse features to use after they are | +| | action_text: 128 | converted into dense features. | +| | label_action_text: 128 | | +| | intent: 20 | | +| | action_name: 20 | | +| | label_action_name: 20 | | +| | entities: 20 | | +| | slots: 20 | | +| | active_loop: 20 | | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| concat_dimension | text: 128 | Common dimension to which sequence and sentence features of | +| | action_text: 128 | different dimensions get converted before concatenation. 
| +| | label_action_text: 128 | | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| encoding_dimension | 50 | Dimension size of embedding vectors | +| | | before the dialogue transformer encoder. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| transformer_size | text: 128 | Number of units in user text sequence transformer encoder. | +| | action_text: 128 | Number of units in bot text sequence transformer encoder. | +| | label_action_text: 128 | Number of units in bot text sequence transformer encoder. | +| | dialogue: 128 | Number of units in dialogue transformer encoder. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| number_of_transformer_layers | text: 1 | Number of layers in user text sequence transformer encoder. | +| | action_text: 1 | Number of layers in bot text sequence transformer encoder. | +| | label_action_text: 1 | Number of layers in bot text sequence transformer encoder. | +| | dialogue: 1 | Number of layers in dialogue transformer encoder. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| number_of_attention_heads | 4 | Number of self-attention heads in transformers. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_key_relative_attention | False | If 'True' use key relative embeddings in attention. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_value_relative_attention | False | If 'True' use value relative embeddings in attention. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| max_relative_position | None | Maximum position for relative embeddings. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| batch_size | [64, 256] | Initial and final value for batch sizes. | +| | | Batch size will be linearly increased for each epoch. | +| | | If constant `batch_size` is required, pass an int, e.g. `8`. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| batch_strategy | "balanced" | Strategy used when creating batches. | +| | | Can be either 'sequence' or 'balanced'. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| epochs | 1 | Number of epochs to train. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| random_seed | None | Set random seed to any 'int' to get reproducible results. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| embedding_dimension | 20 | Dimension size of dialogue & system action embedding vectors.| ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| number_of_negative_examples | 20 | The number of incorrect labels. 
The algorithm will minimize | +| | | their similarity to the user input during training. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | +| | | or 'inner'. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| ranking_length | 10 | Number of top actions to normalize scores for loss type | +| | | 'softmax'. Set to 0 to turn off normalization. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | +| | | embedding vectors for correct labels. | +| | | Should be 0.0 < ... < 1.0 for 'cosine' similarity type. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| maximum_negative_similarity | -0.2 | Maximum negative similarity for incorrect labels. | +| | | Should be -1.0 < ... < 1.0 for 'cosine' similarity type. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_maximum_negative_similarity | True | If 'True' the algorithm only minimizes maximum similarity | +| | | over incorrect intent labels, used only if 'loss_type' is | +| | | set to 'margin'. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| scale_loss | True | Scale loss inverse proportionally to confidence of correct | +| | | prediction. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| regularization_constant | 0.001 | The scale of regularization. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| negative_margin_scale | 0.8 | The scale of how important it is to minimize the maximum | +| | | similarity between embeddings of different labels. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| drop_rate_dialogue | 0.1 | Dropout rate for embedding layers of dialogue features. | +| | | Value should be between 0 and 1. | +| | | The higher the value the higher the regularization effect. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| drop_rate_label | 0.0 | Dropout rate for embedding layers of label features. | +| | | Value should be between 0 and 1. | +| | | The higher the value the higher the regularization effect. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| drop_rate_attention | 0.0 | Dropout rate for attention. Value should be between 0 and 1. | +| | | The higher the value the higher the regularization effect. 
| ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| weight_sparsity | 0.8 | Sparsity of the weights in dense layers. | +| | | Value should be between 0 and 1. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_sparse_input_dropout | True | If 'True' apply dropout to sparse input tensors. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_dense_input_dropout | True | If 'True' apply dropout to sparse features after they are | +| | | converted into dense features. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| evaluate_every_number_of_epochs | 20 | How often to calculate validation accuracy. | +| | | Set to '-1' to evaluate just once at the end of training. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| evaluate_on_number_of_examples | 0 | How many examples to use for hold out validation set. | +| | | Large values may hurt performance, e.g. model accuracy. | +| | | Keep at 0 if your data set contains a lot of unique examples | +| | | of dialogue turns. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| tensorboard_log_directory | None | If you want to use tensorboard to visualize training | +| | | metrics, set this option to a valid output directory. You | +| | | can view the training metrics after training in tensorboard | +| | | via 'tensorboard --logdir '. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| tensorboard_log_level | "epoch" | Define when training metrics for tensorboard should be | +| | | logged. Either after every epoch ('epoch') or for every | +| | | training step ('minibatch'). | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| checkpoint_model | False | Save the best performing model during training. Models are | +| | | stored to the location specified by `--out`. Only the one | +| | | best model will be saved. | +| | | Requires `evaluate_on_number_of_examples > 0` and | +| | | `evaluate_every_number_of_epochs > 0` | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| e2e_confidence_threshold | 0.5 | The threshold that ensures that end-to-end is picked only if | +| | | the policy is confident enough. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| featurizers | [] | List of featurizer names (alias names). Only features | +| | | coming from the listed names are used. If list is empty | +| | | all available features are used. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| entity_recognition | True | If 'True' entity recognition is trained and entities are | +| | | extracted. 
| ++---------------------------------------+------------------------+--------------------------------------------------------------+ ``` :::note diff --git a/docs/docs/stories.mdx b/docs/docs/stories.mdx index 7cd907d47575..e50895c1e446 100644 --- a/docs/docs/stories.mdx +++ b/docs/docs/stories.mdx @@ -10,7 +10,8 @@ abstract: Stories are a type of training data used to train your assistant's dia A story is a representation of a conversation between a user and an AI assistant, converted into a specific format where user inputs are expressed as intents -(and entities when necessary), while the assistant's responses and actions are expressed as action names. +(and entities when necessary), +while the assistant's responses and actions are expressed as action names. Here's an example of a dialogue in the Rasa story format: @@ -32,6 +33,7 @@ stories: - action: utter_ask_num_people ``` + ### User Messages While writing stories, you do not have to deal with the specific contents of @@ -185,3 +187,56 @@ into a single file for evaluation. Read more about this format in [Testing Your This format is only used for testing and cannot be used for training. ::: + + +## End-to-end Training + +:::caution experimental feature +End-to-end training is an experimental feature. +We introduce experimental features to get feedback from our community, so we encourage you to try it out! +However, the functionality might be changed or removed in the future. +If you have feedback (positive or negative) please share it with us on the [Rasa Forum](https://forum.rasa.com). + +::: + +With end-to-end training, you do not have to deal with the specific +intents of the messages that are extracted by the NLU pipeline +or with separate `utter_` responses in the domain file. +Instead, you can include the text of the user messages and/or bot responses directly in your stories. +See the [training data format](./training-data-format.mdx#end-to-end-training) +for detailed description of how to write end-to-end stories. + +You can mix training data in the end-to-end format with labeled training data which has +`intent`s and `action`s specified: Stories can have some steps defined by intents/actions +and other steps defined directly by user or bot utterances. + +We call it end-to-end training because policies can consume and predict actual text. +For end-to-end user inputs, intents classified by the NLU pipeline +and extracted entities are ignored. + + +Only [Rule Policy](./policies.mdx#rule-policy) +and [TED Policy](./policies.mdx#ted-policy) allow end-to-end training. + +- `RulePolicy` uses simple string matching during prediction. Namely, + rules based on user text will only match if the user + text strings inside your rules and input during prediction are identical. + +- `TEDPolicy` passes user text through an additional Neural Network to create + hidden representations of the text. In order to obtain robust performance you + need to provide enough training stories to capture a variety of user texts for any + end-to-end dialogue turn. + +Rasa policies are trained for next utterance selection. +The only difference to creating `utter_` response is how `TEDPolicy` featurizes +bot utterances. +In case of an `utter_` action, `TEDPolicy` sees only the name of the action, while +if you provide actual utterance using `bot` key, +`TEDPolicy` will featurize it as textual input depending on the NLU configuration. +This can help in case of similar utterances in slightly different situations. 
+However, this can also make things harder to learn because the fact that different +utterances have similar texts make it easier for `TEDPolicy` to confuse these utterances. + +End-to-end training requires significantly more parameters in `TEDPolicy`. +Therefore, training an end-to-end model might require significant computational +resources depending on how many end-to-end turns you have in your stories. diff --git a/docs/docs/training-data-format.mdx b/docs/docs/training-data-format.mdx index 8ec5d5ae71d4..1cb0e6d316d7 100644 --- a/docs/docs/training-data-format.mdx +++ b/docs/docs/training-data-format.mdx @@ -381,11 +381,11 @@ stories: Each step can be one of the following: - A [user message](#user-messages), represented by **intent** and **entities**. - - An [or statement](#or-statement), which includes two or more user messages under it - - A bot [action](#actions) - - A [form](#forms) - - A [slot was set](#slots) event - - A [checkpoint](#checkpoints), which connects the story to another story + - An [or statement](#or-statement), which includes two or more user messages under it. + - A bot [action](#actions). + - A [form](#forms). + - A [slot was set](#slots) event. + - A [checkpoint](#checkpoints), which connects the story to another story. #### User Messages @@ -401,13 +401,14 @@ messages the users can send with the same meaning. User messages follow the format: -```yaml-rasa +```yaml-rasa {4-6} stories: - story: user message structure steps: - intent: intent_name # Required entities: # Optional - entity_name: entity_value + - action: action_name ``` For example, to represent the sentence @@ -685,3 +686,84 @@ rasa test If you want to know more about testing head over to [Testing Your Assistant](testing-your-assistant.mdx). + + +## End-to-end Training + +:::caution experimental feature +End-to-end training is an experimental feature. +We introduce experimental features to get feedback from our community, so we encourage you to try it out! +However, the functionality might be changed or removed in the future. +If you have feedback (positive or negative) please share it with us on the [Rasa Forum](https://forum.rasa.com). + +::: + +With [end-to-end training](stories.mdx#end-to-end-training), you do not have to deal with the specific +intents of the messages that are extracted by the NLU pipeline. +Instead, you can put the text of the user message directly in the stories, +by using `user` key. + +These end-to-end user messages follow the format: + +```yaml-rasa {4} +stories: +- story: user message structure + steps: + - user: the actual text of the user message + - action: action_name +``` + +In addition, you can add entity tags that can be extracted +by the [TED Policy](./policies.mdx#ted-policy). +The syntax for entity tags is the same as in +[the NLU training data](./training-data-format.mdx#entities). +For example, the following story contains the user utterance +` I can always go for sushi`. By using the syntax from the NLU training data +`[sushi](cuisine)`, you can mark `sushi` as an entity of type `cuisine`. + +```yaml-rasa {4} +stories: +- story: story with entities + steps: + - user: I can always go for [sushi](cuisine) + - action: utter_suggest_cuisine +``` + + +Similarly, you can put bot utterances directly in the stories, +by using the `bot` key followed by the text that you want your bot to say. 
+ +A story with only a bot utterance might look like this: + +```yaml-rasa {7} +stories: +- story: story with an end-to-end response + steps: + - intent: greet + entities: + name: Ivan + - bot: Hello, a person with a name! +``` + +You can also have a mixed end-to-end story: + +```yaml-rasa +stories: +- story: full end-to-end story + steps: + - intent: greet + entities: + name: Ivan + - bot: Hello, a person with a name! + - intent: search_restaurant + - action: utter_suggest_cuisine + - user: I can always go for [sushi](cuisine) + - bot: Personally, I prefer pizza, but sure let's search sushi restaurants + - action: utter_suggest_cuisine + - user: Have a beautiful day! + - action: utter_goodbye +``` + +Rasa end-to-end training is fully integrated with standard Rasa approach. +It means that you can have mixed stories with some steps defined by actions or intents +and other steps defined directly by user messages or bot responses. diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index d2d31c4ecdcd..06e75f358ed6 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -1,4 +1,3 @@ -import copy import logging from pathlib import Path from collections import defaultdict @@ -40,7 +39,7 @@ from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS, ACTION_LISTEN_NAME from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.core.generator import TrackerWithCachedStates -from rasa.utils import train_utils +import rasa.utils.train_utils from rasa.utils.tensorflow.models import RasaModel, TransformerRasaModel from rasa.utils.tensorflow.model_data import ( RasaModelData, @@ -145,14 +144,11 @@ class TEDPolicy(Policy): # please make sure to update the docs when changing a default parameter defaults = { # ## Architecture of the used neural network - # Hidden layer sizes for layers before the dialogue and label embedding layers. - # The number of hidden layers is equal to the length of the corresponding - # list. # Hidden layer sizes for layers before the embedding layers for user message # and labels. - # The number of hidden layers is equal to the length of the corresponding - # list. + # The number of hidden layers is equal to the length of the corresponding list. HIDDEN_LAYERS_SIZES: {TEXT: [], ACTION_TEXT: [], f"{LABEL}_{ACTION_TEXT}": []}, + # Dense dimension to use for sparse features. DENSE_DIMENSION: { TEXT: 128, ACTION_TEXT: 128, @@ -164,16 +160,24 @@ class TEDPolicy(Policy): SLOTS: 20, ACTIVE_LOOP: 20, }, + # Default dimension to use for concatenating sequence and sentence features. CONCAT_DIMENSION: {TEXT: 128, ACTION_TEXT: 128, f"{LABEL}_{ACTION_TEXT}": 128}, + # Dimension size of embedding vectors before the dialogue transformer encoder. 
ENCODING_DIMENSION: 50, - # Number of units in sequence transformer - TRANSFORMER_SIZE: 128, - # Number of sequence transformer layers - NUM_TRANSFORMER_LAYERS: 1, - # Number of units in dialogue transformer - f"{DIALOGUE}_{TRANSFORMER_SIZE}": 128, - # Number of dialogue transformer layers - f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}": 1, + # Number of units in transformer encoders + TRANSFORMER_SIZE: { + TEXT: 128, + ACTION_TEXT: 128, + f"{LABEL}_{ACTION_TEXT}": 128, + DIALOGUE: 128, + }, + # Number of layers in transformer encoders + NUM_TRANSFORMER_LAYERS: { + TEXT: 1, + ACTION_TEXT: 1, + f"{LABEL}_{ACTION_TEXT}": 1, + DIALOGUE: 1, + }, # Number of attention heads in transformer NUM_HEADS: 4, # If 'True' use key relative embeddings in attention @@ -235,7 +239,7 @@ class TEDPolicy(Policy): # Dropout rate for embedding layers of label, e.g. action, features. DROP_RATE_LABEL: 0.0, # Dropout rate for attention. - DROP_RATE_ATTENTION: 0, + DROP_RATE_ATTENTION: 0.0, # Sparsity of the weights in dense layers WEIGHT_SPARSITY: 0.8, # If 'True' apply dropout to sparse input tensors @@ -314,13 +318,12 @@ def __init__( self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None def _load_params(self, **kwargs: Dict[Text, Any]) -> None: - self.config = copy.deepcopy(self.defaults) - self.config.update(kwargs) - - self.config = train_utils.check_deprecated_options(self.config) - - self.config = train_utils.update_similarity_type(self.config) - self.config = train_utils.update_evaluation_parameters(self.config) + new_config = rasa.utils.train_utils.check_core_deprecated_options(kwargs) + self.config = rasa.utils.train_utils.override_defaults( + self.defaults, new_config + ) + self.config = rasa.utils.train_utils.update_similarity_type(self.config) + self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config) def _create_entity_tag_specs(self) -> List[EntityTagSpec]: """Create entity tag specifications with their respective tag id mappings.""" @@ -606,7 +609,9 @@ def predict_action_probabilities( confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) if self.config[LOSS_TYPE] == SOFTMAX and self.config[RANKING_LENGTH] > 0: - confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH]) + confidence = rasa.utils.train_utils.normalize( + confidence, self.config[RANKING_LENGTH] + ) optional_events = self._create_optional_event_for_entities( output, is_e2e_prediction, interpreter, tracker @@ -776,7 +781,7 @@ def load( model_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data ) - meta = train_utils.update_similarity_type(meta) + meta = rasa.utils.train_utils.update_similarity_type(meta) meta[EPOCHS] = epoch_override @@ -903,8 +908,8 @@ def _prepare_layers(self) -> None: self._prepare_transformer_layer( DIALOGUE, - self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"], - self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"], + self.config[NUM_TRANSFORMER_LAYERS][DIALOGUE], + self.config[TRANSFORMER_SIZE][DIALOGUE], self.config[DROP_RATE_DIALOGUE], self.config[DROP_RATE_ATTENTION], ) @@ -1133,8 +1138,8 @@ def _encode_fake_features_per_attribute( # if the input features are fake, we don't process them further, # but we need to calculate correct last dim (units) so that tf could infer # the last shape of the tensors - if self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"] > 0: - text_transformer_units = self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"] + if self.config[NUM_TRANSFORMER_LAYERS][TEXT] > 0: + 
text_transformer_units = self.config[TRANSFORMER_SIZE][TEXT] elif self.config[HIDDEN_LAYERS_SIZES][TEXT]: text_transformer_units = self.config[HIDDEN_LAYERS_SIZES][TEXT][-1] else: @@ -1472,8 +1477,8 @@ def _reshape_for_entities( # broadcast the dialogue transformer output sequence-length-times to get the # same shape as the text sequence transformer output - dialogue_transformer_output = tf.broadcast_to( - dialogue_transformer_output, tf.shape(text_transformer_output) + dialogue_transformer_output = tf.tile( + dialogue_transformer_output, (1, tf.shape(text_transformer_output)[1], 1) ) # concat the output of the dialogue transformer to the output of the text diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 71ecbaace1eb..d1f26fec25fd 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -139,8 +139,7 @@ def required_components(cls) -> List[Type[Component]]: # ## Architecture of the used neural network # Hidden layer sizes for layers before the embedding layers for user message # and labels. - # The number of hidden layers is equal to the length of the corresponding - # list. + # The number of hidden layers is equal to the length of the corresponding list. HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []}, # Whether to share the hidden layer weights between user message and labels. SHARE_HIDDEN_LAYERS: False, @@ -174,7 +173,7 @@ def required_components(cls) -> List[Type[Component]]: # ## Parameters for embeddings # Dimension size of embedding vectors EMBEDDING_DIMENSION: 20, - # Default dense dimension to use if no dense features are present. + # Dense dimension to use for sparse features. DENSE_DIMENSION: {TEXT: 128, LABEL: 20}, # Default dimension to use for concatenating sequence and sentence features. CONCAT_DIMENSION: {TEXT: 128, LABEL: 20}, diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py index 3bea874e0914..73a31751ea14 100644 --- a/rasa/nlu/components.py +++ b/rasa/nlu/components.py @@ -4,6 +4,7 @@ import typing from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple, Type, Iterable +import rasa.utils.train_utils from rasa.exceptions import MissingDependencyException from rasa.shared.exceptions import RasaException from rasa.shared.nlu.constants import TRAINABLE_EXTRACTORS @@ -442,7 +443,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: # this is important for e.g. 
persistence component_config["name"] = self.name - self.component_config = rasa.nlu.config.override_defaults( + self.component_config = rasa.utils.train_utils.override_defaults( self.defaults, component_config ) diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py index 00678ced09e7..afdb39af1e17 100644 --- a/rasa/nlu/config.py +++ b/rasa/nlu/config.py @@ -1,18 +1,16 @@ -import copy import logging import os -import ruamel.yaml as yaml from typing import Any, Dict, List, Optional, Text, Union -from rasa.shared.exceptions import InvalidConfigException, RasaException +from rasa.shared.exceptions import InvalidConfigException import rasa.shared.utils.io import rasa.utils.io from rasa.shared.constants import ( DOCS_URL_PIPELINE, - DOCS_URL_MIGRATION_GUIDE, DEFAULT_CONFIG_PATH, ) from rasa.shared.utils.io import json_to_string +import rasa.utils.train_utils logger = logging.getLogger(__name__) @@ -53,32 +51,24 @@ def _load_from_dict(config: Dict, **kwargs: Any) -> "RasaNLUModelConfig": return RasaNLUModelConfig(config) -def override_defaults( - defaults: Optional[Dict[Text, Any]], custom: Optional[Dict[Text, Any]] -) -> Dict[Text, Any]: - if defaults: - cfg = copy.deepcopy(defaults) - else: - cfg = {} - - if custom: - for key in custom.keys(): - if isinstance(cfg.get(key), dict): - cfg[key].update(custom[key]) - else: - cfg[key] = custom[key] - - return cfg - - def component_config_from_pipeline( index: int, pipeline: List[Dict[Text, Any]], defaults: Optional[Dict[Text, Any]] = None, ) -> Dict[Text, Any]: + """Get config of the component with the given index in the pipeline. + + Args: + index: index the component in the pipeline + pipeline: a list of component configs in the NLU pipeline + defaults: default config of the component + + Returns: + config of the component + """ try: c = pipeline[index] - return override_defaults(defaults, c) + return rasa.utils.train_utils.override_defaults(defaults, c) except IndexError: rasa.shared.utils.io.raise_warning( f"Tried to get configuration value for component " @@ -86,13 +76,17 @@ def component_config_from_pipeline( f"Returning `defaults`.", docs=DOCS_URL_PIPELINE, ) - return override_defaults(defaults, {}) + return rasa.utils.train_utils.override_defaults(defaults, {}) class RasaNLUModelConfig: + """A class that stores NLU model configuration parameters.""" + def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> None: - """Create a model configuration, optionally overriding - defaults with a dictionary ``configuration_values``. + """Create a model configuration. + + Args: + configuration_values: optional dictionary to override defaults. 
""" if not configuration_values: configuration_values = {} diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py index de29ae67cfb6..8950c862775b 100644 --- a/rasa/nlu/utils/bilou_utils.py +++ b/rasa/nlu/utils/bilou_utils.py @@ -1,10 +1,7 @@ import logging from collections import defaultdict, Counter -from typing import List, Tuple, Text, Optional, Dict, Any +from typing import List, Tuple, Text, Optional, Dict, Any, TYPE_CHECKING -from rasa.nlu.tokenizers.tokenizer import Token -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message from rasa.nlu.constants import ( TOKENS_NAMES, BILOU_ENTITIES, @@ -22,6 +19,11 @@ NO_ENTITY_TAG, ) +if TYPE_CHECKING: + from rasa.nlu.tokenizers.tokenizer import Token + from rasa.shared.nlu.training_data.training_data import TrainingData + from rasa.shared.nlu.training_data.message import Message + logger = logging.getLogger(__name__) BEGINNING = "B-" @@ -58,7 +60,7 @@ def tag_without_prefix(tag: Text) -> Text: def bilou_tags_to_ids( - message: Message, + message: "Message", tag_id_dict: Dict[Text, int], tag_name: Text = ENTITY_ATTRIBUTE_TYPE, ) -> List[int]: @@ -115,7 +117,7 @@ def remove_bilou_prefixes(tags: List[Text]) -> List[Text]: def build_tag_id_dict( - training_data: TrainingData, tag_name: Text = ENTITY_ATTRIBUTE_TYPE + training_data: "TrainingData", tag_name: Text = ENTITY_ATTRIBUTE_TYPE ) -> Optional[Dict[Text, int]]: """Create a mapping of unique tags to ids. @@ -151,7 +153,7 @@ def build_tag_id_dict( return tag_id_dict -def apply_bilou_schema(training_data: TrainingData) -> None: +def apply_bilou_schema(training_data: "TrainingData") -> None: """Get a list of BILOU entity tags and set them on the given messages. Args: @@ -176,7 +178,7 @@ def apply_bilou_schema(training_data: TrainingData) -> None: def map_message_entities( - message: Message, attribute_key: Text = ENTITY_ATTRIBUTE_TYPE + message: "Message", attribute_key: Text = ENTITY_ATTRIBUTE_TYPE ) -> List[Tuple[int, int, Text]]: """Maps the entities of the given message to their start, end, and tag values. @@ -203,7 +205,7 @@ def convert_entity(entity: Dict[Text, Any]) -> Tuple[int, int, Text]: def bilou_tags_from_offsets( - tokens: List[Token], entities: List[Tuple[int, int, Text]] + tokens: List["Token"], entities: List[Tuple[int, int, Text]] ) -> List[Text]: """Creates BILOU tags for the given tokens and entities. 
diff --git a/rasa/nlu/utils/mitie_utils.py b/rasa/nlu/utils/mitie_utils.py index 91d37cc392d7..4631e30ee641 100644 --- a/rasa/nlu/utils/mitie_utils.py +++ b/rasa/nlu/utils/mitie_utils.py @@ -3,7 +3,8 @@ from typing import Any, Dict, List, Optional, Text from rasa.nlu.components import Component -from rasa.nlu.config import RasaNLUModelConfig, override_defaults +from rasa.nlu.config import RasaNLUModelConfig +import rasa.utils.train_utils from rasa.nlu.model import Metadata if typing.TYPE_CHECKING: @@ -37,7 +38,9 @@ def create( ) -> "MitieNLP": import mitie - component_config = override_defaults(cls.defaults, component_config) + component_config = rasa.utils.train_utils.override_defaults( + cls.defaults, component_config + ) model_file = component_config.get("model") if not model_file: diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index 3186da23a243..4392b4a96921 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -3,7 +3,8 @@ from typing import Any, Dict, List, Optional, Text, Tuple from rasa.nlu.components import Component -from rasa.nlu.config import RasaNLUModelConfig, override_defaults +from rasa.nlu.config import RasaNLUModelConfig +import rasa.utils.train_utils from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.nlu.model import InvalidModelError @@ -64,7 +65,9 @@ def create( cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig ) -> "SpacyNLP": - component_config = override_defaults(cls.defaults, component_config) + component_config = rasa.utils.train_utils.override_defaults( + cls.defaults, component_config + ) spacy_model_name = component_config.get("model") diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index 083e2d895fa7..1f3235a0c143 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -757,7 +757,7 @@ def _prepare_transformer_layer( drop_rate_attention: float, prefix: Text = "transformer", ): - if self.config[NUM_TRANSFORMER_LAYERS] > 0: + if num_layers > 0: self._tf_layers[f"{prefix}.{name}"] = TransformerEncoder( num_layers, units, @@ -857,10 +857,21 @@ def _prepare_input_layers(self, name: Text) -> None: def _prepare_sequence_layers(self, name: Text) -> None: self._prepare_input_layers(name) + + if isinstance(self.config[TRANSFORMER_SIZE], int): + size = self.config[TRANSFORMER_SIZE] + else: + size = self.config[TRANSFORMER_SIZE][name] + + if isinstance(self.config[NUM_TRANSFORMER_LAYERS], int): + num_layers = self.config[NUM_TRANSFORMER_LAYERS] + else: + num_layers = self.config[NUM_TRANSFORMER_LAYERS][name] + self._prepare_transformer_layer( name, - self.config[NUM_TRANSFORMER_LAYERS], - self.config[TRANSFORMER_SIZE], + num_layers, + size, self.config[DROP_RATE], self.config[DROP_RATE_ATTENTION], ) @@ -1052,7 +1063,12 @@ def _create_sequence( transformer_inputs, 1 - mask, self._training ) - if self.config[NUM_TRANSFORMER_LAYERS] > 0: + if isinstance(self.config[NUM_TRANSFORMER_LAYERS], int): + num_layers = self.config[NUM_TRANSFORMER_LAYERS] + else: + num_layers = self.config[NUM_TRANSFORMER_LAYERS][name] + + if num_layers > 0: # apply activation outputs = tfa.activations.gelu(outputs) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 6d620038ebe1..fb9ea1faf6ed 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1,5 +1,5 @@ from typing import Optional, Text, Dict, Any, Union, List, Tuple, TYPE_CHECKING - 
+import copy import numpy as np import rasa.shared.utils.common @@ -7,7 +7,6 @@ import rasa.nlu.utils.bilou_utils from rasa.shared.constants import NEXT_MAJOR_VERSION_FOR_DEPRECATIONS from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS -from rasa.nlu.tokenizers.tokenizer import Token import rasa.utils.io as io_utils from rasa.utils.tensorflow.constants import ( LOSS_TYPE, @@ -20,10 +19,17 @@ AUTO, INNER, COSINE, + TRANSFORMER_SIZE, + NUM_TRANSFORMER_LAYERS, + DENSE_DIMENSION, ) +from rasa.shared.nlu.constants import ACTION_NAME, INTENT, ENTITIES +from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS +from rasa.core.constants import DIALOGUE if TYPE_CHECKING: from rasa.nlu.classifiers.diet_classifier import EntityTagSpec + from rasa.nlu.tokenizers.tokenizer import Token def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: @@ -61,7 +67,7 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: def align_token_features( - list_of_tokens: List[List[Token]], + list_of_tokens: List[List["Token"]], in_token_features: np.ndarray, shape: Optional[Tuple] = None, ) -> np.ndarray: @@ -155,40 +161,104 @@ def _replace_deprecated_option( config: Dict[Text, Any], warn_until_version: Text = NEXT_MAJOR_VERSION_FOR_DEPRECATIONS, ) -> Dict[Text, Any]: - if old_option in config: - if isinstance(new_option, str): - rasa.shared.utils.io.raise_deprecation_warning( - f"Option '{old_option}' got renamed to '{new_option}'. " - f"Please update your configuration file.", - warn_until_version=warn_until_version, - ) - config[new_option] = config[old_option] - else: - rasa.shared.utils.io.raise_deprecation_warning( - f"Option '{old_option}' got renamed to " - f"a dictionary '{new_option[0]}' with a key '{new_option[1]}'. " - f"Please update your configuration file.", - warn_until_version=warn_until_version, - ) - option_dict = config.get(new_option[0], {}) - option_dict[new_option[1]] = config[old_option] - config[new_option[0]] = option_dict + if old_option not in config: + return {} + + if isinstance(new_option, str): + rasa.shared.utils.io.raise_deprecation_warning( + f"Option '{old_option}' got renamed to '{new_option}'. " + f"Please update your configuration file.", + warn_until_version=warn_until_version, + ) + return {new_option: config[old_option]} - return config + rasa.shared.utils.io.raise_deprecation_warning( + f"Option '{old_option}' got renamed to " + f"a dictionary '{new_option[0]}' with a key '{new_option[1]}'. " + f"Please update your configuration file.", + warn_until_version=warn_until_version, + ) + return {new_option[0]: {new_option[1]: config[old_option]}} def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]: - """ + """Update the config according to changed config params. + If old model configuration parameters are present in the provided config, replace them with the new parameters and log a warning. + Args: config: model configuration Returns: updated model configuration """ + # note: call _replace_deprecated_option() here when there are options to deprecate + + return config + +def check_core_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]: + """Update the core config according to changed config params. + + If old model configuration parameters are present in the provided config, replace + them with the new parameters and log a warning. 
+ + Args: + config: model configuration + + Returns: updated model configuration + """ # note: call _replace_deprecated_option() here when there are options to deprecate + new_config = {} + if isinstance(config.get(TRANSFORMER_SIZE), int): + new_config = override_defaults( + new_config, + _replace_deprecated_option( + TRANSFORMER_SIZE, [TRANSFORMER_SIZE, DIALOGUE], config + ), + ) + + if isinstance(config.get(NUM_TRANSFORMER_LAYERS), int): + new_config = override_defaults( + new_config, + _replace_deprecated_option( + NUM_TRANSFORMER_LAYERS, [NUM_TRANSFORMER_LAYERS, DIALOGUE], config + ), + ) + if isinstance(config.get(DENSE_DIMENSION), int): + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, INTENT], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, ACTION_NAME], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, ENTITIES], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, SLOTS], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, ACTIVE_LOOP], config + ), + ) + + config.update(new_config) return config @@ -236,3 +306,32 @@ def entity_label_to_tags( confidence_values[tag_spec.tag_name] = confidences return predicted_tags, confidence_values + + +def override_defaults( + defaults: Optional[Dict[Text, Any]], custom: Optional[Dict[Text, Any]] +) -> Dict[Text, Any]: + """Override default config with the given config. + + We cannot use `dict.update` method because configs contain nested dicts. + + Args: + defaults: default config + custom: user config containing new parameters + + Returns: + updated config + """ + if defaults: + config = copy.deepcopy(defaults) + else: + config = {} + + if custom: + for key in custom.keys(): + if isinstance(config.get(key), dict): + config[key].update(custom[key]) + else: + config[key] = custom[key] + + return config diff --git a/tests/docs/test_docs_training_data.py b/tests/docs/test_docs_training_data.py index 706b44599fa2..1008f7248b9c 100644 --- a/tests/docs/test_docs_training_data.py +++ b/tests/docs/test_docs_training_data.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Text +from typing import List, Text, Tuple import re import pytest @@ -28,7 +28,7 @@ def test_docs_training_data(mdx_file_path: Path): mdx_content = handle.read() matches = TRAINING_DATA_CODEBLOCK_RE.finditer(mdx_content) - lines_with_errors: List[Text] = [] + lines_with_errors: List[Tuple[Text, Text]] = [] for match in matches: yaml_path = match.group("yaml_path") @@ -46,11 +46,14 @@ def test_docs_training_data(mdx_file_path: Path): for schema in schemas_to_try: try: rasa.shared.utils.validation.validate_yaml_schema(codeblock, schema) - except ValueError: - lines_with_errors.append(str(line_number)) + except ValueError as error: + lines_with_errors.append((str(line_number), str(error))) if lines_with_errors: + error_details = "\n\n" + "\n".join( + f" - At line {line}: {error} " for line, error in lines_with_errors + ) raise AssertionError( f"({mdx_file_path}): Invalid training data found " - f"at line{'s' if len(lines_with_errors) > 1 else ''} {', '.join(lines_with_errors)}" + f"at line{'s' if len(lines_with_errors) > 1 else ''}: {error_details}" )
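The renamed dictionary-valued `TEDPolicy` options rely on the `override_defaults` helper that this change moves into `rasa/utils/train_utils.py`: unlike a plain `dict.update`, it merges nested dictionaries key by key, so a user who sets only `transformer_size.dialogue` keeps the defaults for `text`, `action_text`, and `label_action_text`. Below is a minimal standalone sketch of that merge behavior; the defaults are abbreviated and the value `256` is only an illustration.

```python
from copy import deepcopy
from typing import Any, Dict, Optional, Text


def override_defaults(
    defaults: Optional[Dict[Text, Any]], custom: Optional[Dict[Text, Any]]
) -> Dict[Text, Any]:
    """Merge a user config into the defaults, updating nested dicts key by key
    instead of replacing them (mirrors the helper added in train_utils.py)."""
    config = deepcopy(defaults) if defaults else {}
    for key, value in (custom or {}).items():
        if isinstance(config.get(key), dict):
            config[key].update(value)
        else:
            config[key] = value
    return config


# Abbreviated TEDPolicy defaults for one of the dictionary-valued options.
defaults = {
    "transformer_size": {
        "text": 128, "action_text": 128, "label_action_text": 128, "dialogue": 128
    }
}

# An old-style scalar `transformer_size: 256` is first rewritten by
# check_core_deprecated_options into {"transformer_size": {"dialogue": 256}};
# the nested merge then leaves the remaining sub-keys at their defaults.
migrated = override_defaults(defaults, {"transformer_size": {"dialogue": 256}})
assert migrated["transformer_size"] == {
    "text": 128, "action_text": 128, "label_action_text": 128, "dialogue": 256
}
```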