diff --git a/examples/e2ebot/data/stories.yml b/examples/e2ebot/data/stories.yml index bf884abf1856..cab5ea7113ca 100644 --- a/examples/e2ebot/data/stories.yml +++ b/examples/e2ebot/data/stories.yml @@ -10,7 +10,7 @@ stories: - story: sad path (text to text) steps: - - user: "Hello" + - user: "[Hello](bla)" - bot: "Welcome to moodbot. How are you feeling today?" - user: "Horrible" - bot: "Oh no! Here is a kitten photo. Did it help?" diff --git a/examples/e2ebot/domain.yml b/examples/e2ebot/domain.yml index 17b3faba2a75..5c35c3d83a7b 100644 --- a/examples/e2ebot/domain.yml +++ b/examples/e2ebot/domain.yml @@ -1,5 +1,5 @@ version: "2.0" - +# TODO create a bot that makes sense actions: - utter_greet - utter_happy @@ -9,3 +9,6 @@ actions: intents: - greet - mood_great + +entities: + - bla diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py index 3b86d4962cd3..e702134a0bf6 100644 --- a/rasa/core/featurizers/single_state_featurizer.py +++ b/rasa/core/featurizers/single_state_featurizer.py @@ -1,10 +1,11 @@ import logging import numpy as np import scipy.sparse -from typing import List, Optional, Dict, Text, Set +from typing import List, Optional, Dict, Text, Set, Any from collections import defaultdict import rasa.shared.utils.io +from rasa.nlu.constants import TOKENS_NAMES from rasa.shared.core.domain import SubState, State, Domain from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter from rasa.shared.core.constants import PREVIOUS_ACTION, ACTIVE_LOOP, USER, SLOTS @@ -16,9 +17,15 @@ ACTION_TEXT, ACTION_NAME, INTENT, + TEXT, + NO_ENTITY_TAG, + ENTITY_ATTRIBUTE_TYPE, + ENTITY_TAGS, ) from rasa.shared.nlu.training_data.features import Features from rasa.shared.nlu.training_data.message import Message +from rasa.utils.tensorflow.model_data_utils import TAG_ID_ORIGIN +from rasa.utils.tensorflow.constants import IDS logger = logging.getLogger(__name__) @@ -36,6 +43,23 @@ class SingleStateFeaturizer: def __init__(self) -> None: self._default_feature_states = {} self.action_texts = [] + self.entity_tag_id_mapping = {} + + def get_entity_tag_ids(self) -> Dict[Text, int]: + """Returns the tag to index mapping for entities. + + Returns: + Tag to index mapping. + """ + if ENTITIES not in self._default_feature_states: + return {} + + tag_ids = { + tag: idx + 1 # +1 to keep 0 for the NO_ENTITY_TAG + for tag, idx in self._default_feature_states[ENTITIES].items() + } + tag_ids[NO_ENTITY_TAG] = 0 + return tag_ids def prepare_from_domain(self, domain: Domain) -> None: """Gets necessary information for featurization from domain. 
@@ -55,6 +79,7 @@ def convert_to_dict(feature_states: List[Text]) -> Dict[Text, int]: self._default_feature_states[SLOTS] = convert_to_dict(domain.slot_states) self._default_feature_states[ACTIVE_LOOP] = convert_to_dict(domain.form_names) self.action_texts = domain.action_texts + self.entity_tag_id_mapping = self.get_entity_tag_ids() def _state_features_for_attribute( self, sub_state: SubState, attribute: Text @@ -84,7 +109,7 @@ def _create_features( features = np.zeros(len(self._default_feature_states[attribute]), np.float32) for state_feature, value in state_features.items(): - # check that the value is in default_feature_states to be able to assigh + # check that the value is in default_feature_states to be able to assign # its value if state_feature in self._default_feature_states[attribute]: features[self._default_feature_states[attribute][state_feature]] = value @@ -215,6 +240,51 @@ def encode_state( return state_features + def encode_entities( + self, entity_data: Dict[Text, Any], interpreter: NaturalLanguageInterpreter + ) -> Dict[Text, List["Features"]]: + """Encode the given entity data with the help of the given interpreter. + + Produce numeric entity tags for tokens. + + Args: + entity_data: The dict containing the text and entity labels and locations + interpreter: The interpreter used to encode the state + + Returns: + A dictionary of entity type to list of features. + """ + from rasa.nlu.test import determine_token_labels + + # TODO + # The entity states used to create the tag-idx-mapping contains the + # entities and the concatenated entity and roles/groups. We do not + # distinguish between entities and roles/groups right now. + # TODO + # Should we support BILOU tagging? + + if TEXT not in entity_data or len(self.entity_tag_id_mapping) < 2: + # we cannot build a classifier if there are fewer than 2 classes + return {} + + parsed_text = interpreter.featurize_message(Message({TEXT: entity_data[TEXT]})) + entities = entity_data.get(ENTITIES, []) + + _tags = [] + for token in parsed_text.get(TOKENS_NAMES[TEXT]): + _tag = determine_token_labels( + token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE + ) + # TODO handle if tag is not in mapping + _tags.append(self.entity_tag_id_mapping[_tag]) + + # transpose to have seq_len x 1 + return { + ENTITY_TAGS: [ + Features(np.array([_tags]).T, IDS, ENTITY_TAGS, TAG_ID_ORIGIN,) + ] + } + def _encode_action( self, action: Text, interpreter: NaturalLanguageInterpreter ) -> Dict[Text, List["Features"]]: diff --git a/rasa/core/featurizers/tracker_featurizers.py b/rasa/core/featurizers/tracker_featurizers.py index 4f250c535ea8..d12b92a42e79 100644 --- a/rasa/core/featurizers/tracker_featurizers.py +++ b/rasa/core/featurizers/tracker_featurizers.py @@ -3,15 +3,15 @@ import jsonpickle import logging -from rasa.shared.nlu.constants import TEXT, INTENT +from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES from rasa.shared.exceptions import RasaException from tqdm import tqdm -from typing import Tuple, List, Optional, Dict, Text, Union +from typing import Tuple, List, Optional, Dict, Text, Union, Any import numpy as np from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer from rasa.shared.core.domain import State, Domain -from rasa.shared.core.events import ActionExecuted +from rasa.shared.core.events import ActionExecuted, UserUttered from rasa.shared.core.trackers import ( DialogueStateTracker, is_prev_action_listen_in_state, @@ -91,6 +91,43 @@ def _convert_labels_to_ids( ] ) + def _create_entity_tags( + self,
trackers_as_entities: List[List[Dict[Text, Any]]], + interpreter: NaturalLanguageInterpreter, + ) -> List[List[Dict[Text, List["Features"]]]]: + return [ + [ + self.state_featurizer.encode_entities(entity_data, interpreter) + for entity_data in trackers_entities + ] + for trackers_entities in trackers_as_entities + ] + + @staticmethod + def _entity_data(event: UserUttered) -> Dict[Text, Any]: + if event.text: + return {TEXT: event.text, ENTITIES: event.entities} + + # input is not textual, so add empty dict + return {} + + def training_states_actions_and_entities( + self, trackers: List[DialogueStateTracker], domain: Domain + ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]: + """Transforms list of trackers to lists of states, actions and entity data. + + Args: + trackers: The trackers to transform + domain: The domain + + Returns: + A tuple of list of states, list of actions and list of entity data. + """ + raise NotImplementedError( + "Featurizer must have the capacity to encode trackers to feature vectors" + ) + def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[State]], List[List[Text]]]: @@ -103,16 +140,23 @@ def training_states_and_actions( Returns: A tuple of list of states and list of actions. """ - raise NotImplementedError( - "Featurizer must have the capacity to encode trackers to feature vectors" - ) + ( + trackers_as_states, + trackers_as_actions, + _, + ) = self.training_states_actions_and_entities(trackers, domain) + return trackers_as_states, trackers_as_actions def featurize_trackers( self, trackers: List[DialogueStateTracker], domain: Domain, interpreter: NaturalLanguageInterpreter, - ) -> Tuple[List[List[Dict[Text, List["Features"]]]], np.ndarray]: + ) -> Tuple[ + List[List[Dict[Text, List["Features"]]]], + np.ndarray, + List[List[Dict[Text, List["Features"]]]], + ]: """Featurize the training trackers. Args: @@ -137,14 +181,17 @@ def featurize_trackers( self.state_featurizer.prepare_from_domain(domain) - trackers_as_states, trackers_as_actions = self.training_states_and_actions( - trackers, domain - ) + ( + trackers_as_states, + trackers_as_actions, + trackers_as_entities, + ) = self.training_states_actions_and_entities(trackers, domain) tracker_state_features = self._featurize_states(trackers_as_states, interpreter) label_ids = self._convert_labels_to_ids(trackers_as_actions, domain) + entity_tags = self._create_entity_tags(trackers_as_entities, interpreter) - return tracker_state_features, label_ids + return tracker_state_features, label_ids, entity_tags @staticmethod def _choose_last_user_input( @@ -252,23 +299,22 @@ class FullDialogueTrackerFeaturizer(TrackerFeaturizer): Training data is padded up to the length of the longest dialogue with -1. """ - def training_states_and_actions( + def training_states_actions_and_entities( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[State]], List[List[Text]]]: - """Transforms list of trackers to lists of states and actions. - - Training data is padded up to the length of the longest dialogue with -1. + ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]: + """Transforms list of trackers to lists of states, actions and entity data. Args: trackers: The trackers to transform domain: The domain Returns: - A tuple of list of states and list of actions. + A tuple of list of states, list of actions and list of entity data. 
""" trackers_as_states = [] trackers_as_actions = [] + trackers_as_entities = [] logger.debug( "Creating states and action examples from " @@ -285,7 +331,12 @@ def training_states_and_actions( delete_first_state = False actions = [] + entities = [] + entity_data = {} for event in tracker.applied_events(): + if isinstance(event, UserUttered): + entity_data = self._entity_data(event) + if not isinstance(event, ActionExecuted): continue @@ -293,6 +344,7 @@ def training_states_and_actions( # only actions which can be # predicted at a stories start actions.append(event.action_name or event.action_text) + entities.append(entity_data) else: # unpredictable actions can be # only the first in the story @@ -303,13 +355,17 @@ def training_states_and_actions( ) delete_first_state = True + # reset entity_data for the the next turn + entity_data = {} + if delete_first_state: states = states[1:] trackers_as_states.append(states[:-1]) trackers_as_actions.append(actions) + trackers_as_entities.append(entities) - return trackers_as_states, trackers_as_actions + return trackers_as_states, trackers_as_actions, trackers_as_entities def prediction_states( self, @@ -386,23 +442,22 @@ def _hash_example( frozen_actions = (action,) return hash((frozen_states, frozen_actions)) - def training_states_and_actions( + def training_states_actions_and_entities( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[State]], List[List[Text]]]: - """Transforms list of trackers to lists of states and actions. - - Training data is padded up to the length of the longest dialogue with -1. + ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]: + """Transforms list of trackers to lists of states, actions and entity data. Args: trackers: The trackers to transform domain: The domain Returns: - A tuple of list of states and list of actions. + A tuple of list of states, list of actions and list of entity data. """ trackers_as_states = [] trackers_as_actions = [] + trackers_as_entities = [] # from multiple states that create equal featurizations # we only need to keep one. 
@@ -422,7 +477,11 @@ def training_states_and_actions( states = self._create_states(tracker, domain) states_length_for_action = 0 + entity_data = {} for event in tracker.applied_events(): + if isinstance(event, UserUttered): + entity_data = self._entity_data(event) + if not isinstance(event, ActionExecuted): continue @@ -448,15 +507,19 @@ def training_states_and_actions( trackers_as_actions.append( [event.action_name or event.action_text] ) + trackers_as_entities.append([entity_data]) else: trackers_as_states.append(sliced_states) trackers_as_actions.append([event.action_name or event.action_text]) + trackers_as_entities.append([entity_data]) + # reset entity_data for the next turn + entity_data = {} pbar.set_postfix({"# actions": "{:d}".format(len(trackers_as_actions))}) logger.debug("Created {} action examples.".format(len(trackers_as_actions))) - return trackers_as_states, trackers_as_actions + return trackers_as_states, trackers_as_actions, trackers_as_entities def prediction_states( self, diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 0d5db5a6c01b..6517b8e3362b 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -143,7 +143,11 @@ def featurize_for_training( domain: Domain, interpreter: NaturalLanguageInterpreter, **kwargs: Any, - ) -> Tuple[List[List[Dict[Text, List["Features"]]]], np.ndarray]: + ) -> Tuple[ + List[List[Dict[Text, List["Features"]]]], + np.ndarray, + List[List[Dict[Text, List["Features"]]]], + ]: """Transform training trackers into a vector representation. The trackers, consisting of multiple turns, will be transformed @@ -163,7 +167,7 @@ def featurize_for_training( trackers """ - state_features, label_ids = self.featurizer.featurize_trackers( + state_features, label_ids, entity_tags = self.featurizer.featurize_trackers( training_trackers, domain, interpreter ) @@ -175,8 +179,9 @@ def featurize_for_training( ) state_features = state_features[:max_training_samples] label_ids = label_ids[:max_training_samples] + entity_tags = entity_tags[:max_training_samples] - return state_features, label_ids + return state_features, label_ids, entity_tags def train( self, diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index 93abf244d931..0126e60e15f7 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -233,7 +233,7 @@ def train( interpreter: NaturalLanguageInterpreter, **kwargs: Any, ) -> None: - tracker_state_features, label_ids = self.featurize_for_training( + tracker_state_features, label_ids, _ = self.featurize_for_training( training_trackers, domain, interpreter, **kwargs ) training_data, zero_state_features = model_data_utils.convert_to_data_format( diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 98f9c8cd3215..b228ea4ec2e8 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -11,6 +11,7 @@ from typing import Any, List, Optional, Text, Dict, Tuple, Union, TYPE_CHECKING import rasa.utils.io as io_utils +from rasa.nlu.classifiers.diet_classifier import EntityTagSpec from rasa.shared.core.domain import Domain from rasa.core.featurizers.tracker_featurizers import ( TrackerFeaturizer, @@ -26,6 +27,8 @@ ENTITIES, VALID_FEATURE_TYPES, FEATURE_TYPE_SENTENCE, + ENTITY_ATTRIBUTE_TYPE, + ENTITY_TAGS, ) from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter from rasa.core.policies.policy import Policy @@ -43,6 +46,7 @@ from 
rasa.utils.tensorflow.model_data_utils import convert_to_data_format from rasa.utils.tensorflow.constants import ( LABEL, + IDS, TRANSFORMER_SIZE, NUM_TRANSFORMER_LAYERS, NUM_HEADS, @@ -91,6 +95,7 @@ MASK, HIDDEN_LAYERS_SIZES, FEATURIZERS, + ENTITY_RECOGNITION, ) @@ -101,8 +106,9 @@ logger = logging.getLogger(__name__) LABEL_KEY = LABEL -LABEL_SUB_KEY = "ids" +LABEL_SUB_KEY = IDS LENGTH = "length" +INDICES = "indices" SENTENCE_FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT] SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT, f"{LABEL}_{ACTION_TEXT}"] LABEL_FEATURES_TO_ENCODE = [f"{LABEL}_{ACTION_NAME}", f"{LABEL}_{ACTION_TEXT}"] @@ -135,7 +141,6 @@ class TEDPolicy(Policy): # Hidden layer sizes for layers before the dialogue and label embedding layers. # The number of hidden layers is equal to the length of the corresponding # list. - # TODO add 2 parallel NNs: transformer for text and ffnn for names # Hidden layer sizes for layers before the embedding layers for user message # and labels. # The number of hidden layers is equal to the length of the corresponding @@ -154,10 +159,14 @@ class TEDPolicy(Policy): }, CONCAT_DIMENSION: {TEXT: 128, ACTION_TEXT: 128, f"{LABEL}_{ACTION_TEXT}": 128}, ENCODING_DIMENSION: 50, - # Number of units in transformer + # Number of units in sequence transformer TRANSFORMER_SIZE: 128, - # Number of transformer layers + # Number of sequence transformer layers NUM_TRANSFORMER_LAYERS: 1, + # Number of units in dialogue transformer + f"{DIALOGUE}_{TRANSFORMER_SIZE}": 128, + # Number of dialogue transformer layers + f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}": 1, # Number of attention heads in transformer NUM_HEADS: 4, # If 'True' use key relative embeddings in attention @@ -250,6 +259,8 @@ class TEDPolicy(Policy): # Specify what features to use as sequence and sentence features. # By default all features in the pipeline are used. FEATURIZERS: [], + # If set to true, entities are predicted in user utterances. 
+ ENTITY_RECOGNITION: True, } @staticmethod @@ -264,7 +275,8 @@ def __init__( priority: int = DEFAULT_POLICY_PRIORITY, max_history: Optional[int] = None, model: Optional[RasaModel] = None, - zero_state_features: Optional[Dict[Text, List["Features"]]] = None, + fake_features: Optional[Dict[Text, List["Features"]]] = None, + entity_tag_specs: Optional[List[EntityTagSpec]] = None, **kwargs: Any, ) -> None: """Declare instance variables with default values.""" @@ -282,7 +294,9 @@ def __init__( self.model = model - self.zero_state_features = zero_state_features or defaultdict(list) + self._entity_tag_specs = entity_tag_specs + + self.fake_features = fake_features or defaultdict(list) self._label_data: Optional[RasaModelData] = None self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None @@ -296,6 +310,27 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None: self.config = train_utils.update_similarity_type(self.config) self.config = train_utils.update_evaluation_parameters(self.config) + def _create_entity_tag_specs(self) -> List[EntityTagSpec]: + """Create entity tag specifications with their respective tag id mappings.""" + + _tag_specs = [] + + tag_id_index_mapping = self.featurizer.state_featurizer.get_entity_tag_ids() + + if tag_id_index_mapping: + _tag_specs.append( + EntityTagSpec( + tag_name=ENTITY_ATTRIBUTE_TYPE, + tags_to_ids=tag_id_index_mapping, + ids_to_tags={ + value: key for key, value in tag_id_index_mapping.items() + }, + num_tags=len(tag_id_index_mapping), + ) + ) + + return _tag_specs + def _create_label_data( self, domain: Domain, interpreter: NaturalLanguageInterpreter ) -> Tuple[RasaModelData, List[Dict[Text, List["Features"]]]]: @@ -329,6 +364,7 @@ def _create_model_data( self, tracker_state_features: List[List[Dict[Text, List["Features"]]]], label_ids: Optional[np.ndarray] = None, + entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None, encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None, ) -> RasaModelData: """Combine all model related data into RasaModelData. 
@@ -346,7 +382,11 @@ def _create_model_data( """ model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY) - if label_ids is not None and encoded_all_labels is not None: + if ( + label_ids is not None + and entity_tags is not None + and encoded_all_labels is not None + ): label_ids = np.array( [np.expand_dims(seq_label_ids, -1) for seq_label_ids in label_ids] @@ -357,14 +397,27 @@ def _create_model_data( [FeatureArray(label_ids, number_of_dimensions=3)], ) - attribute_data, self.zero_state_features = convert_to_data_format( + attribute_data, self.fake_features = convert_to_data_format( tracker_state_features, featurizers=self.config[FEATURIZERS] ) + if self.config[ENTITY_RECOGNITION]: + # check that there are real entity tags + if any([any(turn_tags) for turn_tags in entity_tags]): + entity_tags_data, _ = convert_to_data_format(entity_tags) + model_data.add_data(entity_tags_data) + else: + # there are no "real" entity tags + logger.debug( + f"Entity recognition cannot be performed, " + f"set {ENTITY_RECOGNITION} to False" + ) + self.config[ENTITY_RECOGNITION] = False + else: # method is called during prediction attribute_data, _ = convert_to_data_format( tracker_state_features, - self.zero_state_features, + self.fake_features, featurizers=self.config[FEATURIZERS], ) @@ -384,6 +437,9 @@ def _create_model_data( FeatureArray(dialogue_lengths, number_of_dimensions=1) ] + # make sure all keys are in the same order during training and prediction + model_data.sort() + return model_data def train( @@ -403,7 +459,7 @@ def train( return # dealing with training data - tracker_state_features, label_ids = self.featurize_for_training( + tracker_state_features, label_ids, entity_tags = self.featurize_for_training( training_trackers, domain, interpreter, **kwargs ) @@ -413,7 +469,7 @@ def train( # extract actual training data to feed to model model_data = self._create_model_data( - tracker_state_features, label_ids, encoded_all_labels + tracker_state_features, label_ids, entity_tags, encoded_all_labels ) if model_data.is_empty(): logger.error( @@ -422,6 +478,9 @@ def train( ) return + if self.config[ENTITY_RECOGNITION]: + self._entity_tag_specs = self._create_entity_tag_specs() + # keep one example for persisting and loading self.data_example = model_data.first_data_example() @@ -430,6 +489,7 @@ def train( self.config, isinstance(self.featurizer, MaxHistoryTrackerFeaturizer), self._label_data, + self._entity_tag_specs, ) self.model.fit( @@ -458,7 +518,7 @@ def predict_action_probabilities( # create model data from tracker tracker_state_features = [] if ( - INTENT in self.zero_state_features + INTENT in self.fake_features or not tracker.latest_action_name == ACTION_LISTEN_NAME ): # the first example in a batch uses intent @@ -467,7 +527,7 @@ def predict_action_probabilities( [tracker], domain, interpreter, use_text_for_last_user_input=False ) if ( - TEXT in self.zero_state_features + TEXT in self.fake_features and tracker.latest_action_name == ACTION_LISTEN_NAME ): # the second - text, but only after user utterance @@ -489,6 +549,7 @@ def predict_action_probabilities( if ( len(tracker_state_features) == 2 and np.max(confidences[1]) > self.config[E2E_CONFIDENCE_THRESHOLD] + # TODO maybe comparing confidences is better and np.max(similarities[1]) > np.max(similarities[0]) ): batch_index = 1 @@ -499,7 +560,7 @@ def predict_action_probabilities( else: # only one tracker present batch_index = 0 if tracker.latest_action_name == ACTION_LISTEN_NAME: - if TEXT in self.zero_state_features: + if TEXT 
in self.fake_features: is_e2e_prediction = True else: is_e2e_prediction = False @@ -547,14 +608,24 @@ def persist(self, path: Union[Text, Path]) -> None: model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl", self.data_example ) io_utils.pickle_dump( - model_path / f"{SAVE_MODEL_FILE_NAME}.zero_state_features.pkl", - self.zero_state_features, + model_path / f"{SAVE_MODEL_FILE_NAME}.fake_features.pkl", + self.fake_features, ) io_utils.pickle_dump( model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl", dict(self._label_data.data), ) + entity_tag_specs = ( + [tag_spec._asdict() for tag_spec in self._entity_tag_specs] + if self._entity_tag_specs + else [] + ) + rasa.shared.utils.io.dump_obj_as_json_to_file( + model_path / f"{SAVE_MODEL_FILE_NAME}.entity_tag_specs.json", + entity_tag_specs, + ) + @classmethod def load(cls, path: Union[Text, Path]) -> "TEDPolicy": """Loads a policy from the storage. @@ -581,14 +652,30 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy": label_data = io_utils.pickle_load( model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl" ) - zero_state_features = io_utils.pickle_load( - model_path / f"{SAVE_MODEL_FILE_NAME}.zero_state_features.pkl" + fake_features = io_utils.pickle_load( + model_path / f"{SAVE_MODEL_FILE_NAME}.fake_features.pkl" ) label_data = RasaModelData(data=label_data) meta = io_utils.pickle_load(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl") priority = io_utils.json_unpickle( model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl" ) + entity_tag_specs = rasa.shared.utils.io.read_json_file( + model_path / f"{SAVE_MODEL_FILE_NAME}.entity_tag_specs.json" + ) + entity_tag_specs = [ + EntityTagSpec( + tag_name=tag_spec["tag_name"], + ids_to_tags={ + int(key): value for key, value in tag_spec["ids_to_tags"].items() + }, + tags_to_ids={ + key: int(value) for key, value in tag_spec["tags_to_ids"].items() + }, + num_tags=tag_spec["num_tags"], + ) + for tag_spec in entity_tag_specs + ] model_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data @@ -600,10 +687,11 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy": model_data_example, data_signature=model_data_example.get_signature(), config=meta, - max_history_tracker_featurizer_used=isinstance( - featurizer, MaxHistoryTrackerFeaturizer - ), + # during prediction we don't care about previous dialogue turns, + # so to save computation time, use only the last one + use_only_last_dialogue_turns=True, label_data=label_data, + entity_tag_specs=entity_tag_specs, ) # build the graph for prediction @@ -624,7 +712,8 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy": featurizer=featurizer, priority=priority, model=model, - zero_state_features=zero_state_features, + fake_features=fake_features, + entity_tag_specs=entity_tag_specs, **meta, ) @@ -634,12 +723,13 @@ def __init__( self, data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]], config: Dict[Text, Any], - max_history_tracker_featurizer_used: bool, + use_only_last_dialogue_turns: bool, label_data: RasaModelData, + entity_tag_specs: Optional[List[EntityTagSpec]], ) -> None: super().__init__("TED", config, data_signature, label_data) - self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used + self.use_only_last_dialogue_turns = use_only_last_dialogue_turns self.predict_data_signature = { feature_name: features @@ -648,13 +738,19 @@ def __init__( in STATE_LEVEL_FEATURES + SENTENCE_FEATURES_TO_ENCODE + [DIALOGUE] } + self._entity_tag_specs = entity_tag_specs + # optimizer 
self.optimizer = tf.keras.optimizers.Adam() # metrics self.action_loss = tf.keras.metrics.Mean(name="loss") self.action_acc = tf.keras.metrics.Mean(name="acc") + self.entity_loss = tf.keras.metrics.Mean(name="e_loss") + self.entity_f1 = tf.keras.metrics.Mean(name="e_f1") self.metrics_to_log += ["loss", "acc"] + if self.config[ENTITY_RECOGNITION]: + self.metrics_to_log += ["e_loss", "e_f1"] # needed for efficient prediction self.all_labels_embed: Optional[tf.Tensor] = None @@ -681,6 +777,8 @@ def _check_data(self) -> None: f"Cannot train '{self.__class__.__name__}' model." ) + # ---CREATING LAYERS HELPERS--- + def _prepare_layers(self) -> None: for name in self.data_signature.keys(): self._prepare_sparse_dense_layer_for(name, self.data_signature) @@ -695,7 +793,11 @@ def _prepare_layers(self) -> None: self._prepare_encoding_layers(name) self._prepare_transformer_layer( - DIALOGUE, self.config[DROP_RATE_DIALOGUE], self.config[DROP_RATE_ATTENTION] + DIALOGUE, + self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"], + self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"], + self.config[DROP_RATE_DIALOGUE], + self.config[DROP_RATE_ATTENTION], ) self._prepare_embed_layers(DIALOGUE) @@ -703,6 +805,9 @@ def _prepare_layers(self) -> None: self._prepare_dot_product_loss(LABEL, self.config[SCALE_LOSS]) + if self.config[ENTITY_RECOGNITION]: + self._prepare_entity_recognition_layers() + def _prepare_sparse_dense_layer_for( self, name: Text, signature: Dict[Text, Dict[Text, List[FeatureSignature]]] ) -> None: @@ -757,16 +862,39 @@ def _prepare_encoding_layers(self, name: Text) -> None: f"{name}", [self.config[ENCODING_DIMENSION]], self.config[DROP_RATE_DIALOGUE], + prefix="encoding_layer", ) + # ---GRAPH BUILDING HELPERS--- + + @staticmethod + def _compute_dialogue_indices( + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]] + ) -> None: + dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32) + # wrap in a list, because that's the structure of tf_batch_data + tf_batch_data[DIALOGUE][INDICES] = [ + ( + tf.map_fn( + tf.range, + dialogue_lengths, + fn_output_signature=tf.RaggedTensorSpec( + shape=[None], dtype=tf.int32 + ), + ) + ).values + ] + def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0] # labels cannot have all features "fake" - all_labels_encoded = { - key: self._encode_real_features_per_attribute(self.tf_label_data, key) - for key in self.tf_label_data.keys() - if key != LABEL_KEY - } + all_labels_encoded = {} + for key in self.tf_label_data.keys(): + if key != LABEL_KEY: + attribute_features, _, _ = self._encode_real_features_per_attribute( + self.tf_label_data, key + ) + all_labels_encoded[key] = attribute_features if ( all_labels_encoded.get(f"{LABEL_KEY}_{ACTION_TEXT}") is not None @@ -788,11 +916,11 @@ def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]: return all_label_ids, all_labels_embed - def _emebed_dialogue( + def _embed_dialogue( self, dialogue_in: tf.Tensor, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], - ) -> Tuple[tf.Tensor, tf.Tensor]: + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Create dialogue level embedding and mask.""" dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32) mask = self._compute_mask(dialogue_lengths) @@ -802,7 +930,7 @@ def _emebed_dialogue( ) dialogue_transformed = tfa.activations.gelu(dialogue_transformed) - if self.max_history_tracker_featurizer_used: + if self.use_only_last_dialogue_turns: # pick last vector 
if max history featurizer is used dialogue_transformed = tf.expand_dims( self._last_token(dialogue_transformed, dialogue_lengths), 1 @@ -811,11 +939,11 @@ def _emebed_dialogue( dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed) - return dialogue_embed, mask + return dialogue_embed, mask, dialogue_transformed def _encode_features_per_attribute( self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text - ) -> tf.Tensor: + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: # The input is a representation of 4d tensor of # shape (batch-size x dialogue-len x sequence-len x units) in 3d of shape # (sum of dialogue history length for all tensors in the batch x @@ -839,30 +967,131 @@ def _encode_features_per_attribute( lambda: self._encode_fake_features_per_attribute(tf_batch_data, attribute), ) + def _get_dense_units( + self, attribute_features_list: List[tf.Tensor], attribute: Text + ) -> int: + # TODO this should be done in corresponding layers once in init + units = 0 + for f in attribute_features_list: + if isinstance(f, tf.SparseTensor): + units += self.config[DENSE_DIMENSION][attribute] + else: + units += f.shape[-1] + return units + + def _get_concat_units( + self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text + ) -> int: + # TODO this should be done in corresponding layers once in init + # calculate concat sequence sentence dim + sentence_units = self._get_dense_units( + tf_batch_data[attribute][SENTENCE], attribute + ) + sequence_units = self._get_dense_units( + tf_batch_data[attribute][SEQUENCE], attribute + ) + + if sequence_units and not sentence_units: + return sequence_units + + if sentence_units and not sequence_units: + return sentence_units + + if sentence_units != sequence_units: + return self.config[CONCAT_DIMENSION][TEXT] + + return sentence_units + def _encode_fake_features_per_attribute( self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text - ) -> tf.Tensor: - attribute_features_list = tf_batch_data[attribute][SENTENCE] + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + # we need to create real zero tensors with appropriate batch and dialogue dim + # because they are passed to dialogue transformer attribute_mask = tf_batch_data[attribute][MASK][0] batch_dim = tf.shape(attribute_mask)[0] dialogue_dim = tf.shape(attribute_mask)[1] - if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE): units = self.config[ENCODING_DIMENSION] else: - units = 0 - for f in attribute_features_list: - if isinstance(f, tf.SparseTensor): - units += self.config[DENSE_DIMENSION][attribute] - else: - units += f.shape[-1] + units = self._get_dense_units(tf_batch_data[attribute][SENTENCE], attribute) + + attribute_features = tf.zeros( + (batch_dim, dialogue_dim, units), dtype=tf.float32 + ) + if attribute == TEXT: + # if the input features are fake, we don't process them further, + # but we need to calculate correct last dim (units) so that tf could infer + # the last shape of the tensors + if self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"] > 0: + text_transformer_units = self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"] + elif self.config[HIDDEN_LAYERS_SIZES][TEXT]: + text_transformer_units = self.config[HIDDEN_LAYERS_SIZES][TEXT][-1] + else: + text_transformer_units = self._get_concat_units( + tf_batch_data, attribute + ) + + text_transformer_output = tf.zeros( + (0, 0, text_transformer_units), dtype=tf.float32 + ) + text_sequence_lengths = tf.zeros((0, 1), dtype=tf.int32) + else: + # simulate 
None with empty tensor of zeros + text_transformer_output = tf.zeros((0,)) + text_sequence_lengths = tf.zeros((0,)) + + return attribute_features, text_transformer_output, text_sequence_lengths - return tf.zeros((batch_dim, dialogue_dim, units), dtype=tf.float32) + @staticmethod + def _create_last_dialogue_turns_mask( + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text + ) -> tf.Tensor: + # Since use_only_last_dialogue_turns is True, + # we need to find the locations of last dialogue turns in + # (combined batch dimension and dialogue length,) dimension, + # so that we can use `_sequence_lengths` as a boolean mask to pick + # which ones are "real" textual input in these last dialogue turns. + + # In order to do that we can use the given `dialogue_lengths`. + # For example: + # If we have `dialogue_lengths = [2, 1, 3]`, then + # `dialogue_indices = [0, 1, 0, 0, 1, 2]` here we can spot that `0` + # always indicates the first dialogue turn, + # which means that the previous dialogue turn is the last dialogue turn. + # Combining this with the fact that the last element in + # `dialogue_indices` is always the last dialogue turn, we can add + # a `0` to the end, getting + # `_dialogue_indices = [0, 1, 0, 0, 1, 2, 0]`. + # Then removing the first element + # `_last_dialogue_turn_inverse_indicator = [1, 0, 0, 1, 2, 0]` + # we see that `0` points to the last dialogue turn. + # We convert all positive numbers to `True` and take + # the inverse mask to get + # `last_dialogue_mask = [0, 1, 1, 0, 0, 1]`, + # which precisely corresponds to the fact that the first dialogue is of + # length 2, the second 1 and the third 3. last_dialogue_turn_mask = tf.math.logical_not( + tf.cast( + tf.concat( + [ + tf_batch_data[DIALOGUE][INDICES][0], + tf.zeros((1,), dtype=tf.int32), + ], + axis=0, + )[1:], + dtype=tf.bool, + ) + ) + # get only the indices of real inputs + return tf.boolean_mask( + last_dialogue_turn_mask, + tf.reshape(tf_batch_data[attribute][SEQUENCE_LENGTH][0], (-1,)), + ) def _encode_real_features_per_attribute( self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text - ) -> tf.Tensor: + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Encodes features for a given attribute.
Args: @@ -873,21 +1102,25 @@ def _encode_real_features_per_attribute( Returns: A tensor combining all features for `attribute` """ + # simulate None with empty tensor of zeros + text_transformer_output = tf.zeros((0,)) + text_sequence_lengths = tf.zeros((0,)) + if attribute in SEQUENCE_FEATURES_TO_ENCODE: # sequence_lengths contain `0` for "fake" features, while # tf_batch_data[attribute] contain only "real" features - _sequence_lengths = tf_batch_data[attribute][SEQUENCE_LENGTH][0] + sequence_lengths = tf_batch_data[attribute][SEQUENCE_LENGTH][0] # extract only nonzero lengths and cast to int - _sequence_lengths = tf.cast( - tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32 + sequence_lengths = tf.cast( + tf.boolean_mask(sequence_lengths, sequence_lengths), dtype=tf.int32 ) # boolean mask returns flat tensor - _sequence_lengths = tf.expand_dims(_sequence_lengths, axis=-1) + sequence_lengths = tf.expand_dims(sequence_lengths, axis=-1) mask_sequence_text = tf.squeeze( - self._compute_mask(_sequence_lengths), axis=1 + self._compute_mask(sequence_lengths), axis=1 ) - sequence_lengths = _sequence_lengths + 1 + sequence_lengths += 1 mask_text = tf.squeeze(self._compute_mask(sequence_lengths), axis=1) attribute_features, _, _, _ = self._create_sequence( @@ -902,7 +1135,22 @@ def _encode_real_features_per_attribute( sequence_ids=False, ) - # TODO entities + if attribute == TEXT: + text_transformer_output = attribute_features + text_sequence_lengths = sequence_lengths + + if self.use_only_last_dialogue_turns: + # get the location of all last dialogue inputs + last_dialogue_turns_mask = self._create_last_dialogue_turns_mask( + tf_batch_data, attribute + ) + # pick last vector if max history featurizer is used + text_transformer_output = tf.boolean_mask( + text_transformer_output, last_dialogue_turns_mask + ) + text_sequence_lengths = tf.boolean_mask( + text_sequence_lengths, last_dialogue_turns_mask + ) # resulting attribute features will have shape # combined batch dimension and dialogue length x 1 x units @@ -917,43 +1165,34 @@ def _encode_real_features_per_attribute( # resulting attribute features will have shape # combined batch dimension and dialogue length x 1 x units attribute_features = self._combine_sparse_dense_features( - tf_batch_data[attribute][SENTENCE], f"{attribute}_{SENTENCE}", + tf_batch_data[attribute][SENTENCE], f"{attribute}_{SENTENCE}" ) - if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE): - attribute_features = self._tf_layers[f"ffnn.{attribute}"]( + if attribute in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE: + attribute_features = self._tf_layers[f"encoding_layer.{attribute}"]( attribute_features ) - # attribute_mask has shape batch x dialogue_len x 1 - attribute_mask = tf_batch_data[attribute][MASK][0] - - if attribute in set(SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES): - dialogue_lengths = tf.cast( - tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32 - ) - else: - # for labels, dialogue length is a fake dim and equal to 1 - dialogue_lengths = tf.ones((tf.shape(attribute_mask)[0],), dtype=tf.int32) - # attribute features have shape # (combined batch dimension and dialogue length x 1 x units) # convert them back to their original shape of # batch size x dialogue length x units - return self._convert_to_original_shape( - attribute_features, attribute_mask, dialogue_lengths + attribute_features = self._convert_to_original_shape( + attribute_features, tf_batch_data, attribute ) + return attribute_features, 
text_transformer_output, text_sequence_lengths + @staticmethod def _convert_to_original_shape( attribute_features: tf.Tensor, - attribute_mask: tf.Tensor, - dialogue_lengths: tf.Tensor, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + attribute: Text, ) -> tf.Tensor: """Transform attribute features back to original shape. - Given shape: combined batch and dialogue dimension x 1 x units - Original shape: batch x dialogue length x units + Given shape: (combined batch and dialogue dimension x 1 x units) + Original shape: (batch x dialogue length x units) Args: attribute_features: the "real" features to convert @@ -967,12 +1206,25 @@ def _convert_to_original_shape( """ # in order to convert the attribute features with shape - # combined batch-size and dialogue length x 1 x units - # to a shape of batch-size x dialogue length x units - # we use tf.scatter_nd. Therefore, we need to the target shape and the indices + # (combined batch-size and dialogue length x 1 x units) + # to a shape of (batch-size x dialogue length x units) + # we use tf.scatter_nd. Therefore, we need the target shape and the indices # mapping the values of attribute features to the position in the resulting # tensor. + # attribute_mask has shape batch x dialogue_len x 1 + attribute_mask = tf_batch_data[attribute][MASK][0] + + if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES: + dialogue_lengths = tf.cast( + tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32 + ) + dialogue_indices = tf_batch_data[DIALOGUE][INDICES][0] + else: + # for labels, dialogue length is a fake dim and equal to 1 + dialogue_lengths = tf.ones((tf.shape(attribute_mask)[0],), dtype=tf.int32) + dialogue_indices = tf.zeros((tf.shape(attribute_mask)[0],), dtype=tf.int32) + batch_dim = tf.shape(attribute_mask)[0] dialogue_dim = tf.shape(attribute_mask)[1] units = attribute_features.shape[-1] @@ -981,17 +1233,9 @@ def _convert_to_original_shape( attribute_mask = tf.cast(tf.squeeze(attribute_mask, axis=-1), dtype=tf.int32) # sum of attribute mask contains number of dialogue turns with "real" features non_fake_dialogue_lengths = tf.reduce_sum(attribute_mask, axis=-1) - + # create the batch indices batch_indices = tf.repeat(tf.range(batch_dim), non_fake_dialogue_lengths) - dialogue_indices = ( - tf.map_fn( - tf.range, - dialogue_lengths, - fn_output_signature=tf.RaggedTensorSpec(shape=[None], dtype=tf.int32), - ) - ).values - # attribute_mask has shape (batch x dialogue_len x 1), while # dialogue_indices has shape (combined_dialogue_len,) # in order to find positions of real input we need to flatten @@ -1005,12 +1249,13 @@ def _convert_to_original_shape( indices = tf.stack([batch_indices, dialogue_indices], axis=1) shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units]) + attribute_features = tf.squeeze(attribute_features, axis=1) - return tf.scatter_nd(indices, tf.squeeze(attribute_features, axis=1), shape) + return tf.scatter_nd(indices, attribute_features, shape) def _process_batch_data( self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]] - ) -> tf.Tensor: + ) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: """Encodes batch data. Combines intent and text and action name and action text if both are present. 
@@ -1022,11 +1267,22 @@ def _process_batch_data( Tensor: encoding of all features in the batch, combined; """ # encode each attribute present in tf_batch_data - batch_encoded = { - key: self._encode_features_per_attribute(tf_batch_data, key) - for key in tf_batch_data.keys() - if LABEL_KEY not in key and DIALOGUE not in key - } + text_transformer_output = None + text_sequence_lengths = None + batch_encoded = {} + for attribute in tf_batch_data.keys(): + if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES: + ( + attribute_features, + _text_transformer_output, + _text_sequence_lengths, + ) = self._encode_features_per_attribute(tf_batch_data, attribute) + + batch_encoded[attribute] = attribute_features + if attribute == TEXT: + text_transformer_output = _text_transformer_output + text_sequence_lengths = _text_sequence_lengths + # if both action text and action name are present, combine them; otherwise, # return the one which is present @@ -1060,7 +1316,131 @@ def _process_batch_data( batch_features = tf.concat(batch_features, axis=-1) - return batch_features + return batch_features, text_transformer_output, text_sequence_lengths + + def _reshape_for_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + # To calculate the loss for entities we need the output of the text + # sequence transformer (shape: real entity dim x + # sequence length x units), the output of the dialogue transformer + # (shape: batch size x dialogue length x units) and the tag ids for the + # entities (shape: real entity dim x sequence length - 1 x units) + # The real entity dimension for the text sequence transformer + # and the tag ids matches. + # In order to process the tensors, they need to have the same shape. + # Convert the output of the dialogue transformer to shape + # (real entity dim x 1 x units). + # Note: The CRF layer cannot handle 4D tensors. E.g. 
we cannot use the shape + # batch size x dialogue length x sequence length x units + + # convert the output of the dialogue transformer + # to shape (real entity dim x 1 x units) + attribute_mask = tf_batch_data[TEXT][MASK][0] + dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32) + + if self.use_only_last_dialogue_turns: + # pick last vector if max history featurizer is used + attribute_mask = tf.expand_dims( + self._last_token(attribute_mask, dialogue_lengths), axis=1 + ) + dialogue_transformer_output = tf.boolean_mask( + dialogue_transformer_output, tf.squeeze(attribute_mask, axis=-1) + ) + + # boolean mask removed axis=1, add it back + dialogue_transformer_output = tf.expand_dims( + dialogue_transformer_output, axis=1 + ) + + # broadcast the dialogue transformer output sequence-length-times to get the + # same shape as the text sequence transformer output + dialogue_transformer_output = tf.broadcast_to( + dialogue_transformer_output, tf.shape(text_transformer_output) + ) + + # concat the output of the dialogue transformer to the output of the text + # sequence transformer (adding context) + # resulting shape + # (real entity dim x sequence length x 2 units) + text_transformed = tf.concat( + [text_transformer_output, dialogue_transformer_output], axis=-1 + ) + + text_mask = tf.squeeze(self._compute_mask(text_sequence_lengths), axis=1) + # add zeros to match the shape of text_transformed, because + # max sequence length might differ, since it is calculated dynamically + # based on a subset of sequence lengths + sequence_diff = tf.shape(text_transformed)[1] - tf.shape(text_mask)[1] + text_mask = tf.pad(text_mask, [[0, 0], [0, sequence_diff], [0, 0]]) + + # remove additional dims and sentence features + text_sequence_lengths = tf.reshape(text_sequence_lengths, (-1,)) - 1 + + return text_transformed, text_mask, text_sequence_lengths + + # ---TRAINING--- + + def _batch_loss_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> tf.Tensor: + # It could happen that some batches don't contain "real" features for `text`, + # e.g. large number of stories are intent only. + # Therefore actual `text_transformer_output` will be empty. + # We cannot create a loss with empty tensors. + # Since we need actual numbers to create a full loss, we output + # zero in this case. 
+ return tf.cond( + tf.shape(text_transformer_output)[0] > 0, + lambda: self._real_batch_loss_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ), + lambda: tf.constant(0.0), + ) + + def _real_batch_loss_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> tf.Tensor: + + text_transformed, text_mask, text_sequence_lengths = self._reshape_for_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + + tag_ids = tf_batch_data[ENTITY_TAGS][IDS][0] + # add a zero (no entity) for the sentence features to match the shape of inputs + sequence_diff = tf.shape(text_transformed)[1] - tf.shape(tag_ids)[1] + tag_ids = tf.pad(tag_ids, [[0, 0], [0, sequence_diff], [0, 0]]) + + loss, f1, _ = self._calculate_entity_loss( + text_transformed, + tag_ids, + text_mask, + text_sequence_lengths, + ENTITY_ATTRIBUTE_TYPE, + ) + + self.entity_loss.update_state(loss) + self.entity_f1.update_state(f1) + + return loss @staticmethod def _get_labels_embed( @@ -1086,18 +1466,27 @@ def batch_loss( The loss of the given batch. """ tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) + self._compute_dialogue_indices(tf_batch_data) all_label_ids, all_labels_embed = self._create_all_labels_embed() label_ids = tf_batch_data[LABEL_KEY][LABEL_SUB_KEY][0] labels_embed = self._get_labels_embed(label_ids, all_labels_embed) - dialogue_in = self._process_batch_data(tf_batch_data) - dialogue_embed, dialogue_mask = self._emebed_dialogue( - dialogue_in, tf_batch_data - ) + ( + dialogue_in, + text_transformer_output, + text_sequence_lengths, + ) = self._process_batch_data(tf_batch_data) + ( + dialogue_embed, + dialogue_mask, + dialogue_transformer_output, + ) = self._embed_dialogue(dialogue_in, tf_batch_data) dialogue_mask = tf.squeeze(dialogue_mask, axis=-1) + losses = [] + loss, acc = self._tf_layers[f"loss.{LABEL}"]( dialogue_embed, labels_embed, @@ -1106,11 +1495,28 @@ def batch_loss( all_label_ids, dialogue_mask, ) + losses.append(loss) + + if ( + self.config[ENTITY_RECOGNITION] + and text_transformer_output is not None + and text_sequence_lengths is not None + ): + losses.append( + self._batch_loss_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + ) self.action_loss.update_state(loss) self.action_acc.update_state(acc) - return loss + return tf.math.add_n(losses) + + # ---PREDICTION--- def prepare_for_predict(self) -> None: _, self.all_labels_embed = self._create_all_labels_embed() @@ -1135,11 +1541,18 @@ def batch_predict( tf_batch_data = self.batch_to_model_data_format( batch_in, self.predict_data_signature ) - - dialogue_in = self._process_batch_data(tf_batch_data) - dialogue_embed, dialogue_mask = self._emebed_dialogue( - dialogue_in, tf_batch_data - ) + self._compute_dialogue_indices(tf_batch_data) + + ( + dialogue_in, + text_transformer_output, + text_sequence_lengths, + ) = self._process_batch_data(tf_batch_data) + ( + dialogue_embed, + dialogue_mask, + dialogue_transformer_output, + ) = self._embed_dialogue(dialogue_in, tf_batch_data) dialogue_mask = tf.squeeze(dialogue_mask, axis=-1) sim_all = self._tf_layers[f"loss.{LABEL}"].sim( @@ -1151,8 +1564,73 @@ def batch_predict( scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( sim_all, self.config[SIMILARITY_TYPE] ) 
+ predictions = {"action_scores": scores, "similarities": sim_all} + + if ( + self.config[ENTITY_RECOGNITION] + and text_transformer_output is not None + and text_sequence_lengths is not None + ): + pred_ids, confidences = self._batch_predict_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + name = ENTITY_ATTRIBUTE_TYPE + predictions[f"e_{name}_ids"] = pred_ids + predictions[f"e_{name}_scores"] = confidences + + return predictions + + def _batch_predict_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor]: + # It could happen that current prediction turn don't contain + # "real" features for `text`, + # Therefore actual `text_transformer_output` will be empty. + # We cannot predict entities with empty tensors. + # Since we need to output some tensors of the same shape, we output + # zero tensors. + return tf.cond( + tf.shape(text_transformer_output)[0] > 0, + lambda: self._real_batch_predict_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ), + lambda: ( + # the output is of shape (batch_size, max_seq_len) + tf.zeros(tf.shape(text_transformer_output)[:2], dtype=tf.int32), + tf.zeros(tf.shape(text_transformer_output)[:2], dtype=tf.float32), + ), + ) + + def _real_batch_predict_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor]: + + text_transformed, _, text_sequence_lengths = self._reshape_for_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + + name = ENTITY_ATTRIBUTE_TYPE + + _logits = self._tf_layers[f"embed.{name}.logits"](text_transformed) - return {"action_scores": scores, "similarities": sim_all} + return self._tf_layers[f"crf.{name}"](_logits, text_sequence_lengths) # pytype: enable=key-error diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 09e6ea59b653..eb9268c02887 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -44,6 +44,7 @@ from rasa.nlu.model import Metadata from rasa.utils.tensorflow.constants import ( LABEL, + IDS, HIDDEN_LAYERS_SIZES, SHARE_HIDDEN_LAYERS, TRANSFORMER_SIZE, @@ -101,8 +102,7 @@ SPARSE = "sparse" DENSE = "dense" LABEL_KEY = LABEL -LABEL_SUB_KEY = "ids" -TAG_IDS = "tag_ids" +LABEL_SUB_KEY = IDS POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP] @@ -1309,22 +1309,6 @@ def _prepare_label_classification_layers(self) -> None: self._prepare_dot_product_loss(LABEL, self.config[SCALE_LOSS]) - def _prepare_entity_recognition_layers(self) -> None: - for tag_spec in self._entity_tag_specs: - name = tag_spec.tag_name - num_tags = tag_spec.num_tags - self._tf_layers[f"embed.{name}.logits"] = layers.Embed( - num_tags, self.config[REGULARIZATION_CONSTANT], f"logits.{name}" - ) - self._tf_layers[f"crf.{name}"] = layers.CRF( - num_tags, self.config[REGULARIZATION_CONSTANT], self.config[SCALE_LOSS] - ) - self._tf_layers[f"embed.{name}.tags"] = layers.Embed( - self.config[EMBEDDING_DIMENSION], - self.config[REGULARIZATION_CONSTANT], - f"tags.{name}", - ) - def _create_bow( self, sequence_features: 
List[Union[tf.Tensor, tf.SparseTensor]], @@ -1406,33 +1390,6 @@ def _calculate_label_loss( text_embed, label_embed, label_ids, all_labels_embed, all_label_ids ) - def _calculate_entity_loss( - self, - inputs: tf.Tensor, - tag_ids: tf.Tensor, - mask: tf.Tensor, - sequence_lengths: tf.Tensor, - tag_name: Text, - entity_tags: Optional[tf.Tensor] = None, - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: - - tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32) - - if entity_tags is not None: - _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags) - inputs = tf.concat([inputs, _tags], axis=-1) - - logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs) - - # should call first to build weights - pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths) - loss = self._tf_layers[f"crf.{tag_name}"].loss( - logits, tag_ids, sequence_lengths - ) - f1 = self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask) - - return loss, f1, logits - def batch_loss( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> tf.Tensor: diff --git a/rasa/shared/core/trackers.py b/rasa/shared/core/trackers.py index c507e5f2c74a..47c1c7744362 100644 --- a/rasa/shared/core/trackers.py +++ b/rasa/shared/core/trackers.py @@ -231,7 +231,7 @@ def _events_for_verbosity( @staticmethod def freeze_current_state(state: State) -> FrozenState: - frozen_state = frozenset( + return frozenset( { key: frozenset(values.items()) if isinstance(values, Dict) @@ -239,7 +239,6 @@ def freeze_current_state(state: State) -> FrozenState: for key, values in state.items() }.items() ) - return frozen_state def past_states(self, domain: Domain) -> List[State]: """Generate the past states of this tracker based on the history. diff --git a/rasa/shared/nlu/constants.py b/rasa/shared/nlu/constants.py index ee85a005f935..53040f0d4c53 100644 --- a/rasa/shared/nlu/constants.py +++ b/rasa/shared/nlu/constants.py @@ -26,6 +26,7 @@ TRAINABLE_EXTRACTORS = {"MitieEntityExtractor", "CRFEntityExtractor", "DIETClassifier"} ENTITIES = "entities" +ENTITY_TAGS = "entity_tags" ENTITY_ATTRIBUTE_TYPE = "entity" ENTITY_ATTRIBUTE_GROUP = "group" ENTITY_ATTRIBUTE_ROLE = "role" diff --git a/rasa/shared/nlu/training_data/features.py b/rasa/shared/nlu/training_data/features.py index c556d6e6c3ff..755215fae35e 100644 --- a/rasa/shared/nlu/training_data/features.py +++ b/rasa/shared/nlu/training_data/features.py @@ -16,21 +16,11 @@ def __init__( attribute: Text, origin: Union[Text, List[Text]], ) -> None: - self._validate_feature_type(feature_type) - self.features = features self.type = feature_type self.origin = origin self.attribute = attribute - @staticmethod - def _validate_feature_type(feature_type: Text) -> None: - if feature_type not in VALID_FEATURE_TYPES: - raise ValueError( - f"Invalid feature type '{feature_type}' used. Valid feature types are: " - f"{VALID_FEATURE_TYPES}." - ) - def is_sparse(self) -> bool: """Checks if features are sparse or not. 
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 06f81775a673..7957e84f8351 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -1,6 +1,7 @@
 # constants for configuration parameters of our tensorflow models

 LABEL = "label"
+IDS = "ids"
 HIDDEN_LAYERS_SIZES = "hidden_layers_sizes"
 SHARE_HIDDEN_LAYERS = "share_hidden_layers"
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 1d64b1b26cb3..a9017094e945 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -136,7 +136,7 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
         if len(inputs.shape) == 3:
             # reshape back
             outputs = tf.reshape(
-                outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1)
+                outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], self.units)
             )

         if self.use_bias:
@@ -630,13 +630,15 @@ def body(idx: tf.Tensor, out: tf.Tensor) -> List[tf.Tensor]:
         # create first random array of indices
         out1 = rand_idxs()  # (1, num_neg)

-        return tf.while_loop(
-            cond,
-            body,
-            loop_vars=[idx1, out1],
-            shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])],
-            parallel_iterations=self.parallel_iterations,
-            back_prop=False,
+        return tf.nest.map_structure(
+            tf.stop_gradient,
+            tf.while_loop(
+                cond,
+                body,
+                loop_vars=[idx1, out1],
+                shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])],
+                parallel_iterations=self.parallel_iterations,
+            ),
         )[1]

     @staticmethod
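The `layers.py` change above swaps the deprecated `back_prop=False` argument for the TF 2.x idiom of mapping `tf.stop_gradient` over the loop outputs. A small standalone sketch of that idiom; the `summed` toy loop is illustrative only:

    import tensorflow as tf

    def summed(n: tf.Tensor) -> tf.Tensor:
        cond = lambda i, acc: i > 0
        body = lambda i, acc: [i - 1, acc + i]
        # gradients are cut by wrapping every loop output in tf.stop_gradient
        return tf.nest.map_structure(
            tf.stop_gradient,
            tf.while_loop(cond, body, loop_vars=[n, tf.constant(0)]),
        )[1]

    print(summed(tf.constant(4)).numpy())  # 10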
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 46cf8fd5bd66..4f5dcab1c010 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -124,7 +124,7 @@ def __setstate__(self, state, **kwargs):
     def _validate_number_of_dimensions(
         number_of_dimensions: int, input_array: np.ndarray
     ) -> None:
-        """Validates if the given number of dimensions maps the with the dimensions of the input array.
+        """Validates if the input array has the given number of dimensions.

         Args:
             number_of_dimensions: number of dimensions
@@ -140,6 +140,10 @@ def _validate_number_of_dimensions(
             if isinstance(_sub_array, scipy.sparse.spmatrix):
                 dim = i
                 break
+            if isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0:
+                # sequence dimension is 0, we are dealing with "fake" features
+                dim = i
+                break

         # If the resulting sub_array is sparse, the remaining number of dimensions
         # should be at least 2
@@ -147,7 +151,15 @@ def _validate_number_of_dimensions(
             if dim > 2:
                 raise ValueError(
                     f"Given number of dimensions '{number_of_dimensions}' does not "
-                    f"match dimensiona of given input array: {input_array}."
+                    f"match dimensions of given input array: {input_array}."
+                )
+        elif isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0:
+            # sequence dimension is 0, we are dealing with "fake" features,
+            # but they should be of dim 2
+            if dim > 2:
+                raise ValueError(
+                    f"Given number of dimensions '{number_of_dimensions}' does not "
+                    f"match dimensions of given input array: {input_array}."
                 )
         # If the resulting sub_array is dense, the sub_array should be a single number
         elif not np.issubdtype(type(_sub_array), np.integer) and not isinstance(
@@ -1122,7 +1134,7 @@ def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray:
         )

         data_padded = np.zeros(
-            [combined_dialogue_len, max_seq_len, number_of_features,],
+            [combined_dialogue_len, max_seq_len, number_of_features],
             dtype=array_of_array_of_dense[0][0].dtype,
         )
@@ -1225,7 +1237,7 @@ def _4d_scipy_matrix_to_values(
         indices = np.hstack(
             [
                 np.vstack(
-                    [sum(dialogue_len[:i]) + j * np.ones_like(x.row), x.row, x.col,]
+                    [sum(dialogue_len[:i]) + j * np.ones_like(x.row), x.row, x.col]
                 )
                 for i, array_of_sparse in enumerate(array_of_array_of_sparse)
                 for j, x in enumerate(array_of_sparse)
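For context on the new branch above: attributes that are absent in a turn are padded with "fake" features whose sequence dimension is 0, so the dimension check has to stop descending as soon as it meets an empty array. A tiny illustration with plain NumPy; the shapes are made up:

    import numpy as np

    real_turn = np.random.rand(3, 10)  # 3 tokens x 10 feature dimensions
    fake_turn = np.zeros((0, 10))      # attribute missing in this turn

    dialogue = np.array([real_turn, fake_turn], dtype=object)
    print(dialogue[1].shape[0] == 0)   # True -- marks a "fake" feature entry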
diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py
index ca3d000b7310..1f556d22914d 100644
--- a/rasa/utils/tensorflow/model_data_utils.py
+++ b/rasa/utils/tensorflow/model_data_utils.py
@@ -166,7 +166,7 @@ def _filter_features(features: Optional[List["Features"]], featurizers: List[Tex
     return [f for f in features if f.origin in featurizers]


-def _create_zero_features(
+def _create_fake_features(
     all_features: List[List[List["Features"]]],
 ) -> List["Features"]:
     """Computes default feature values.
@@ -191,8 +191,8 @@
         )
     )

-    # create zero_features for Nones
-    zero_features = []
+    # create fake_features for Nones
+    fake_features = []
     for _features in example_features:
         new_features = copy.deepcopy(_features)
         if _features.is_dense():
@@ -203,16 +203,16 @@
             new_features.features = scipy.sparse.coo_matrix(
                 (0, _features.features.shape[-1]), _features.features.dtype
             )
-        zero_features.append(new_features)
+        fake_features.append(new_features)

-    return zero_features
+    return fake_features


 def convert_to_data_format(
     features: Union[
         List[List[Dict[Text, List["Features"]]]], List[Dict[Text, List["Features"]]]
     ],
-    zero_features: Optional[Dict[Text, List["Features"]]] = None,
+    fake_features: Optional[Dict[Text, List["Features"]]] = None,
     consider_dialogue_dimension: bool = True,
     featurizers: Optional[List[Text]] = None,
 ) -> Tuple[Data, Optional[Dict[Text, List["Features"]]]]:
@@ -228,7 +228,7 @@
     Args:
         features: a dictionary of attributes to a list of features for all
            examples in the training data
-        zero_features: Contains default feature values for attributes
+        fake_features: Contains default feature values for attributes
         consider_dialogue_dimension: If set to false the dialogue dimension will be
             removed from the resulting sequence features.
         featurizers: the featurizers to consider
@@ -237,9 +237,9 @@
         Input in "Data" format and zero features
     """
     training = False
-    if not zero_features:
+    if not fake_features:
         training = True
-        zero_features = defaultdict(list)
+        fake_features = defaultdict(list)

     # unify format of incoming features
     if isinstance(features[0], Dict):
@@ -254,7 +254,7 @@
     if training:
         attributes = list(attribute_to_features.keys())
     else:
-        attributes = list(zero_features.keys())
+        attributes = list(fake_features.keys())

     # In case an attribute is not present during prediction, replace it with
     # None values that will then be replaced by zero features
@@ -271,14 +271,14 @@
             empty_features,
             attribute_to_features,
             training,
-            zero_features,
+            fake_features,
             consider_dialogue_dimension,
         )

     # ensure that all attributes are in the same order
     attribute_data = OrderedDict(sorted(attribute_data.items()))

-    return attribute_data, zero_features
+    return attribute_data, fake_features


 def _features_for_attribute(
@@ -286,7 +286,7 @@
     empty_features: List[Any],
     attribute_to_features: Dict[Text, List[List[List["Features"]]]],
     training: bool,
-    zero_features: Dict[Text, List["Features"]],
+    fake_features: Dict[Text, List["Features"]],
     consider_dialogue_dimension: bool,
 ) -> Dict[Text, List[FeatureArray]]:
     """Create the features for the given attribute from the all examples features.
@@ -296,9 +296,9 @@
         empty_features: empty features
         attribute_to_features: features for every example
         training: boolean indicating whether we are currently in training or not
-        zero_features: zero features
-        consider_dialogue_dimension: If set to false the dialogue dimension will be removed from the resulting sequence
-            features.
+        fake_features: fake features
+        consider_dialogue_dimension: If set to false the dialogue dimension will be
+            removed from the resulting sequence features.

     Returns:
         A dictionary of feature type to actual features for the given attribute.
@@ -312,10 +312,10 @@ def _features_for_attribute(
     # in case some features for a specific attribute are
     # missing, replace them with a feature vector of zeros
     if training:
-        zero_features[attribute] = _create_zero_features(features)
+        fake_features[attribute] = _create_fake_features(features)

     (attribute_masks, _dense_features, _sparse_features) = _extract_features(
-        features, zero_features[attribute], attribute
+        features, fake_features[attribute], attribute
     )

     sparse_features = {}
@@ -363,7 +363,7 @@ def _features_for_attribute(

 def _extract_features(
     features: List[List[List["Features"]]],
-    zero_features: List["Features"],
+    fake_features: List["Features"],
     attribute: Text,
 ) -> Tuple[
     List[np.ndarray],
@@ -375,7 +375,7 @@
     Args:
         features: all features
-        zero_features: list of zero features
+        fake_features: list of fake features

     Returns:
         - a list of attribute masks
@@ -399,7 +399,7 @@
         if list_of_features is None:
             # use zero features and set mask to zero
             attribute_mask[i] = 0
-            list_of_features = zero_features
+            list_of_features = fake_features

         for features in list_of_features:
             # in case of ENTITIES, if the attribute type matches either 'entity',
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 3708f4a8a99b..50e4903814c0 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -54,6 +54,7 @@
     DENSE_DIMENSION,
     CONCAT_DIMENSION,
     DROP_RATE_ATTENTION,
+    SCALE_LOSS,
 )
 from rasa.utils.tensorflow import layers
 from rasa.utils.tensorflow.transformer import TransformerEncoder
@@ -731,14 +732,16 @@ def _prepare_ffnn_layer(
     def _prepare_transformer_layer(
         self,
         name: Text,
+        num_layers: int,
+        units: int,
         drop_rate: float,
         drop_rate_attention: float,
         prefix: Text = "transformer",
     ):
         if self.config[NUM_TRANSFORMER_LAYERS] > 0:
             self._tf_layers[f"{prefix}.{name}"] = TransformerEncoder(
-                self.config[NUM_TRANSFORMER_LAYERS],
-                self.config[TRANSFORMER_SIZE],
+                num_layers,
+                units,
                 self.config[NUM_HEADS],
                 self.config[TRANSFORMER_SIZE] * 4,
                 self.config[REGULARIZATION_CONSTANT],
@@ -800,7 +803,10 @@ def _prepare_sparse_dense_layers(
         if not dense:
             # create dense labels for the input to use in negative sampling
             self._tf_layers[f"sparse_to_dense_ids.{name}"] = layers.DenseForSparse(
-                units=2, trainable=False, name=f"sparse_to_dense_ids.{name}"
+                units=2,
+                use_bias=False,
+                trainable=False,
+                name=f"sparse_to_dense_ids.{name}",
             )

     def _prepare_input_layers(self, name: Text) -> None:
@@ -833,9 +839,29 @@ def _prepare_input_layers(self, name: Text) -> None:
     def _prepare_sequence_layers(self, name: Text) -> None:
         self._prepare_input_layers(name)
         self._prepare_transformer_layer(
-            name, self.config[DROP_RATE], self.config[DROP_RATE_ATTENTION]
+            name,
+            self.config[NUM_TRANSFORMER_LAYERS],
+            self.config[TRANSFORMER_SIZE],
+            self.config[DROP_RATE],
+            self.config[DROP_RATE_ATTENTION],
         )

+    def _prepare_entity_recognition_layers(self) -> None:
+        for tag_spec in self._entity_tag_specs:
+            name = tag_spec.tag_name
+            num_tags = tag_spec.num_tags
+            self._tf_layers[f"embed.{name}.logits"] = layers.Embed(
+                num_tags, self.config[REGULARIZATION_CONSTANT], f"logits.{name}"
+            )
+            self._tf_layers[f"crf.{name}"] = layers.CRF(
+                num_tags, self.config[REGULARIZATION_CONSTANT], self.config[SCALE_LOSS]
+            )
+            self._tf_layers[f"embed.{name}.tags"] = layers.Embed(
+                self.config[EMBEDDING_DIMENSION],
+                self.config[REGULARIZATION_CONSTANT],
+                f"tags.{name}",
+            )
+
     def _combine_sparse_dense_features(
         self,
         features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]],
@@ -948,6 +974,7 @@ def _features_as_seq_ids(
         self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
     ) -> Optional[tf.Tensor]:
         """Creates dense labels for negative sampling."""
+        # if there are dense features, we can use them
         for f in features:
             if not isinstance(f, tf.SparseTensor):
@@ -1064,6 +1091,33 @@ def _get_batch_dim(attribute_data: Dict[Text, List[tf.Tensor]]) -> int:

         return tf.shape(attribute_data[SENTENCE][0])[0]

+    def _calculate_entity_loss(
+        self,
+        inputs: tf.Tensor,
+        tag_ids: tf.Tensor,
+        mask: tf.Tensor,
+        sequence_lengths: tf.Tensor,
+        tag_name: Text,
+        entity_tags: Optional[tf.Tensor] = None,
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+
+        tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32)
+
+        if entity_tags is not None:
+            _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags)
+            inputs = tf.concat([inputs, _tags], axis=-1)
+
+        logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs)
+
+        # has to be called first in order to build the weights
+        pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths)
+        loss = self._tf_layers[f"crf.{tag_name}"].loss(
+            logits, tag_ids, sequence_lengths
+        )
+        f1 = self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask)
+
+        return loss, f1, logits
+
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
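The `_calculate_entity_loss` helper moved here is shared by DIET and TED: token-level inputs are projected to per-tag logits which a CRF layer decodes and scores. A rough, hedged sketch of that data flow in plain TensorFlow, using a dense projection and greedy decoding as stand-ins for `layers.Embed` and `layers.CRF`; all tensor shapes are made up:

    import tensorflow as tf

    num_tags = 4
    text_transformed = tf.random.uniform((2, 7, 16))  # batch x tokens x units
    tag_ids = tf.random.uniform((2, 7), maxval=num_tags, dtype=tf.int32)

    logits = tf.keras.layers.Dense(num_tags)(text_transformed)   # per-token tag logits
    pred_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)  # CRF decode stand-in
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tag_ids, logits=logits)
    )
    print(pred_ids.shape, float(loss))  # (2, 7) and a scalar loss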
diff --git a/tests/core/featurizers/test_single_state_featurizers.py b/tests/core/featurizers/test_single_state_featurizers.py
index 7ff3502b13c2..2f5819e8e659 100644
--- a/tests/core/featurizers/test_single_state_featurizers.py
+++ b/tests/core/featurizers/test_single_state_featurizers.py
@@ -15,6 +15,11 @@
     INTENT,
     FEATURE_TYPE_SEQUENCE,
     FEATURE_TYPE_SENTENCE,
+    ENTITY_ATTRIBUTE_TYPE,
+    ENTITY_ATTRIBUTE_VALUE,
+    ENTITY_ATTRIBUTE_START,
+    ENTITY_ATTRIBUTE_END,
+    ENTITY_TAGS,
 )
 from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS
 from rasa.shared.nlu.interpreter import RegexInterpreter
@@ -182,38 +187,41 @@ def test_single_state_featurizer_with_entity_roles_and_groups(
     from rasa.core.agent import Agent

     interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
-
+    # TODO roles and groups are not supported in e2e yet
+    domain = Domain(
+        intents=[],
+        entities=["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
+        slots=[],
+        templates={},
+        forms={},
+        action_names=[],
+    )
     f = SingleStateFeaturizer()
-    f._default_feature_states[INTENT] = {"a": 0, "b": 1}
-    f._default_feature_states[ENTITIES] = {
-        "c": 0,
-        "d": 1,
-        f"d{ENTITY_LABEL_SEPARATOR}e": 2,
-    }
-    f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2}
-    f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
-    f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
-    encoded = f.encode_state(
+    f.prepare_from_domain(domain)
+    encoded = f.encode_entities(
         {
-            "user": {
-                "text": "a ball",
-                "intent": "b",
-                "entities": ["c", f"d{ENTITY_LABEL_SEPARATOR}e"],
-            },
-            "prev_action": {
-                "action_name": "action_listen",
-                "action_text": "throw a ball",
-            },
-            "active_loop": {"name": "k"},
-            "slots": {"e": (1.0,)},
+            TEXT: "I am flying from London to Paris",
+            ENTITIES: [
+                {
+                    ENTITY_ATTRIBUTE_TYPE: "city",
+                    ENTITY_ATTRIBUTE_VALUE: "London",
+                    ENTITY_ATTRIBUTE_START: 17,
+                    ENTITY_ATTRIBUTE_END: 23,
+                },
+                {
+                    ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
+                    ENTITY_ATTRIBUTE_VALUE: "Paris",
+                    ENTITY_ATTRIBUTE_START: 27,
+                    ENTITY_ATTRIBUTE_END: 32,
+                },
+            ],
         },
         interpreter=interpreter,
     )
-    # check all the features are encoded and *_text features are encoded by a densefeaturizer
-    assert sorted(list(encoded.keys())) == sorted(
-        [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT]
+
+    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
+    assert np.all(
+        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
     )
-    assert np.all(encoded[ENTITIES][0].features.toarray() == [1, 0, 1])


 def test_single_state_featurizer_uses_dtype_float():
@@ -241,21 +249,39 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen(
     interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter

     f = SingleStateFeaturizer()
-    f._default_feature_states[INTENT] = {"a": 0, "b": 1}
-    f._default_feature_states[ENTITIES] = {"c": 0}
-    f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2}
-    f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
-    f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
-
+    f._default_feature_states[INTENT] = {"greet": 0, "inform": 1}
+    f._default_feature_states[ENTITIES] = {
+        "city": 0,
+        "name": 1,
+        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
+        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
+    }
+    f._default_feature_states[ACTION_NAME] = {
+        "utter_ask_where_to": 0,
+        "utter_greet": 1,
+        "action_listen": 2,
+    }
+    # `_0` in slots represents the feature dimension
+    f._default_feature_states[SLOTS] = {"slot_1_0": 0, "slot_2_0": 1, "slot_3_0": 2}
+    f._default_feature_states[ACTIVE_LOOP] = {
+        "active_loop_1": 0,
+        "active_loop_2": 1,
+        "active_loop_3": 2,
+        "active_loop_4": 3,
+    }
     encoded = f.encode_state(
         {
-            "user": {"text": "a ball", "intent": "b", "entities": ["c"]},
+            "user": {
+                "text": "I am flying from London to Paris",
+                "intent": "inform",
+                "entities": ["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
+            },
             "prev_action": {
                 "action_name": "action_listen",
                 "action_text": "throw a ball",
             },
-            "active_loop": {"name": "k"},
-            "slots": {"e": (1.0,)},
+            "active_loop": {"name": "active_loop_4"},
+            "slots": {"slot_1": (1.0,)},
         },
         interpreter=interpreter,
     )
@@ -271,7 +297,7 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen(
     assert (
         encoded[ACTION_NAME][0].features != scipy.sparse.coo_matrix([[0, 0, 1]])
     ).nnz == 0
-    assert encoded[ENTITIES][0].features.shape[-1] == 1
+    assert encoded[ENTITIES][0].features.shape[-1] == 4
     assert (encoded[SLOTS][0].features != scipy.sparse.coo_matrix([[1, 0, 0]])).nnz == 0
     assert (
         encoded[ACTIVE_LOOP][0].features != scipy.sparse.coo_matrix([[0, 0, 0, 1]])
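A hand-worked check of the `encode_entities` assertion in the test above, assuming whitespace tokenization of the example sentence and a tag-id mapping where the no-entity tag is 0, `"city"` is 1, and the `"city"` label combined with the `to` role is 2:

    tokens = "I am flying from London to Paris".split()
    tag_id_for_token = {"London": 1, "Paris": 2}  # city -> 1, city-with-role-to -> 2
    tags = [[tag_id_for_token.get(token, 0)] for token in tokens]
    print(tags)  # [[0], [0], [0], [0], [1], [0], [2]] -- matches the assertion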
diff --git a/tests/core/featurizers/test_tracker_featurizer.py b/tests/core/featurizers/test_tracker_featurizer.py
index 98f323bd3279..f6b904d8397b 100644
--- a/tests/core/featurizers/test_tracker_featurizer.py
+++ b/tests/core/featurizers/test_tracker_featurizer.py
@@ -67,7 +67,7 @@ def test_featurize_trackers_with_full_dialogue_tracker_featurizer(
     tracker = tracker_from_dialogue_file(
         "data/test_dialogues/moodbot.json", moodbot_domain
     )
-    state_features, labels = tracker_featurizer.featurize_trackers(
+    state_features, labels, entity_tags = tracker_featurizer.featurize_trackers(
         [tracker], moodbot_domain, RegexInterpreter()
     )
@@ -75,6 +75,8 @@
     assert len(state_features) > 0
     assert labels is not None
     assert len(labels) > 0
+    # moodbot doesn't contain e2e entities
+    assert not any([any(turn_tags) for turn_tags in entity_tags])


 def test_featurize_trackers_with_max_history_tracker_featurizer(moodbot_domain: Domain):
@@ -84,7 +86,7 @@ def test_featurize_trackers_with_max_history_tracker_featurizer(moodbot_domain:
     tracker = tracker_from_dialogue_file(
         "data/test_dialogues/moodbot.json", moodbot_domain
     )
-    state_features, labels = tracker_featurizer.featurize_trackers(
+    state_features, labels, entity_tags = tracker_featurizer.featurize_trackers(
         [tracker], moodbot_domain, RegexInterpreter()
     )
@@ -92,3 +94,5 @@
     assert len(state_features) > 0
     assert labels is not None
     assert len(labels) > 0
+    # moodbot doesn't contain e2e entities
+    assert not any([any(turn_tags) for turn_tags in entity_tags])
diff --git a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py
index fa746263b082..4e48ea67d793 100644
--- a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py
+++ b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py
@@ -108,8 +108,6 @@ def test_yaml_writer_dumps_user_messages():
     - story: default
       steps:
       - intent: greet
-        user: |-
-          Hello
       - action: utter_greet
     """
@@ -139,10 +137,10 @@ def test_yaml_writer_avoids_dumping_not_existing_user_messages():

 @pytest.mark.parametrize(
-    "input_yaml_file", ["data/test_yaml_stories/rules_with_stories_sorted.yaml",],
+    "input_yaml_file", ["data/test_yaml_stories/rules_with_stories_sorted.yaml"]
 )
 def test_yaml_writer_dumps_rules(
-    input_yaml_file: Text, tmpdir: Path, default_domain: Domain,
+    input_yaml_file: Text, tmpdir: Path, default_domain: Domain
 ):
     original_yaml_reader = YAMLStoryReader(default_domain, None, False)
     original_yaml_story_steps = original_yaml_reader.read_from_file(input_yaml_file)
diff --git a/tests/test_test.py b/tests/test_test.py
index b279fee01231..8bbc45bececb 100644
--- a/tests/test_test.py
+++ b/tests/test_test.py
@@ -197,8 +197,6 @@ def test_write_classification_errors():
     - story: default
       steps:
       - intent: greet  # predicted: goodbye: Hello
-        user: |-
-          Hello
       - action: utter_greet  # predicted: utter_goodbye
     """
diff --git a/tests/utils/tensorflow/test_model_data_utils.py b/tests/utils/tensorflow/test_model_data_utils.py
index f495222958df..2dab29353f3a 100644
--- a/tests/utils/tensorflow/test_model_data_utils.py
+++ b/tests/utils/tensorflow/test_model_data_utils.py
@@ -30,7 +30,7 @@
 shape = 100


-def test_create_zero_features():
+def test_create_fake_features():
     # DENSE FEATURES
     dense_feature_sentence_features = Features(
         features=np.random.rand(shape),
@@ -40,10 +40,10 @@
     )
     features = [[None, None, [dense_feature_sentence_features]]]

-    zero_features = model_data_utils._create_zero_features(features)
-    assert len(zero_features) == 1
-    assert zero_features[0].is_dense()
-    assert zero_features[0].features.shape == (0, shape)
+    fake_features = model_data_utils._create_fake_features(features)
+    assert len(fake_features) == 1
+    assert fake_features[0].is_dense()
+    assert fake_features[0].features.shape == (0, shape)

     # SPARSE FEATURES
     sparse_feature_sentence_features = Features(
@@ -53,11 +53,11 @@
     )
     features = [[None, None, [sparse_feature_sentence_features]]]

-    zero_features = model_data_utils._create_zero_features(features)
-    assert len(zero_features) == 1
-    assert zero_features[0].is_sparse()
-    assert zero_features[0].features.shape == (0, shape)
-    assert zero_features[0].features.nnz == 0
+    fake_features = model_data_utils._create_fake_features(features)
+    assert len(fake_features) == 1
+    assert fake_features[0].is_sparse()
+    assert fake_features[0].features.shape == (0, shape)
+    assert fake_features[0].features.nnz == 0


 def test_surface_attributes():
@@ -142,18 +142,18 @@ def test_surface_attributes():


 def test_extract_features():
-    zero_features = np.zeros(shape)
-    zero_features_as_features = Features(
-        features=zero_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
+    fake_features = np.zeros(shape)
+    fake_features_as_features = Features(
+        features=fake_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
     )
     # create zero features
-    zero_features_list = [zero_features_as_features]
+    fake_features_list = [fake_features_as_features]

     # create tracker state features by setting a random index in the array to 1
     random_inds = np.random.randint(shape, size=6)
     list_of_features = []
     for idx in random_inds:
-        current_features = copy.deepcopy(zero_features_as_features)
+        current_features = copy.deepcopy(fake_features_as_features)
         current_features.features[idx] = 1
         list_of_features.append([current_features])
@@ -168,11 +168,11 @@
         attribute_masks,
         dense_features,
         sparse_features,
-    ) = model_data_utils._extract_features(tracker_features, zero_features_list, INTENT)
+    ) = model_data_utils._extract_features(tracker_features, fake_features_list, INTENT)

     expected_mask = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
     assert np.all(np.squeeze(np.array(attribute_masks), 2) == expected_mask)
-    assert np.array(dense_features[SENTENCE]).shape[-1] == zero_features.shape[-1]
+    assert np.array(dense_features[SENTENCE]).shape[-1] == fake_features.shape[-1]
     assert sparse_features == {}
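The `expected_mask` at the end of this test encodes which turns carried real features: a 1 where features were present and a 0 where a missing entry was padded with fake features. A toy illustration of that semantics; the nested list below is made up and is not the test's actual input:

    import numpy as np

    dialogues = [
        ["real", None, "real"],
        [None, None, "real"],
        ["real", "real", "real"],
    ]
    mask = np.array([[0 if turn is None else 1 for turn in d] for d in dialogues])
    print(mask)  # [[1 0 1] [0 0 1] [1 1 1]] -- same pattern as expected_mask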