From 1e6fa3fca46da01a7d6292e6e4f794dfdd070612 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov
Date: Mon, 9 Nov 2020 15:58:11 +0100
Subject: [PATCH] create empty fakes (#7198)

* substitute fake features with empty arrays and use attribute mask to rebuild input
* remove unused import, remove comment
* refactor, add comments, add types
* support empty features
* add prepare_for_predict to precalculate self.all_labels_embed
* return to default config
* add error
* add prepare_for_predict to diet
* fix test_model_data_utils
* fix test gen_batch
* Update rasa/core/policies/ted_policy.py

Co-authored-by: Tanja

* rename to filter fakes and create dial len beforehand
* add dtype=
* fix comment
* add comments about fake features

Co-authored-by: Tanja
---
 examples/e2ebot/domain.yml                    |   2 +-
 rasa/core/policies/ted_policy.py              | 179 +++++++++++++-----
 rasa/nlu/classifiers/diet_classifier.py       |   9 +-
 rasa/shared/core/trackers.py                  |   2 -
 rasa/utils/tensorflow/model_data.py           |  65 +++++--
 rasa/utils/tensorflow/model_data_utils.py     |  30 ++-
 rasa/utils/tensorflow/models.py               |  10 +
 tests/core/test_policies.py                   | 115 +++++++----
 .../utils/tensorflow/test_model_data_utils.py |   7 +-
 9 files changed, 287 insertions(+), 132 deletions(-)

diff --git a/examples/e2ebot/domain.yml b/examples/e2ebot/domain.yml
index c15888fdeb38..17b3faba2a75 100644
--- a/examples/e2ebot/domain.yml
+++ b/examples/e2ebot/domain.yml
@@ -8,4 +8,4 @@ actions:
 
 intents:
   - greet
-  - mood_great
\ No newline at end of file
+  - mood_great
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 3eaa9399bdb5..98f9c8cd3215 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -4,7 +4,6 @@
 from collections import defaultdict
 
 import numpy as np
-from tensorflow import RaggedTensorSpec
 
 import rasa.shared.utils.io
 import tensorflow as tf
@@ -105,7 +104,7 @@
 LABEL_SUB_KEY = "ids"
 LENGTH = "length"
 SENTENCE_FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT]
-SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT]
+SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT, f"{LABEL}_{ACTION_TEXT}"]
 LABEL_FEATURES_TO_ENCODE = [f"{LABEL}_{ACTION_NAME}", f"{LABEL}_{ACTION_TEXT}"]
 STATE_LEVEL_FEATURES = [ENTITIES, SLOTS, ACTIVE_LOOP]
 
@@ -141,19 +140,19 @@ class TEDPolicy(Policy):
         # and labels.
         # The number of hidden layers is equal to the length of the corresponding
         # list.
-        HIDDEN_LAYERS_SIZES: {TEXT: [], ACTION_TEXT: []},
+        HIDDEN_LAYERS_SIZES: {TEXT: [], ACTION_TEXT: [], f"{LABEL}_{ACTION_TEXT}": []},
         DENSE_DIMENSION: {
             TEXT: 128,
             ACTION_TEXT: 128,
-            ENTITIES: 128,
-            SLOTS: 128,
-            ACTIVE_LOOP: 128,
-            f"{LABEL}_{ACTION_TEXT}": 20,
+            f"{LABEL}_{ACTION_TEXT}": 128,
             INTENT: 20,
             ACTION_NAME: 20,
             f"{LABEL}_{ACTION_NAME}": 20,
+            ENTITIES: 20,
+            SLOTS: 20,
+            ACTIVE_LOOP: 20,
         },
-        CONCAT_DIMENSION: {TEXT: 128, ACTION_TEXT: 128},
+        CONCAT_DIMENSION: {TEXT: 128, ACTION_TEXT: 128, f"{LABEL}_{ACTION_TEXT}": 128},
         ENCODING_DIMENSION: 50,
         # Number of units in transformer
         TRANSFORMER_SIZE: 128,
@@ -310,6 +309,12 @@ def _create_label_data(
 
         label_data = RasaModelData()
         label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
+        label_data.add_lengths(
+            f"{LABEL}_{ACTION_TEXT}",
+            SEQUENCE_LENGTH,
+            f"{LABEL}_{ACTION_TEXT}",
+            SEQUENCE,
+        )
 
         label_ids = np.arange(domain.num_actions)
         label_data.add_features(
@@ -685,6 +690,8 @@ def _prepare_layers(self) -> None:
 
         for name in self.label_signature.keys():
             self._prepare_sparse_dense_layer_for(name, self.label_signature)
+            if name in SEQUENCE_FEATURES_TO_ENCODE:
+                self._prepare_sequence_layers(name)
             self._prepare_encoding_layers(name)
 
         self._prepare_transformer_layer(
@@ -754,9 +761,9 @@ def _prepare_encoding_layers(self, name: Text) -> None:
 
     def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0]
-
+        # labels cannot have all of their features "fake"
         all_labels_encoded = {
-            key: self._encode_features_per_attribute(self.tf_label_data, key)
+            key: self._encode_real_features_per_attribute(self.tf_label_data, key)
            for key in self.tf_label_data.keys()
            if key != LABEL_KEY
        }
@@ -808,7 +815,54 @@ def _emebed_dialogue(
 
     def _encode_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
-    ) -> Optional[tf.Tensor]:
+    ) -> tf.Tensor:
+        # The input is a 3d representation of a 4d tensor of
+        # shape (batch-size x dialogue-len x sequence-len x units), flattened to
+        # (sum of dialogue history lengths over all tensors in the batch x
+        # max sequence length x number of features).
+
+        # However, some dialogue turns contain nonexistent state features,
+        # e.g. `intent` and `text` features are mutually exclusive,
+        # as are `action_name` and `action_text`,
+        # and some dialogue turns don't contain any `slots`.
+        # In order to create full 4d tensors, we created "fake" zero features
+        # for these nonexistent state features and filtered them out during
+        # batch generation. Therefore, the first dimension differs between attributes.
+        # It can also happen that a batch doesn't contain "real" features at all,
+        # e.g. a large number of stories don't contain any `slots`;
+        # in that case the actual input tensors will be empty.
+        # Since we need actual numbers to create dialogue turn features, we create
+        # zero tensors in `_encode_fake_features_per_attribute` for these attributes.
+        return tf.cond(
+            tf.shape(tf_batch_data[attribute][SENTENCE][0])[0] > 0,
+            lambda: self._encode_real_features_per_attribute(tf_batch_data, attribute),
+            lambda: self._encode_fake_features_per_attribute(tf_batch_data, attribute),
+        )
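The `tf.cond` above is the crux of the change: whether a batch contains any "real" features for an attribute is only known at runtime, so the graph has to carry both the real and the fake branch. A minimal standalone sketch of the same dispatch pattern (toy shapes and a stand-in encoder, not Rasa's actual API):

    import tensorflow as tf

    def encode_attribute(sentence_features: tf.Tensor, attribute_mask: tf.Tensor, units: int) -> tf.Tensor:
        # attribute_mask: (batch x dialogue_len x 1); sentence_features may be empty
        batch_dim = tf.shape(attribute_mask)[0]
        dialogue_dim = tf.shape(attribute_mask)[1]
        return tf.cond(
            tf.shape(sentence_features)[0] > 0,              # any "real" features in this batch?
            lambda: tf.ones((batch_dim, dialogue_dim, units)),   # stand-in for the real encoder
            lambda: tf.zeros((batch_dim, dialogue_dim, units)),  # "fake" branch: zeros only
        )

Both branches must produce tensors of the same rank and dtype, which is why the fake branch below computes the correct `units` instead of just returning an empty tensor.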
+
+    def _encode_fake_features_per_attribute(
+        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
+    ) -> tf.Tensor:
+        attribute_features_list = tf_batch_data[attribute][SENTENCE]
+        attribute_mask = tf_batch_data[attribute][MASK][0]
+
+        batch_dim = tf.shape(attribute_mask)[0]
+        dialogue_dim = tf.shape(attribute_mask)[1]
+
+        if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE):
+            units = self.config[ENCODING_DIMENSION]
+        else:
+            units = 0
+            for f in attribute_features_list:
+                if isinstance(f, tf.SparseTensor):
+                    units += self.config[DENSE_DIMENSION][attribute]
+                else:
+                    units += f.shape[-1]
+
+        return tf.zeros((batch_dim, dialogue_dim, units), dtype=tf.float32)
+
+    def _encode_real_features_per_attribute(
+        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
+    ) -> tf.Tensor:
         """Encodes features for a given attribute.
 
         Args:
@@ -819,13 +873,17 @@ def _encode_features_per_attribute(
             tf_batch_data: dictionary mapping every attribute to its features and masks
             attribute: the attribute we will encode features for (e.g., ACTION_NAME, INTENT)
 
         Returns:
             A tensor combining all features for `attribute`
         """
-        attribute_mask = tf_batch_data[attribute][MASK][0]
-
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
+            # sequence_lengths contains `0` for "fake" features, while
+            # tf_batch_data[attribute] contains only "real" features
+            _sequence_lengths = tf_batch_data[attribute][SEQUENCE_LENGTH][0]
+            # extract only nonzero lengths and cast to int
             _sequence_lengths = tf.cast(
-                tf_batch_data[attribute][SEQUENCE_LENGTH][0], dtype=tf.int32
+                tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32
             )
-            _sequence_lengths = tf.squeeze(_sequence_lengths, axis=-1)
+            # boolean mask returns a flat tensor
+            _sequence_lengths = tf.expand_dims(_sequence_lengths, axis=-1)
+
             mask_sequence_text = tf.squeeze(
                 self._compute_mask(_sequence_lengths), axis=1
             )
@@ -859,41 +917,38 @@ def _encode_features_per_attribute(
             # resulting attribute features will have shape
             # combined batch dimension and dialogue length x 1 x units
             attribute_features = self._combine_sparse_dense_features(
-                tf_batch_data[attribute][SENTENCE],
-                f"{attribute}_{SENTENCE}",
-                mask=attribute_mask,
+                tf_batch_data[attribute][SENTENCE], f"{attribute}_{SENTENCE}",
             )
 
-        if attribute in set(
-            SENTENCE_FEATURES_TO_ENCODE
-            + SEQUENCE_FEATURES_TO_ENCODE
-            + LABEL_FEATURES_TO_ENCODE
-        ):
+        if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE):
             attribute_features = self._tf_layers[f"ffnn.{attribute}"](
                 attribute_features
             )
 
-        attribute_features = attribute_features * attribute_mask
+        # attribute_mask has shape batch x dialogue_len x 1
+        attribute_mask = tf_batch_data[attribute][MASK][0]
 
-        if attribute in set(
-            SENTENCE_FEATURES_TO_ENCODE
-            + SEQUENCE_FEATURES_TO_ENCODE
-            + STATE_LEVEL_FEATURES
-        ):
-            # attribute features have shape
-            # combined batch dimension and dialogue length x 1 x units
-            # convert them back to their original shape of
-            # batch size x dialogue length x units
-            attribute_features = self._convert_to_original_shape(
-                attribute_features, tf_batch_data
+        if attribute in set(SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES):
+            dialogue_lengths = tf.cast(
+                tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
             )
-
-        return attribute_features
+        else:
+            # for labels, the dialogue length is a fake dimension equal to 1
+            dialogue_lengths = tf.ones((tf.shape(attribute_mask)[0],), dtype=tf.int32)
+
+        # attribute features have shape
+        # (combined batch dimension and dialogue length x 1 x units);
+        # convert them back to their original shape of
+        # batch size x dialogue length x units
+        return self._convert_to_original_shape(
+            attribute_features, attribute_mask, dialogue_lengths
+        )
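One detail worth calling out in `_encode_real_features_per_attribute`: `tf.boolean_mask` is used to drop the zero lengths that belong to "fake" turns, and it always returns a flat tensor, hence the `expand_dims` afterwards. A self-contained sketch (the boolean cast is made explicit here, whereas the patch passes the lengths themselves as the mask):

    import tensorflow as tf

    lengths = tf.constant([3, 0, 5, 0])                   # 0 marks a "fake" turn
    real = tf.boolean_mask(lengths, lengths > 0)          # -> [3, 5], flat
    real = tf.expand_dims(real, axis=-1)                  # -> [[3], [5]]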
 
     @staticmethod
     def _convert_to_original_shape(
         attribute_features: tf.Tensor,
-        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        attribute_mask: tf.Tensor,
+        dialogue_lengths: tf.Tensor,
     ) -> tf.Tensor:
         """Transform attribute features back to original shape.
@@ -901,29 +956,34 @@ def _convert_to_original_shape(
         Original shape: batch x dialogue length x units
 
         Args:
-            attribute_features: the features to convert
-            tf_batch_data: the batch data
+            attribute_features: the "real" features to convert
+            attribute_mask: the tensor containing the position of "real" features
+                in the dialogue, shape is (batch-size x dialogue_len x 1)
+            dialogue_lengths: the tensor containing the actual dialogue length,
+                shape is (batch-size,)
 
         Returns:
             The converted attribute features
         """
-        # dialogue lengths contains the actual dialogue length
-        # shape is batch-size x 1
-        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-
         # in order to convert the attribute features with shape
         # combined batch-size and dialogue length x 1 x units
-        # to a shape of
-        # batch-size x dialogue length x units
+        # to a shape of batch-size x dialogue length x units
         # we use tf.scatter_nd. Therefore, we need the target shape and the indices
         # mapping the values of attribute features to the position in the resulting
         # tensor.
-        batch_dim = tf.size(dialogue_lengths)
-        dialogue_dim = tf.reduce_max(dialogue_lengths)
+        batch_dim = tf.shape(attribute_mask)[0]
+        dialogue_dim = tf.shape(attribute_mask)[1]
         units = attribute_features.shape[-1]
 
-        batch_indices = tf.repeat(tf.range(batch_dim), dialogue_lengths)
+        # attribute_mask has shape (batch x dialogue_len x 1), remove the last dimension
+        attribute_mask = tf.cast(tf.squeeze(attribute_mask, axis=-1), dtype=tf.int32)
+        # the sum of the attribute mask is the number of dialogue turns
+        # with "real" features
+        non_fake_dialogue_lengths = tf.reduce_sum(attribute_mask, axis=-1)
+
+        batch_indices = tf.repeat(tf.range(batch_dim), non_fake_dialogue_lengths)
+
         dialogue_indices = (
             tf.map_fn(
                 tf.range,
                 dialogue_lengths,
@@ -931,6 +991,17 @@ def _convert_to_original_shape(
                 fn_output_signature=tf.RaggedTensorSpec(shape=[None], dtype=tf.int32),
             )
         ).values
+
+        # attribute_mask now has shape (batch x dialogue_len), while
+        # dialogue_indices has shape (combined_dialogue_len,);
+        # in order to find the positions of real input we need to flatten
+        # the attribute mask to (combined_dialogue_len,)
+        dialogue_indices_mask = tf.boolean_mask(
+            attribute_mask, tf.sequence_mask(dialogue_lengths, dtype=tf.int32)
+        )
+        # pick only those indices that contain "real" input
+        dialogue_indices = tf.boolean_mask(dialogue_indices, dialogue_indices_mask)
+
         indices = tf.stack([batch_indices, dialogue_indices], axis=1)
 
         shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
@@ -1041,6 +1112,9 @@ def batch_loss(
 
         return loss
 
+    def prepare_for_predict(self) -> None:
+        _, self.all_labels_embed = self._create_all_labels_embed()
+
     def batch_predict(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> Dict[Text, tf.Tensor]:
@@ -1052,13 +1126,16 @@ def batch_predict(
         Returns:
             The output to predict.
         """
+        if self.all_labels_embed is None:
+            raise ValueError(
+                "The model was not prepared for prediction. "
+                "Call `prepare_for_predict` first."
+            )
+
         tf_batch_data = self.batch_to_model_data_format(
             batch_in, self.predict_data_signature
         )
 
-        if self.all_labels_embed is None:
-            _, self.all_labels_embed = self._create_all_labels_embed()
-
         dialogue_in = self._process_batch_data(tf_batch_data)
         dialogue_embed, dialogue_mask = self._emebed_dialogue(
             dialogue_in, tf_batch_data
         )
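To make `_convert_to_original_shape` concrete, here is a standalone sketch of the same scatter step. It builds the (batch, turn) indices with `tf.where` on the mask instead of the `repeat`/`map_fn` construction in the patch, but the effect is identical: encoded "real" turns are scattered back into a zero tensor, and turns that only had fake features stay zero.

    import tensorflow as tf

    attribute_mask = tf.constant([[1, 0, 1], [1, 1, 0]])        # batch x dialogue_len
    real_features = tf.constant([[0.1], [0.2], [0.3], [0.4]])   # one row per "real" turn, units=1

    positions = tf.where(attribute_mask > 0)                    # (batch, turn) pairs, row-major
    restored = tf.scatter_nd(positions, real_features, shape=(2, 3, 1))
    # restored[0, 1] and restored[1, 2] remain zero: those turns had only fake features

The row-major order of `tf.where` matches the order in which the real turns were packed into the combined batch-and-dialogue dimension, so no extra sorting is needed.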
" + "Call `prepare_for_predict` first." + ) + tf_batch_data = self.batch_to_model_data_format( batch_in, self.predict_data_signature ) - if self.all_labels_embed is None: - _, self.all_labels_embed = self._create_all_labels_embed() - dialogue_in = self._process_batch_data(tf_batch_data) dialogue_embed, dialogue_mask = self._emebed_dialogue( dialogue_in, tf_batch_data diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index fe0890c8a66c..09e6ea59b653 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1580,6 +1580,10 @@ def _update_entity_metrics(self, loss: tf.Tensor, f1: tf.Tensor, tag_name: Text) self.entity_role_loss.update_state(loss) self.entity_role_f1.update_state(f1) + def prepare_for_predict(self) -> None: + if self.config[INTENT_CLASSIFICATION]: + _, self.all_labels_embed = self._create_all_labels() + def batch_predict( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> Dict[Text, tf.Tensor]: @@ -1665,7 +1669,10 @@ def _batch_predict_intents( ) -> Dict[Text, tf.Tensor]: if self.all_labels_embed is None: - _, self.all_labels_embed = self._create_all_labels() + raise ValueError( + "The model was not prepared for prediction. " + "Call `prepare_for_predict` first." + ) # get sentence feature vector for intent classification sentence_vector = self._last_token(text_transformed, sequence_lengths) diff --git a/rasa/shared/core/trackers.py b/rasa/shared/core/trackers.py index 99bc485c631f..c507e5f2c74a 100644 --- a/rasa/shared/core/trackers.py +++ b/rasa/shared/core/trackers.py @@ -455,8 +455,6 @@ def applied_events(self) -> List[Event]: ) if event.use_text_for_featurization is None: event.use_text_for_featurization = use_text_for_featurization - elif event.use_text_for_featurization != use_text_for_featurization: - logger.debug("Got contradicting user featurization info.") applied_events.append(event) else: diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py index 6d0c1bca2a92..46cf8fd5bd66 100644 --- a/rasa/utils/tensorflow/model_data.py +++ b/rasa/utils/tensorflow/model_data.py @@ -1039,6 +1039,23 @@ def _create_label_ids(label_ids: FeatureArray) -> np.ndarray: raise ValueError("Unsupported label_ids dimensions") + @staticmethod + def _filter_out_fake_inputs( + array_of_array_of_features: FeatureArray, + ) -> Union[List[List[np.ndarray]], List[List[scipy.sparse.spmatrix]]]: + return list( + filter( + # filter empty lists created by another filter + lambda x: len(x) > 0, + [ + # filter all the "fake" inputs, we know the input is "fake", + # when sequence dimension is `0` + list(filter(lambda x: x.shape[0] > 0, array_of_features)) + for array_of_features in array_of_array_of_features + ], + ) + ) + @staticmethod def _pad_dense_data(array_of_dense: FeatureArray) -> np.ndarray: """Pad data of different lengths. @@ -1082,6 +1099,17 @@ def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray: # the original shape and the original dialogue length is passed on to the model # it can be used to transform the 3D tensor back into 4D + # in order to create 4d tensor inputs, we created "fake" zero features + # for nonexistent inputs. To save calculation we filter this features before + # input to tf methods. 
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 6d0c1bca2a92..46cf8fd5bd66 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -1039,6 +1039,23 @@ def _create_label_ids(label_ids: FeatureArray) -> np.ndarray:
 
         raise ValueError("Unsupported label_ids dimensions")
 
+    @staticmethod
+    def _filter_out_fake_inputs(
+        array_of_array_of_features: FeatureArray,
+    ) -> Union[List[List[np.ndarray]], List[List[scipy.sparse.spmatrix]]]:
+        return list(
+            filter(
+                # filter out empty lists created by the inner filter
+                lambda x: len(x) > 0,
+                [
+                    # filter out all the "fake" inputs; an input is "fake"
+                    # when its sequence dimension is `0`
+                    list(filter(lambda x: x.shape[0] > 0, array_of_features))
+                    for array_of_features in array_of_array_of_features
+                ],
+            )
+        )
+
     @staticmethod
     def _pad_dense_data(array_of_dense: FeatureArray) -> np.ndarray:
         """Pad data of different lengths.
@@ -1082,6 +1099,17 @@ def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray:
 
         # the original shape and the original dialogue length is passed on to the model
         # it can be used to transform the 3D tensor back into 4D
+        # in order to create 4d tensor inputs, we created "fake" zero features
+        # for nonexistent inputs. To save computation, we filter these features
+        # out before passing the input to tf methods.
+        number_of_features = array_of_array_of_dense[0][0].shape[-1]
+        array_of_array_of_dense = RasaModelData._filter_out_fake_inputs(
+            array_of_array_of_dense
+        )
+        if not array_of_array_of_dense:
+            # return an empty 3d array with an appropriate last dimension
+            return np.zeros((0, 0, number_of_features), dtype=np.float32)
+
         combined_dialogue_len = sum(
             len(array_of_dense) for array_of_dense in array_of_array_of_dense
         )
@@ -1094,11 +1122,7 @@ def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray:
         )
 
         data_padded = np.zeros(
-            [
-                combined_dialogue_len,
-                max_seq_len,
-                array_of_array_of_dense[0][0].shape[-1],
-            ],
+            [combined_dialogue_len, max_seq_len, number_of_features,],
             dtype=array_of_array_of_dense[0][0].dtype,
         )
 
@@ -1163,6 +1187,21 @@ def _4d_scipy_matrix_to_values(
         # the original shape and the original dialogue length is passed on to the model
         # it can be used to transform the 3D tensor back into 4D
 
+        # in order to create 4d tensor inputs, we created "fake" zero features
+        # for nonexistent inputs. To save computation, we filter these features
+        # out before passing the input to tf methods.
+        number_of_features = array_of_array_of_sparse[0][0].shape[-1]
+        array_of_array_of_sparse = RasaModelData._filter_out_fake_inputs(
+            array_of_array_of_sparse
+        )
+        if not array_of_array_of_sparse:
+            # create an empty array with an appropriate last dimension
+            return [
+                np.empty((0, 3), dtype=np.int64),
+                np.array([], dtype=np.float32),
+                np.array([0, 0, number_of_features], dtype=np.int64),
+            ]
+
         # we need to make sure that the matrices are coo_matrices otherwise the
         # transformation does not work (e.g. you cannot access x.row, x.col)
         if not isinstance(array_of_array_of_sparse[0][0], scipy.sparse.coo_matrix):
@@ -1171,9 +1210,10 @@ def _4d_scipy_matrix_to_values(
                 for array_of_sparse in array_of_array_of_sparse
             ]
 
-        combined_dialogue_len = sum(
+        dialogue_len = [
             len(array_of_sparse) for array_of_sparse in array_of_array_of_sparse
-        )
+        ]
+        combined_dialogue_len = sum(dialogue_len)
         max_seq_len = max(
             [
                 x.shape[0]
@@ -1185,15 +1225,7 @@ def _4d_scipy_matrix_to_values(
         indices = np.hstack(
             [
                 np.vstack(
-                    [
-                        sum(
-                            len(array_of_sparse)
-                            for array_of_sparse in array_of_array_of_sparse[:i]
-                        )
-                        + j * np.ones_like(x.row),
-                        x.row,
-                        x.col,
-                    ]
+                    [sum(dialogue_len[:i]) + j * np.ones_like(x.row), x.row, x.col,]
                 )
                 for i, array_of_sparse in enumerate(array_of_array_of_sparse)
                 for j, x in enumerate(array_of_sparse)
             ]
         )
@@ -1208,7 +1240,6 @@ def _4d_scipy_matrix_to_values(
             ]
         )
 
-        number_of_features = array_of_array_of_sparse[0][0].shape[-1]
         shape = np.array((combined_dialogue_len, max_seq_len, number_of_features))
 
         return [
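The two early returns above are easiest to see with concrete arrays. A "fake" feature is now recognizable by its empty first (sequence) dimension, and when a whole attribute has nothing real left, the padding helpers return placeholders that still carry the feature dimension. A sketch mirroring `_filter_out_fake_inputs` and the empty-case returns (array sizes are illustrative):

    import numpy as np

    real = np.ones((4, 10), dtype=np.float32)   # 4 tokens x 10 features
    fake = np.zeros((0, 10), dtype=np.float32)  # "fake": no tokens, same feature dim

    dialogues = [[real, fake], [fake]]
    filtered = [[f for f in dialogue if f.shape[0] > 0] for dialogue in dialogues]
    filtered = [dialogue for dialogue in filtered if dialogue]  # drop now-empty dialogues
    assert len(filtered) == 1 and filtered[0][0].shape == (4, 10)

    # all-fake case: dense and sparse placeholders keep the last dimension
    dense_placeholder = np.zeros((0, 0, 10), dtype=np.float32)
    sparse_placeholder = [
        np.empty((0, 3), dtype=np.int64),      # no (turn, row, col) indices
        np.array([], dtype=np.float32),        # no values
        np.array([0, 0, 10], dtype=np.int64),  # dense shape, feature dim preserved
    ]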
             np.array([v[0] for v in values]), number_of_dimensions=3
         )
 
-    if consider_dialogue_dimension:
-        attribute_to_feature_arrays = {
-            MASK: [FeatureArray(np.array(attribute_masks), number_of_dimensions=4)]
-        }
-    else:
-        attribute_to_feature_arrays = {
-            MASK: [
-                FeatureArray(
-                    np.array(np.squeeze(attribute_masks, -1)), number_of_dimensions=3
-                )
-            ]
-        }
+    attribute_to_feature_arrays = {
+        MASK: [FeatureArray(np.array(attribute_masks), number_of_dimensions=3)]
+    }
 
     feature_types = set()
     feature_types.update(list(dense_features.keys()))
@@ -433,9 +426,10 @@ def _extract_features(
         for key, value in dialogue_dense_features.items():
             dense_features[key].append(value)
 
-        # add additional dimensions to attribute mask to get a 3D vector
-        # resulting shape dialogue length x 1 x 1
-        attribute_mask = np.expand_dims(np.expand_dims(attribute_mask, -1), -1)
+        # add an additional dimension to the attribute mask
+        # to get a vector of shape (dialogue length x 1);
+        # the batch dim will be added later
+        attribute_mask = np.expand_dims(attribute_mask, -1)
         attribute_masks.append(attribute_mask)
 
     return attribute_masks, dense_features, sparse_features
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index af78b765fa76..3708f4a8a99b 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -153,6 +153,15 @@ def batch_loss(
         """
         raise NotImplementedError
 
+    def prepare_for_predict(self) -> None:
+        """Prepares the tf graph for prediction.
+
+        This method should contain the necessary tf calculations
+        and set the instance variables that are used in `batch_predict`,
+        for example the pre-calculation of `self.all_labels_embed`.
+        """
+        pass
+
     def batch_predict(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> Dict[Text, tf.Tensor]:
@@ -311,6 +320,7 @@ def build_for_predict(
         self, predict_data: RasaModelData, eager: bool = False
     ) -> None:
         self._training = False  # needed for tf graph mode
+        self.prepare_for_predict()
         self._predict_function = self._get_tf_call_model_function(
             predict_data.as_tf_dataset, self.batch_predict, eager, "prediction"
         )
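The `_create_zero_features` change above is what the adjusted tests below assert: a zero feature used to be a zero-filled array with the real feature's full shape, and is now an empty array that keeps only the feature dimension. In numbers (a sketch; `14` is an arbitrary feature count):

    import numpy as np
    import scipy.sparse

    units = 14
    dense_zero = np.zeros((0, units), dtype=np.float32)
    sparse_zero = scipy.sparse.coo_matrix((0, units), dtype=np.float32)

    assert dense_zero.shape == (0, units)
    assert sparse_zero.shape == (0, units)
    assert sparse_zero.nnz == 0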
diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py
index 9ef6239529b6..c54f9e16314c 100644
--- a/tests/core/test_policies.py
+++ b/tests/core/test_policies.py
@@ -360,21 +360,21 @@ async def test_gen_batch(self, trained_policy, default_domain):
         (
             batch_label_ids,
             batch_action_name_mask,
-            batch_action_name_sentence_1,
-            batch_action_name_sentence_2,
-            batch_action_name_sentence_3,
+            batch_action_name_sentence_indices,
+            batch_action_name_sentence_data,
+            batch_action_name_sentence_shape,
             batch_entities_mask,
-            batch_entities_sentence_1,
-            batch_entities_sentence_2,
-            batch_entities_sentence_3,
+            batch_entities_sentence_indices,
+            batch_entities_sentence_data,
+            batch_entities_sentence_shape,
             batch_intent_mask,
-            batch_intent_sentence_1,
-            batch_intent_sentence_2,
-            batch_intent_sentence_3,
+            batch_intent_sentence_indices,
+            batch_intent_sentence_data,
+            batch_intent_sentence_shape,
             batch_slots_mask,
-            batch_slots_sentence_1,
-            batch_slots_sentence_2,
-            batch_slots_sentence_3,
+            batch_slots_sentence_indices,
+            batch_slots_sentence_data,
+            batch_slots_sentence_shape,
             batch_dialogue_length,
         ) = next(model_data._gen_batch(batch_size=batch_size))
 
@@ -382,55 +382,73 @@ async def test_gen_batch(self, trained_policy, default_domain):
         assert (
             batch_label_ids.shape[0] == batch_size
             and batch_dialogue_length.shape[0] == batch_size
         )
-        # batch and dialogue dimensions are combined
-        first_dimension_size = batch_size if self.max_history <= 1 else batch_size + 1
+        # batch and dialogue dimensions are NOT combined for masks
         assert (
-            batch_slots_mask.shape[0] == first_dimension_size
-            and batch_intent_mask.shape[0] == first_dimension_size
-            and batch_entities_mask.shape[0] == first_dimension_size
-            and batch_action_name_mask.shape[0] == first_dimension_size
+            batch_slots_mask.shape[0] == batch_size
+            and batch_intent_mask.shape[0] == batch_size
+            and batch_entities_mask.shape[0] == batch_size
+            and batch_action_name_mask.shape[0] == batch_size
+        )
+        # some features might be "fake", so their sequence length is `0`
+        seq_len = max(
+            [
+                batch_intent_sentence_shape[1],
+                batch_action_name_sentence_shape[1],
+                batch_entities_sentence_shape[1],
+                batch_slots_sentence_shape[1],
+            ]
+        )
+        assert (
+            batch_intent_sentence_shape[1] == seq_len
+            or batch_intent_sentence_shape[1] == 0
+        )
+        assert (
+            batch_action_name_sentence_shape[1] == seq_len
+            or batch_action_name_sentence_shape[1] == 0
+        )
+        assert (
+            batch_entities_sentence_shape[1] == seq_len
+            or batch_entities_sentence_shape[1] == 0
         )
         assert (
-            batch_intent_sentence_3[1]
-            == batch_action_name_sentence_3[1]
-            == batch_entities_sentence_3[1]
-            == batch_slots_sentence_3[1]
+            batch_slots_sentence_shape[1] == seq_len
+            or batch_slots_sentence_shape[1] == 0
         )
 
         (
             batch_label_ids,
+            batch_action_name_mask,
+            batch_action_name_sentence_indices,
+            batch_action_name_sentence_data,
+            batch_action_name_sentence_shape,
             batch_entities_mask,
-            batch_entities_sentence_1,
-            batch_entities_sentence_2,
-            batch_entities_sentence_3,
+            batch_entities_sentence_indices,
+            batch_entities_sentence_data,
+            batch_entities_sentence_shape,
             batch_intent_mask,
-            batch_intent_sentence_1,
-            batch_intent_sentence_2,
-            batch_intent_sentence_3,
+            batch_intent_sentence_indices,
+            batch_intent_sentence_data,
+            batch_intent_sentence_shape,
             batch_slots_mask,
-            batch_slots_sentence_1,
-            batch_slots_sentence_2,
-            batch_slots_sentence_3,
-            batch_action_name_mask,
-            batch_action_name_sentence_1,
-            batch_action_name_sentence_2,
-            batch_action_name_sentence_3,
+            batch_slots_sentence_indices,
+            batch_slots_sentence_data,
+            batch_slots_sentence_shape,
             batch_dialogue_length,
         ) = next(
             model_data._gen_batch(
                 batch_size=batch_size, batch_strategy="balanced", shuffle=True
             )
         )
 
         assert (
             batch_label_ids.shape[0] == batch_size
             and batch_dialogue_length.shape[0] == batch_size
         )
+        # some features might be "fake", so their sequence length is `0`
+        seq_len = max(
+            [
+                batch_intent_sentence_shape[1],
+                batch_action_name_sentence_shape[1],
+                batch_entities_sentence_shape[1],
+                batch_slots_sentence_shape[1],
+            ]
+        )
+        assert (
+            batch_intent_sentence_shape[1] == seq_len
+            or batch_intent_sentence_shape[1] == 0
+        )
+        assert (
+            batch_action_name_sentence_shape[1] == seq_len
+            or batch_action_name_sentence_shape[1] == 0
+        )
+        assert (
+            batch_entities_sentence_shape[1] == seq_len
+            or batch_entities_sentence_shape[1] == 0
+        )
         assert (
-            batch_intent_sentence_3[1]
-            == batch_action_name_sentence_3[1]
-            == batch_entities_sentence_3[1]
-            == batch_slots_sentence_3[1]
+            batch_slots_sentence_shape[1] == seq_len
+            or batch_slots_sentence_shape[1] == 0
         )
diff --git a/tests/utils/tensorflow/test_model_data_utils.py b/tests/utils/tensorflow/test_model_data_utils.py
index 11bc5723d528..f495222958df 100644
--- a/tests/utils/tensorflow/test_model_data_utils.py
+++ b/tests/utils/tensorflow/test_model_data_utils.py
@@ -43,7 +43,7 @@ def test_create_zero_features():
     zero_features = model_data_utils._create_zero_features(features)
     assert len(zero_features) == 1
     assert zero_features[0].is_dense()
-    assert (zero_features[0].features == np.zeros(shape)).all()
+    assert zero_features[0].features.shape == (0, shape)
 
     # SPARSE FEATURES
     sparse_feature_sentence_features = Features(
@@ -56,7 +56,8 @@ def test_create_zero_features():
     zero_features = model_data_utils._create_zero_features(features)
     assert len(zero_features) == 1
     assert zero_features[0].is_sparse()
-    assert (zero_features[0].features != scipy.sparse.coo_matrix((1, shape))).nnz == 0
+    assert zero_features[0].features.shape == (0, shape)
+    assert zero_features[0].features.nnz == 0
 
 
 def test_surface_attributes():
@@ -168,7 +169,7 @@ def test_extract_features():
         dense_features,
         sparse_features,
     ) = model_data_utils._extract_features(tracker_features, zero_features_list, INTENT)
-    expected_mask = np.array([[[1], [0], [1]], [[0], [0], [1]], [[1], [1], [1]]])
+    expected_mask = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
 
     assert np.all(np.squeeze(np.array(attribute_masks), 2) == expected_mask)
     assert np.array(dense_features[SENTENCE]).shape[-1] == zero_features.shape[-1]