From 8279a174cdd8644417db7f2caf7d3535c8712bbf Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Tue, 20 Oct 2020 14:58:51 +0200
Subject: [PATCH 01/62] Move DIET sequence helpers into models.py and use them in TED

---
 examples/e2ebot/config.yml              |   7 +-
 rasa/core/policies/rule_policy.py       |   2 +-
 rasa/core/policies/ted_policy.py        |  46 ++++++-
 rasa/nlu/classifiers/diet_classifier.py | 159 +----------------------
 rasa/utils/tensorflow/constants.py      |   1 +
 rasa/utils/tensorflow/model_data.py     |   1 +
 rasa/utils/tensorflow/models.py         | 165 +++++++++++++++++++++++-
 7 files changed, 211 insertions(+), 170 deletions(-)

diff --git a/examples/e2ebot/config.yml b/examples/e2ebot/config.yml
index 0ce241f491f5..e2cbcd0cb5af 100644
--- a/examples/e2ebot/config.yml
+++ b/examples/e2ebot/config.yml
@@ -10,10 +10,7 @@ pipeline:
     min_ngram: 1
     max_ngram: 4
   - name: DIETClassifier
-    epochs: 100
+    epochs: 1
 policies:
 - name: TEDPolicy
-  epochs: 100
-  batch_size:
-  - 32
-  - 64
+  epochs: 200
diff --git a/rasa/core/policies/rule_policy.py b/rasa/core/policies/rule_policy.py
index 0da4a4a6d90f..face3ced59ec 100644
--- a/rasa/core/policies/rule_policy.py
+++ b/rasa/core/policies/rule_policy.py
@@ -770,7 +770,7 @@ def predict_action_probabilities(
         if default_action_name and not rules_action_name_from_text:
             return self._prediction_result(default_action_name, tracker, domain), False
 
-        # A loop has priority over any other rule.
+        # A loop has priority over any other rule except defaults.
         # The rules or any other prediction will be applied only if a loop was rejected.
         # If we are in a loop, and the loop didn't run previously or rejected, we can
         # simply force predict the loop.
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 4cb64a0dcc8f..a6f1c796fc89 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -71,9 +71,13 @@
     ENCODING_DIMENSION,
     UNIDIRECTIONAL_ENCODER,
     SEQUENCE,
+    SEQUENCE_LENGTH,
     SENTENCE,
     DENSE_DIMENSION,
     E2E_CONFIDENCE_THRESHOLD,
+    SPARSE_INPUT_DROPOUT,
+    DENSE_INPUT_DROPOUT,
+    MASKED_LM,
 )
 
 
@@ -89,6 +93,7 @@
 LENGTH = "length"
 POSSIBLE_FEATURE_TYPES = [SEQUENCE, SENTENCE]
 FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT]
+SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT]
 LABEL_FEATURES_TO_ENCODE = [f"{LABEL}_{ACTION_NAME}", f"{LABEL}_{ACTION_TEXT}"]
 STATE_LEVEL_FEATURES = [ENTITIES, SLOTS, ACTIVE_LOOP]
 
@@ -190,6 +195,13 @@ class TEDPolicy(Policy):
         DROP_RATE_ATTENTION: 0,
         # Sparsity of the weights in dense layers
         WEIGHT_SPARSITY: 0.8,
+        # If 'True' apply dropout to sparse input tensors
+        SPARSE_INPUT_DROPOUT: True,
+        # If 'True' apply dropout to dense input tensors
+        DENSE_INPUT_DROPOUT: True,
+        # If 'True' random tokens of the input message will be masked and the model
+        # should predict those tokens.
+        MASKED_LM: False,
         # ## Evaluation parameters
         # How often calculate validation accuracy.
         # Small values may hurt performance, e.g. model accuracy.
@@ -320,6 +332,8 @@ def _create_model_data(
         model_data.add_lengths(
             DIALOGUE, LENGTH, next(iter(list(attribute_data.keys()))), MASK
         )
+        model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
+        model_data.add_lengths(ACTION_TEXT, SEQUENCE_LENGTH, ACTION_TEXT, SEQUENCE)
 
         return model_data
 
@@ -623,7 +637,10 @@ def _check_data(self) -> None:
     def _prepare_layers(self) -> None:
         for name in self.data_signature.keys():
             self._prepare_sparse_dense_layer_for(name, self.data_signature)
-            self._prepare_encoding_layers(name)
+            if name in SEQUENCE_FEATURES_TO_ENCODE:
+                self._prepare_sequence_layers(name)
+            else:
+                self._prepare_encoding_layers(name)
 
         for name in self.label_signature.keys():
             self._prepare_sparse_dense_layer_for(name, self.label_signature)
@@ -756,11 +773,30 @@ def _encode_features_per_attribute(
             A tensor combining  all features for `attribute`
         """
 
-        if not tf_batch_data[attribute]:
-            return None
-
         attribute_mask = tf_batch_data[attribute][MASK][0]
-        # TODO transformer has to be used to process sequence features
+
+        if attribute in SEQUENCE_FEATURES_TO_ENCODE:
+            batch_dim = self._get_batch_dim(tf_batch_data)
+            mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT, SEQUENCE_LENGTH)
+            sequence_lengths = self._get_sequence_lengths(
+                tf_batch_data, TEXT, SEQUENCE_LENGTH, batch_dim
+            )
+            mask_text = self._compute_mask(sequence_lengths)
+
+            attribute_features, _, _, _ = self._create_sequence(
+                tf_batch_data[TEXT][SEQUENCE],
+                tf_batch_data[TEXT][SENTENCE],
+                mask_sequence_text,
+                mask_text,
+                attribute,
+                sparse_dropout=self.config[SPARSE_INPUT_DROPOUT],
+                dense_dropout=self.config[DENSE_INPUT_DROPOUT],
+                masked_lm_loss=self.config[MASKED_LM],
+                sequence_ids=True,
+            )
+            # TODO entities
+            return self._last_token(attribute_features, sequence_lengths) * attribute_mask
+
         attribute_features = self._combine_sparse_dense_features(
             tf_batch_data[attribute][SENTENCE],
             f"{attribute}_{SENTENCE}",
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index f476d4917351..736de4d71eb0 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -7,7 +7,6 @@
 import os
 import scipy.sparse
 import tensorflow as tf
-import tensorflow_addons as tfa
 
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, Type, NamedTuple
 
@@ -90,6 +89,7 @@
     FEATURIZERS,
     CHECKPOINT_MODEL,
     SEQUENCE,
+    SEQUENCE_LENGTH,
     SENTENCE,
     DENSE_DIMENSION,
 )
@@ -100,7 +100,6 @@
 
 SPARSE = "sparse"
 DENSE = "dense"
-SEQUENCE_LENGTH = f"{SEQUENCE}_lengths"
 LABEL_KEY = LABEL
 LABEL_SUB_KEY = "ids"
 TAG_IDS = "tag_ids"
@@ -1282,39 +1281,6 @@ def _prepare_layers(self) -> None:
         if self.config[ENTITY_RECOGNITION]:
             self._prepare_entity_recognition_layers()
 
-    def _prepare_input_layers(self, name: Text) -> None:
-        self._prepare_ffnn_layer(
-            name, self.config[HIDDEN_LAYERS_SIZES][name], self.config[DROP_RATE]
-        )
-
-        for feature_type in [SENTENCE, SEQUENCE]:
-            if (
-                name not in self.data_signature
-                or feature_type not in self.data_signature[name]
-            ):
-                continue
-
-            self._prepare_sparse_dense_dropout_layers(
-                f"{name}_{feature_type}", self.config[DROP_RATE]
-            )
-            self._prepare_sparse_dense_layers(
-                self.data_signature[name][feature_type],
-                f"{name}_{feature_type}",
-                self.config[DENSE_DIMENSION][name],
-            )
-            self._prepare_ffnn_layer(
-                f"{name}_{feature_type}",
-                [self.config[CONCAT_DIMENSION][name]],
-                self.config[DROP_RATE],
-                prefix="concat_layer",
-            )
-
-    def _prepare_sequence_layers(self, name: Text) -> None:
-        self._prepare_input_layers(name)
-        self._prepare_transformer_layer(
-            name, self.config[DROP_RATE], self.config[DROP_RATE_ATTENTION]
-        )
-
     def _prepare_mask_lm_layers(self, name: Text) -> None:
         self._tf_layers[f"{name}_input_mask"] = layers.InputMask()
 
@@ -1347,67 +1313,6 @@ def _prepare_entity_recognition_layers(self) -> None:
                 f"tags.{name}",
             )
 
-    def _features_as_seq_ids(
-        self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
-    ) -> Optional[tf.Tensor]:
-        """Creates dense labels for negative sampling."""
-
-        # if there are dense features - we can use them
-        for f in features:
-            if not isinstance(f, tf.SparseTensor):
-                seq_ids = tf.stop_gradient(f)
-                # add a zero to the seq dimension for the sentence features
-                seq_ids = tf.pad(seq_ids, [[0, 0], [0, 1], [0, 0]])
-                return seq_ids
-
-        # use additional sparse to dense layer
-        for f in features:
-            if isinstance(f, tf.SparseTensor):
-                seq_ids = tf.stop_gradient(
-                    self._tf_layers[f"sparse_to_dense_ids.{name}"](f)
-                )
-                # add a zero to the seq dimension for the sentence features
-                seq_ids = tf.pad(seq_ids, [[0, 0], [0, 1], [0, 0]])
-                return seq_ids
-
-        return None
-
-    def _combine_sequence_sentence_features(
-        self,
-        sequence_features: List[Union[tf.Tensor, tf.SparseTensor]],
-        sentence_features: List[Union[tf.Tensor, tf.SparseTensor]],
-        mask_sequence: tf.Tensor,
-        mask_text: tf.Tensor,
-        name: Text,
-        sparse_dropout: bool = False,
-        dense_dropout: bool = False,
-    ) -> tf.Tensor:
-        sequence_x = self._combine_sparse_dense_features(
-            sequence_features,
-            f"{name}_{SEQUENCE}",
-            mask_sequence,
-            sparse_dropout,
-            dense_dropout,
-        )
-        sentence_x = self._combine_sparse_dense_features(
-            sentence_features, f"{name}_{SENTENCE}", None, sparse_dropout, dense_dropout
-        )
-
-        if sequence_x is not None and sentence_x is None:
-            return sequence_x
-
-        if sequence_x is None and sentence_x is not None:
-            return sentence_x
-
-        if sequence_x is not None and sentence_x is not None:
-            return self._concat_sequence_sentence_features(
-                sequence_x, sentence_x, name, mask_text
-            )
-
-        raise ValueError(
-            "No features are present. Please check your configuration file."
-        )
-
     def _concat_sequence_sentence_features(
         self,
         sequence_x: tf.Tensor,
@@ -1464,52 +1369,6 @@ def _create_bow(
         x = tf.reduce_sum(x, axis=1)  # convert to bag-of-words
         return self._tf_layers[f"ffnn.{name}"](x, self._training)
 
-    def _create_sequence(
-        self,
-        sequence_features: List[Union[tf.Tensor, tf.SparseTensor]],
-        sentence_features: List[Union[tf.Tensor, tf.SparseTensor]],
-        mask_sequence: tf.Tensor,
-        mask: tf.Tensor,
-        name: Text,
-        sparse_dropout: bool = False,
-        dense_dropout: bool = False,
-        masked_lm_loss: bool = False,
-        sequence_ids: bool = False,
-    ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]:
-        if sequence_ids:
-            seq_ids = self._features_as_seq_ids(sequence_features, f"{name}_{SEQUENCE}")
-        else:
-            seq_ids = None
-
-        inputs = self._combine_sequence_sentence_features(
-            sequence_features,
-            sentence_features,
-            mask_sequence,
-            mask,
-            name,
-            sparse_dropout,
-            dense_dropout,
-        )
-        inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training)
-
-        if masked_lm_loss:
-            transformer_inputs, lm_mask_bool = self._tf_layers[f"{name}_input_mask"](
-                inputs, mask, self._training
-            )
-        else:
-            transformer_inputs = inputs
-            lm_mask_bool = None
-
-        outputs = self._tf_layers[f"transformer.{name}"](
-            transformer_inputs, 1 - mask, self._training
-        )
-
-        if self.config[NUM_TRANSFORMER_LAYERS] > 0:
-            # apply activation
-            outputs = tfa.activations.gelu(outputs)
-
-        return outputs, inputs, seq_ids, lm_mask_bool
-
     def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0]
 
@@ -1598,22 +1457,6 @@ def _calculate_entity_loss(
 
         return loss, f1, logits
 
-    @staticmethod
-    def _get_sequence_lengths(
-        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
-        key: Text,
-        sub_key: Text,
-        batch_dim: int = 1,
-    ) -> tf.Tensor:
-        # sentence features have a sequence lengths of 1
-        # if sequence features are present we add the sequence lengths of those
-
-        sequence_lengths = tf.ones([batch_dim], dtype=tf.int32)
-        if key in tf_batch_data and sub_key in tf_batch_data[key]:
-            sequence_lengths += tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32)
-
-        return sequence_lengths
-
     @staticmethod
     def _get_batch_dim(tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]) -> int:
         if TEXT in tf_batch_data and SEQUENCE in tf_batch_data[TEXT]:
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index f214319e7e3a..1bd4322b7c91 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -67,6 +67,7 @@
 BALANCED = "balanced"
 
 SEQUENCE = "sequence"
+SEQUENCE_LENGTH = f"{SEQUENCE}_lengths"
 SENTENCE = "sentence"
 
 POOLING = "pooling"
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 083db97d167e..3d011eef1806 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -455,6 +455,7 @@ def add_lengths(
         self.data[key][sub_key] = []
 
         for data in self.data[from_key][from_sub_key]:
+            print(data)
             if len(data) > 0:
                 lengths = np.array([x.shape[0] for x in data])
                 self.data[key][sub_key].extend(
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index d26e77efe5ef..d860cfe97851 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -1,6 +1,7 @@
 import datetime
 
 import tensorflow as tf
+import tensorflow_addons as tfa
 import numpy as np
 import logging
 import os
@@ -26,6 +27,7 @@
 from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature
 from rasa.utils.tensorflow.constants import (
     SEQUENCE,
+    SENTENCE,
     TENSORBOARD_LOG_LEVEL,
     RANDOM_SEED,
     TENSORBOARD_LOG_DIR,
@@ -47,6 +49,11 @@
     MAX_NEG_SIM,
     USE_MAX_NEG_SIM,
     NEGATIVE_MARGIN_SCALE,
+    HIDDEN_LAYERS_SIZES,
+    DROP_RATE,
+    DENSE_DIMENSION,
+    CONCAT_DIMENSION,
+    DROP_RATE_ATTENTION,
 )
 from rasa.utils.tensorflow import layers
 from rasa.utils.tensorflow.transformer import TransformerEncoder
@@ -153,7 +160,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = False,
+        eager: bool = True,
     ) -> None:
         """Fit model data"""
 
@@ -767,6 +774,39 @@ def _prepare_sparse_dense_layers(
                     units=2, trainable=False, name=f"sparse_to_dense_ids.{name}"
                 )
 
+    def _prepare_input_layers(self, name: Text) -> None:
+        self._prepare_ffnn_layer(
+            name, self.config[HIDDEN_LAYERS_SIZES][name], self.config[DROP_RATE]
+        )
+
+        for feature_type in [SENTENCE, SEQUENCE]:
+            if (
+                name not in self.data_signature
+                or feature_type not in self.data_signature[name]
+            ):
+                continue
+
+            self._prepare_sparse_dense_dropout_layers(
+                f"{name}_{feature_type}", self.config[DROP_RATE]
+            )
+            self._prepare_sparse_dense_layers(
+                self.data_signature[name][feature_type],
+                f"{name}_{feature_type}",
+                self.config[DENSE_DIMENSION][name],
+            )
+            self._prepare_ffnn_layer(
+                f"{name}_{feature_type}",
+                [self.config[CONCAT_DIMENSION][name]],
+                self.config[DROP_RATE],
+                prefix="concat_layer",
+            )
+
+    def _prepare_sequence_layers(self, name: Text) -> None:
+        self._prepare_input_layers(name)
+        self._prepare_transformer_layer(
+            name, self.config[DROP_RATE], self.config[DROP_RATE_ATTENTION]
+        )
+
     def _combine_sparse_dense_features(
         self,
         features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]],
@@ -806,6 +846,113 @@ def _combine_sparse_dense_features(
 
         return tf.concat(dense_features, axis=-1) * mask
 
+    def _combine_sequence_sentence_features(
+        self,
+        sequence_features: List[Union[tf.Tensor, tf.SparseTensor]],
+        sentence_features: List[Union[tf.Tensor, tf.SparseTensor]],
+        mask_sequence: tf.Tensor,
+        mask_text: tf.Tensor,
+        name: Text,
+        sparse_dropout: bool = False,
+        dense_dropout: bool = False,
+    ) -> tf.Tensor:
+        sequence_x = self._combine_sparse_dense_features(
+            sequence_features,
+            f"{name}_{SEQUENCE}",
+            mask_sequence,
+            sparse_dropout,
+            dense_dropout,
+        )
+        sentence_x = self._combine_sparse_dense_features(
+            sentence_features, f"{name}_{SENTENCE}", None, sparse_dropout, dense_dropout
+        )
+
+        if sequence_x is not None and sentence_x is None:
+            return sequence_x
+
+        if sequence_x is None and sentence_x is not None:
+            return sentence_x
+
+        if sequence_x is not None and sentence_x is not None:
+            return self._concat_sequence_sentence_features(
+                sequence_x, sentence_x, name, mask_text
+            )
+
+        raise ValueError(
+            "No features are present. Please check your configuration file."
+        )
+
+    def _features_as_seq_ids(
+        self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
+    ) -> Optional[tf.Tensor]:
+        """Creates dense labels for negative sampling."""
+
+        # if there are dense features - we can use them
+        for f in features:
+            if not isinstance(f, tf.SparseTensor):
+                seq_ids = tf.stop_gradient(f)
+                # add a zero to the seq dimension for the sentence features
+                seq_ids = tf.pad(seq_ids, [[0, 0], [0, 1], [0, 0]])
+                return seq_ids
+
+        # use additional sparse to dense layer
+        for f in features:
+            if isinstance(f, tf.SparseTensor):
+                seq_ids = tf.stop_gradient(
+                    self._tf_layers[f"sparse_to_dense_ids.{name}"](f)
+                )
+                # add a zero to the seq dimension for the sentence features
+                seq_ids = tf.pad(seq_ids, [[0, 0], [0, 1], [0, 0]])
+                return seq_ids
+
+        return None
+
+    def _create_sequence(
+        self,
+        sequence_features: List[Union[tf.Tensor, tf.SparseTensor]],
+        sentence_features: List[Union[tf.Tensor, tf.SparseTensor]],
+        mask_sequence: tf.Tensor,
+        mask: tf.Tensor,
+        name: Text,
+        sparse_dropout: bool = False,
+        dense_dropout: bool = False,
+        masked_lm_loss: bool = False,
+        sequence_ids: bool = False,
+    ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]:
+        if sequence_ids:
+            seq_ids = self._features_as_seq_ids(sequence_features, f"{name}_{SEQUENCE}")
+        else:
+            seq_ids = None
+
+        inputs = self._combine_sequence_sentence_features(
+            sequence_features,
+            sentence_features,
+            mask_sequence,
+            mask,
+            name,
+            sparse_dropout,
+            dense_dropout,
+        )
+        inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training)
+
+        if masked_lm_loss:
+            transformer_inputs, lm_mask_bool = self._tf_layers[f"{name}_input_mask"](
+                inputs, mask, self._training
+            )
+        else:
+            transformer_inputs = inputs
+            lm_mask_bool = None
+
+        outputs = self._tf_layers[f"transformer.{name}"](
+            transformer_inputs, 1 - mask, self._training
+        )
+
+        if self.config[NUM_TRANSFORMER_LAYERS] > 0:
+            # apply activation
+            outputs = tfa.activations.gelu(outputs)
+
+        return outputs, inputs, seq_ids, lm_mask_bool
+
     @staticmethod
     def _compute_mask(sequence_lengths: tf.Tensor) -> tf.Tensor:
         mask = tf.sequence_mask(sequence_lengths, dtype=tf.float32)
@@ -833,6 +980,22 @@ def _get_mask_for(
         sequence_lengths = tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32)
         return self._compute_mask(sequence_lengths)
 
+    @staticmethod
+    def _get_sequence_lengths(
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        key: Text,
+        sub_key: Text,
+        batch_dim: int = 1,
+    ) -> tf.Tensor:
+        # sentence features have a sequence lengths of 1
+        # if sequence features are present we add the sequence lengths of those
+
+        sequence_lengths = tf.ones([batch_dim], dtype=tf.int32)
+        if key in tf_batch_data and sub_key in tf_batch_data[key]:
+            sequence_lengths += tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32)
+
+        return sequence_lengths
+
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:

From e4f795a50ffddde961dd306f1290ff1b2a155b1b Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 22 Oct 2020 15:42:19 +0200
Subject: [PATCH 02/62] reshape 4d tensors into 3d and back

---
 examples/e2ebot/config.yml              |  2 +-
 rasa/core/policies/ted_policy.py        | 83 ++++++++++++++++++-------
 rasa/nlu/classifiers/diet_classifier.py | 49 +++------------
 rasa/utils/tensorflow/model_data.py     | 34 ++++++----
 rasa/utils/tensorflow/models.py         | 44 ++++++++++++-
 5 files changed, 133 insertions(+), 79 deletions(-)

diff --git a/examples/e2ebot/config.yml b/examples/e2ebot/config.yml
index e2cbcd0cb5af..f38558adb0ad 100644
--- a/examples/e2ebot/config.yml
+++ b/examples/e2ebot/config.yml
@@ -10,7 +10,7 @@ pipeline:
     min_ngram: 1
     max_ngram: 4
   - name: DIETClassifier
-    epochs: 1
+    epochs: 200
 policies:
 - name: TEDPolicy
   epochs: 200
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 85fa1d8cb112..6ec2591ea79a 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -74,11 +74,13 @@
     SEQUENCE_LENGTH,
     SENTENCE,
     DENSE_DIMENSION,
+    CONCAT_DIMENSION,
     E2E_CONFIDENCE_THRESHOLD,
     SPARSE_INPUT_DROPOUT,
     DENSE_INPUT_DROPOUT,
     MASKED_LM,
     MASK,
+    HIDDEN_LAYERS_SIZES,
 )
 
 
@@ -125,7 +127,20 @@ class TEDPolicy(Policy):
         # The number of hidden layers is equal to the length of the corresponding
         # list.
         # TODO add 2 parallel NNs: transformer for text and ffnn for names
-        DENSE_DIMENSION: 20,
+        # Hidden layer sizes for layers before the embedding layers for user message
+        # and labels.
+        # The number of hidden layers is equal to the length of the corresponding
+        # list.
+        HIDDEN_LAYERS_SIZES: {TEXT: [], ACTION_TEXT: []},
+        DENSE_DIMENSION: {
+            TEXT: 128,
+            ACTION_TEXT: 128,
+            f"{LABEL}_{ACTION_TEXT}": 20,
+            INTENT: 20,
+            ACTION_NAME: 20,
+            f"{LABEL}_{ACTION_NAME}": 20,
+        },
+        CONCAT_DIMENSION: {TEXT: 128, ACTION_TEXT: 128},
         ENCODING_DIMENSION: 50,
         # Number of units in transformer
         TRANSFORMER_SIZE: 128,
@@ -639,8 +654,7 @@ def _prepare_layers(self) -> None:
             self._prepare_sparse_dense_layer_for(name, self.data_signature)
             if name in SEQUENCE_FEATURES_TO_ENCODE:
                 self._prepare_sequence_layers(name)
-            else:
-                self._prepare_encoding_layers(name)
+            self._prepare_encoding_layers(name)
 
         for name in self.label_signature.keys():
             self._prepare_sparse_dense_layer_for(name, self.label_signature)
@@ -679,7 +693,7 @@ def _prepare_sparse_dense_layer_for(
             self._prepare_sparse_dense_layers(
                 signature[name][feature_type],
                 f"{name}_{feature_type}",
-                self.config[DENSE_DIMENSION],
+                self.config[DENSE_DIMENSION][name],
             )
 
     def _prepare_encoding_layers(self, name: Text) -> None:
@@ -704,7 +718,7 @@ def _prepare_encoding_layers(self, name: Text) -> None:
             return
 
         self._prepare_ffnn_layer(
-            f"{name}_{feature_type}",
+            f"{name}",
             [self.config[ENCODING_DIMENSION]],
             self.config[DROP_RATE_DIALOGUE],
         )
@@ -776,39 +790,56 @@ def _encode_features_per_attribute(
         attribute_mask = tf_batch_data[attribute][MASK][0]
 
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
-            batch_dim = self._get_batch_dim(tf_batch_data)
-            mask_sequence_text = self._get_mask_for(
-                tf_batch_data, TEXT, SEQUENCE_LENGTH
-            )
-            sequence_lengths = self._get_sequence_lengths(
-                tf_batch_data, TEXT, SEQUENCE_LENGTH, batch_dim
+            sequence_shape = [tf.shape(x) for x in tf_batch_data[attribute][SEQUENCE]]
+            sentence_shape = [tf.shape(x) for x in tf_batch_data[attribute][SENTENCE]]
+
+            sequence = [
+                tf.sparse.reshape(x, (-1, shape[2], shape[3]))
+                if isinstance(x, tf.SparseTensor)
+                else tf.reshape(x, (-1, shape[2], shape[3]))
+                for x, shape in zip(tf_batch_data[attribute][SEQUENCE], sequence_shape)
+            ]
+            sentence = [
+                tf.sparse.reshape(x, (-1, 1, shape[2]))
+                if isinstance(x, tf.SparseTensor)
+                else tf.reshape(x, (-1, shape[2]))
+                for x, shape in zip(tf_batch_data[attribute][SENTENCE], sentence_shape)
+            ]
+
+            _sequence_lengths = tf.cast(
+                tf_batch_data[attribute][SEQUENCE_LENGTH][0], dtype=tf.int32
             )
+            _sequence_lengths = tf.reshape(_sequence_lengths, (-1,))
+            mask_sequence_text = self._compute_mask(_sequence_lengths)
+            sequence_lengths = _sequence_lengths + 1
             mask_text = self._compute_mask(sequence_lengths)
 
             attribute_features, _, _, _ = self._create_sequence(
-                tf_batch_data[TEXT][SEQUENCE],
-                tf_batch_data[TEXT][SENTENCE],
+                sequence,
+                sentence,
                 mask_sequence_text,
                 mask_text,
                 attribute,
                 sparse_dropout=self.config[SPARSE_INPUT_DROPOUT],
                 dense_dropout=self.config[DENSE_INPUT_DROPOUT],
                 masked_lm_loss=self.config[MASKED_LM],
-                sequence_ids=True,
+                sequence_ids=False,
             )
             # TODO entities
-            return (
-                self._last_token(attribute_features, sequence_lengths) * attribute_mask
+            last_token = self._last_token(attribute_features, sequence_lengths)
+            attribute_features = tf.reshape(
+                last_token, (sequence_shape[0][0], sequence_shape[0][1], -1)
             )
 
-        attribute_features = self._combine_sparse_dense_features(
-            tf_batch_data[attribute][SENTENCE],
-            f"{attribute}_{SENTENCE}",
-            mask=attribute_mask,
-        )
+        else:
+            attribute_features = self._combine_sparse_dense_features(
+                tf_batch_data[attribute][SENTENCE],
+                f"{attribute}_{SENTENCE}",
+                mask=attribute_mask,
+            )
 
         if attribute in FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
-            attribute_features = self._tf_layers[f"ffnn.{attribute}_{SENTENCE}"](
+            attribute_features = self._tf_layers[f"ffnn.{attribute}"](
                 attribute_features
             )
 
@@ -879,7 +910,13 @@ def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-
+        for k, v in tf_batch_data.items():
+            print(k)
+            for _k, _v in v.items():
+                print("  ", _k)
+                for __v in _v:
+                    print("    ", __v.shape)
+        # exit()
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
 
         all_label_ids, all_labels_embed = self._create_all_labels_embed()
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index c6989c3cbc02..3b3ffea3ba18 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1318,39 +1318,6 @@ def _prepare_entity_recognition_layers(self) -> None:
                 f"tags.{name}",
             )
 
-    def _concat_sequence_sentence_features(
-        self,
-        sequence_x: tf.Tensor,
-        sentence_x: tf.Tensor,
-        name: Text,
-        mask_text: tf.Tensor,
-    ):
-        if sequence_x.shape[-1] != sentence_x.shape[-1]:
-            sequence_x = self._tf_layers[f"concat_layer.{name}_{SEQUENCE}"](
-                sequence_x, self._training
-            )
-            sentence_x = self._tf_layers[f"concat_layer.{name}_{SENTENCE}"](
-                sentence_x, self._training
-            )
-
-        # we need to concatenate the sequence features with the sentence features
-        # we cannot use tf.concat as the sequence features are padded
-
-        # (1) get position of sentence features in mask
-        last = mask_text * tf.math.cumprod(
-            1 - mask_text, axis=1, exclusive=True, reverse=True
-        )
-        # (2) multiply by sentence features so that we get a matrix of
-        #     batch-dim x seq-dim x feature-dim with zeros everywhere except for
-        #     for the sentence features
-        sentence_x = last * sentence_x
-
-        # (3) add a zero to the end of sequence matrix to match the final shape
-        sequence_x = tf.pad(sequence_x, [[0, 0], [0, 1], [0, 0]])
-
-        # (4) sum up sequence features and sentence features
-        return sequence_x + sentence_x
-
     def _create_bow(
         self,
         sequence_features: List[Union[tf.Tensor, tf.SparseTensor]],
@@ -1462,19 +1429,17 @@ def _calculate_entity_loss(
 
         return loss, f1, logits
 
-    @staticmethod
-    def _get_batch_dim(tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]) -> int:
-        if TEXT in tf_batch_data and SEQUENCE in tf_batch_data[TEXT]:
-            return tf.shape(tf_batch_data[TEXT][SEQUENCE][0])[0]
-
-        return tf.shape(tf_batch_data[TEXT][SENTENCE][0])[0]
-
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-
-        batch_dim = self._get_batch_dim(tf_batch_data)
+        for k, v in tf_batch_data.items():
+            print(k)
+            for _k, _v in v.items():
+                print("  ", _k)
+                for __v in _v:
+                    print("    ", __v.shape)
+        batch_dim = self._get_batch_dim(tf_batch_data[TEXT])
         mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT, SEQUENCE_LENGTH)
         sequence_lengths = self._get_sequence_lengths(
             tf_batch_data, TEXT, SEQUENCE_LENGTH, batch_dim
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index d10f8318828c..1d1d2093afa2 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -93,7 +93,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
             "at": ufunc.at,
             "__call__": ufunc,
         }
-        # convert the inputs to np.ndarray to prevent recursion, call the function, then cast it back as FeatureArray
+        # convert the inputs to np.ndarray to prevent recursion, call the function,
+        # then cast it back as FeatureArray
         output = FeatureArray(
             f[method](*(i.view(np.ndarray) for i in inputs), **kwargs),
             number_of_dimensions=kwargs["number_of_dimensions"],
@@ -141,20 +142,21 @@ def _validate_number_of_dimensions(
                 dim = i
                 break
 
-        # If the resulting sub_array is sparse, the remaining number of dimensions should be at least 2
+        # If the resulting sub_array is sparse, the remaining number of dimensions
+        # should be at least 2
         if isinstance(_sub_array, scipy.sparse.spmatrix):
             if dim > 2:
                 raise ValueError(
-                    f"Given number of dimensions '{number_of_dimensions}' does not match dimensiona of given input "
-                    f"array: {input_array}."
+                    f"Given number of dimensions '{number_of_dimensions}' does not "
+                    f"match dimensiona of given input array: {input_array}."
                 )
         # If the resulting sub_array is dense, the sub_array should be a single number
         elif not np.issubdtype(type(_sub_array), np.integer) and not isinstance(
             _sub_array, (np.float32, np.float64)
         ):
             raise ValueError(
-                f"Given number of dimensions '{number_of_dimensions}' does not match dimensiona of given input "
-                f"array: {input_array}."
+                f"Given number of dimensions '{number_of_dimensions}' does not match "
+                f"dimensions of given input array: {input_array}."
             )
 
     def get_shape_type_info(
@@ -486,12 +488,22 @@ def add_lengths(
         for features in self.data[from_key][from_sub_key]:
             if len(features) > 0:
                 if features.number_of_dimensions == 4:
-                    lengths = np.array([x[0].shape[0] for x in features])
+                    lengths = FeatureArray(
+                        np.array(
+                            [
+                                # add one more dim so that dialogue dim
+                                # would be a sequence
+                                np.array([[[x.shape[0]]] for x in _features])
+                                for _features in features
+                            ]
+                        ),
+                        number_of_dimensions=4,
+                    )
                 else:
-                    lengths = np.array([x.shape[0] for x in features])
-                self.data[key][sub_key].extend(
-                    [FeatureArray(lengths, number_of_dimensions=1)]
-                )
+                    lengths = FeatureArray(
+                        np.array([x.shape[0] for x in features]), number_of_dimensions=1
+                    )
+                self.data[key][sub_key].extend([lengths])
                 break
 
     def split(
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index d860cfe97851..17e8df4f31e7 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -160,7 +160,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = True,
+        eager: bool = False,
     ) -> None:
         """Fit model data"""
 
@@ -882,6 +882,39 @@ def _combine_sequence_sentence_features(
             "No features are present. Please check your configuration file."
         )
 
+    def _concat_sequence_sentence_features(
+        self,
+        sequence_x: tf.Tensor,
+        sentence_x: tf.Tensor,
+        name: Text,
+        mask_text: tf.Tensor,
+    ):
+        if sequence_x.shape[-1] != sentence_x.shape[-1]:
+            sequence_x = self._tf_layers[f"concat_layer.{name}_{SEQUENCE}"](
+                sequence_x, self._training
+            )
+            sentence_x = self._tf_layers[f"concat_layer.{name}_{SENTENCE}"](
+                sentence_x, self._training
+            )
+
+        # we need to concatenate the sequence features with the sentence features
+        # we cannot use tf.concat as the sequence features are padded
+
+        # (1) get position of sentence features in mask
+        last = mask_text * tf.math.cumprod(
+            1 - mask_text, axis=1, exclusive=True, reverse=True
+        )
+        # (2) multiply by sentence features so that we get a matrix of
+        #     batch-dim x seq-dim x feature-dim with zeros everywhere except for
+        #     for the sentence features
+        sentence_x = last * sentence_x
+
+        # (3) add a zero to the end of sequence matrix to match the final shape
+        sequence_x = tf.pad(sequence_x, [[0, 0], [0, 1], [0, 0]])
+
+        # (4) sum up sequence features and sentence features
+        return sequence_x + sentence_x
+
     def _features_as_seq_ids(
         self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
     ) -> Optional[tf.Tensor]:
@@ -994,7 +1027,14 @@ def _get_sequence_lengths(
         if key in tf_batch_data and sub_key in tf_batch_data[key]:
             sequence_lengths += tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32)
 
-        return sequence_lengths
+        return tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32) + 1
+
+    @staticmethod
+    def _get_batch_dim(attribute_data: Dict[Text, List[tf.Tensor]]) -> int:
+        if SEQUENCE in attribute_data:
+            return tf.shape(attribute_data[SEQUENCE][0])[0]
+
+        return tf.shape(attribute_data[SENTENCE][0])[0]
 
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]

From 0a97001fefb493f88eda73e0b79d601eac5df285 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 22 Oct 2020 16:07:48 +0200
Subject: [PATCH 03/62] fix shapes in non eager mode

---
 rasa/core/policies/ted_policy.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 6ec2591ea79a..5e8bc4186a3f 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -790,8 +790,14 @@ def _encode_features_per_attribute(
         attribute_mask = tf_batch_data[attribute][MASK][0]
 
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
-            sequence_shape = [tf.shape(x) for x in tf_batch_data[attribute][SEQUENCE]]
-            sentence_shape = [tf.shape(x) for x in tf_batch_data[attribute][SENTENCE]]
+            sequence_shape = [
+                [tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], x.shape[-1]]
+                for x in tf_batch_data[attribute][SEQUENCE]
+            ]
+            sentence_shape = [
+                [tf.shape(x)[0], tf.shape(x)[1], x.shape[-1]]
+                for x in tf_batch_data[attribute][SENTENCE]
+            ]
 
             sequence = [
                 tf.sparse.reshape(x, (-1, shape[2], shape[3]))
@@ -805,6 +811,22 @@ def _encode_features_per_attribute(
                 else tf.reshape(x, (-1, shape[2]))
                 for x, shape in zip(tf_batch_data[attribute][SENTENCE], sentence_shape)
             ]
+            sequence = [
+                tf.SparseTensor(
+                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[3])
+                )
+                if isinstance(x, tf.SparseTensor)
+                else x
+                for x, shape in zip(sequence, sequence_shape)
+            ]
+            sentence = [
+                tf.SparseTensor(
+                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[2])
+                )
+                if isinstance(x, tf.SparseTensor)
+                else x
+                for x, shape in zip(sentence, sentence_shape)
+            ]
 
             _sequence_lengths = tf.cast(
                 tf_batch_data[attribute][SEQUENCE_LENGTH][0], dtype=tf.int32

From f904e46075d06f8ba5dd1365cc624e471474fb21 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 22 Oct 2020 16:14:57 +0200
Subject: [PATCH 04/62] make shape indices more general

---
 rasa/core/policies/ted_policy.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 5e8bc4186a3f..4c7909ba600f 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -795,25 +795,25 @@ def _encode_features_per_attribute(
                 for x in tf_batch_data[attribute][SEQUENCE]
             ]
             sentence_shape = [
-                [tf.shape(x)[0], tf.shape(x)[1], x.shape[-1]]
+                [tf.shape(x)[0], tf.shape(x)[1], 1, x.shape[-1]]
                 for x in tf_batch_data[attribute][SENTENCE]
             ]
 
             sequence = [
-                tf.sparse.reshape(x, (-1, shape[2], shape[3]))
+                tf.sparse.reshape(x, (-1, shape[2], shape[-1]))
                 if isinstance(x, tf.SparseTensor)
-                else tf.reshape(x, (-1, shape[2], shape[3]))
+                else tf.reshape(x, (-1, shape[2], shape[-1]))
                 for x, shape in zip(tf_batch_data[attribute][SEQUENCE], sequence_shape)
             ]
             sentence = [
-                tf.sparse.reshape(x, (-1, 1, shape[2]))
+                tf.sparse.reshape(x, (-1, shape[2], shape[-1]))
                 if isinstance(x, tf.SparseTensor)
-                else tf.reshape(x, (-1, shape[2]))
+                else tf.reshape(x, (-1, shape[2], shape[-1]))
                 for x, shape in zip(tf_batch_data[attribute][SENTENCE], sentence_shape)
             ]
             sequence = [
                 tf.SparseTensor(
-                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[3])
+                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[-1])
                 )
                 if isinstance(x, tf.SparseTensor)
                 else x
@@ -821,7 +821,7 @@ def _encode_features_per_attribute(
             ]
             sentence = [
                 tf.SparseTensor(
-                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[2])
+                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[-1])
                 )
                 if isinstance(x, tf.SparseTensor)
                 else x

From dd576e249140b7e11f58f97ba5b35bea10b5a23a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 22 Oct 2020 17:44:49 +0200
Subject: [PATCH 05/62] fix add_lengths

---
 rasa/utils/tensorflow/model_data.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index d10f8318828c..9d7a488e608f 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -484,15 +484,27 @@ def add_lengths(
         self.data[key][sub_key] = []
 
         for features in self.data[from_key][from_sub_key]:
-            if len(features) > 0:
-                if features.number_of_dimensions == 4:
-                    lengths = np.array([x[0].shape[0] for x in features])
-                else:
-                    lengths = np.array([x.shape[0] for x in features])
-                self.data[key][sub_key].extend(
-                    [FeatureArray(lengths, number_of_dimensions=1)]
+            if len(features) == 0:
+                continue
+
+            if features.number_of_dimensions == 4:
+                lengths = FeatureArray(
+                    np.array(
+                        [
+                            # add one more dim so that dialogue dim
+                            # would be a sequence
+                            np.array([[[x.shape[0]]] for x in _features])
+                            for _features in features
+                        ]
+                    ),
+                    number_of_dimensions=4,
                 )
-                break
+            else:
+                lengths = FeatureArray(
+                    np.array([x.shape[0] for x in features]), number_of_dimensions=1
+                )
+            self.data[key][sub_key].extend([lengths])
+            break
 
     def split(
         self, number_of_test_examples: int, random_seed: int

From 45a29828a213c13d95af6a4516e08a617a725291 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 09:25:49 +0200
Subject: [PATCH 06/62] add todo

---
 rasa/core/policies/ted_policy.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 55e65c36bbc7..fc09fb63fcc5 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -296,6 +296,12 @@ def _create_model_data(
         """
         model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)
 
+        # TODO:
+        #  sentence features should also be 4D
+        #  sequence length should be 4D
+        #  pad_data should convert 4D to 3D (sum up batch and dialogue dimension)
+        #  inside batch_loss after the transformer convert 3D back to 4D
+
         if label_ids is not None and encoded_all_labels is not None:
 
             label_ids = np.array(

From 6e64bd739f489f70df64264b3b284673d954af39 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 10:11:19 +0200
Subject: [PATCH 07/62] sentence features are now also 4D

---
 rasa/utils/tensorflow/model_data_utils.py | 32 +++++++----------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py
index f3fb30a2813a..09b54fc3af03 100644
--- a/rasa/utils/tensorflow/model_data_utils.py
+++ b/rasa/utils/tensorflow/model_data_utils.py
@@ -282,36 +282,22 @@ def _features_for_attribute(
     sparse_features = {}
     dense_features = {}
 
-    # vstack serves as removing dimension in case we are not dealing with a sequence
     for key, values in _sparse_features.items():
-        if key == SEQUENCE:
-            if consider_dialogue_dimension:
-                sparse_features[key] = FeatureArray(
-                    np.array(values), number_of_dimensions=4
-                )
-            else:
-                sparse_features[key] = FeatureArray(
-                    np.array([v[0] for v in values]), number_of_dimensions=3
-                )
+        if consider_dialogue_dimension:
+            sparse_features[key] = FeatureArray(
+                np.array(values), number_of_dimensions=4
+            )
         else:
-            features = [scipy.sparse.vstack(value) for value in values]
             sparse_features[key] = FeatureArray(
-                np.array(features), number_of_dimensions=3
+                np.array([v[0] for v in values]), number_of_dimensions=3
             )
+
     for key, values in _dense_features.items():
-        if key == SEQUENCE:
-            if consider_dialogue_dimension:
-                dense_features[key] = FeatureArray(
-                    np.array(values), number_of_dimensions=4
-                )
-            else:
-                dense_features[key] = FeatureArray(
-                    np.array([v[0] for v in values]), number_of_dimensions=3
-                )
+        if consider_dialogue_dimension:
+            dense_features[key] = FeatureArray(np.array(values), number_of_dimensions=4)
         else:
-            features = [np.vstack(value) for value in values]
             dense_features[key] = FeatureArray(
-                np.array(features), number_of_dimensions=3
+                np.array([v[0] for v in values]), number_of_dimensions=3
             )
 
     attribute_to_feature_arrays = {

From 94c0fa901e7be8f3c30b4b561453af8eb485988e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 10:26:56 +0200
Subject: [PATCH 08/62] sequence length is 4D

---
 rasa/core/policies/ted_policy.py        | 11 +++++++++--
 rasa/nlu/classifiers/diet_classifier.py |  2 +-
 rasa/utils/tensorflow/constants.py      |  1 +
 rasa/utils/tensorflow/models.py         |  6 +++---
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index fc09fb63fcc5..1f75792ec29a 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -72,6 +72,7 @@
     UNIDIRECTIONAL_ENCODER,
     SEQUENCE,
     SENTENCE,
+    SEQUENCE_LENGTH,
     DENSE_DIMENSION,
     E2E_CONFIDENCE_THRESHOLD,
     MASK,
@@ -297,8 +298,6 @@ def _create_model_data(
         model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)
 
         # TODO:
-        #  sentence features should also be 4D
-        #  sequence length should be 4D
         #  pad_data should convert 4D to 3D (sum up batch and dialogue dimension)
         #  inside batch_loss after the transformer convert 3D back to 4D
 
@@ -326,6 +325,7 @@ def _create_model_data(
         model_data.add_lengths(
             DIALOGUE, LENGTH, next(iter(list(attribute_data.keys()))), MASK
         )
+        model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
 
         return model_data
 
@@ -846,6 +846,13 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
+        for key, values in tf_batch_data.items():
+            print(key)
+            for sub_key, tensors in values.items():
+                print(f"   {sub_key}")
+                for t in tensors:
+                    print(f"     {t.shape}")
+
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
 
         all_label_ids, all_labels_embed = self._create_all_labels_embed()
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index f58e6d301ad7..2470c2ca0f2e 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -90,6 +90,7 @@
     CHECKPOINT_MODEL,
     SEQUENCE,
     SENTENCE,
+    SEQUENCE_LENGTH,
     DENSE_DIMENSION,
     MASK,
 )
@@ -99,7 +100,6 @@
 
 SPARSE = "sparse"
 DENSE = "dense"
-SEQUENCE_LENGTH = f"{SEQUENCE}_lengths"
 LABEL_KEY = LABEL
 LABEL_SUB_KEY = "ids"
 TAG_IDS = "tag_ids"
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 3f42323260be..9244e35cedff 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -68,6 +68,7 @@
 
 SEQUENCE = "sequence"
 SENTENCE = "sentence"
+SEQUENCE_LENGTH = f"{SEQUENCE}_lengths"
 
 POOLING = "pooling"
 MAX_POOLING = "max"
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index d26e77efe5ef..b2b105b90d25 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -153,7 +153,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = False,
+        eager: bool = True,
     ) -> None:
         """Fit model data"""
 
@@ -285,7 +285,7 @@ def train_on_batch(
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
     def build_for_predict(
-        self, predict_data: RasaModelData, eager: bool = False
+        self, predict_data: RasaModelData, eager: bool = True
     ) -> None:
         self._training = False  # needed for tf graph mode
         self._predict_function = self._get_tf_call_model_function(
@@ -533,7 +533,7 @@ def batch_to_model_data_format(
                             batch[idx + 2][i] for i in range(number_of_dimensions - 1)
                         ] + [feature_dimension]
                         batch_data[key][sub_key].append(
-                            tf.SparseTensor(batch[idx], batch[idx + 1], shape,)
+                            tf.SparseTensor(batch[idx], batch[idx + 1], shape)
                         )
                         idx += 3
                     else:

From 5263b6064d34e4edc7119506e035c721c1036228 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 11:39:47 +0200
Subject: [PATCH 09/62] convert 4d to 3d during padding

---
 rasa/utils/tensorflow/model_data.py       | 72 ++++++++++++++---------
 rasa/utils/tensorflow/models.py           |  3 +
 tests/utils/tensorflow/test_model_data.py | 14 ++---
 3 files changed, 54 insertions(+), 35 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 9d7a488e608f..90bb00d6defc 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -178,13 +178,13 @@ def get_shape_type_info(
             A list of type tuples.
         """
         if self.is_sparse:
+            # 4D tensors were converted into 3D tensors during padding
+            number_of_dimensions = (
+                self.number_of_dimensions if self.number_of_dimensions != 4 else 3
+            )
             # scipy matrix is converted into indices, data, shape
             return (
-                [
-                    (None, self.number_of_dimensions),
-                    (None,),
-                    (self.number_of_dimensions),
-                ],
+                [(None, number_of_dimensions), (None,), (number_of_dimensions)],
                 [tf.int64, tf.float32, tf.int64],
             )
 
@@ -198,13 +198,15 @@ def get_shape_type_info(
             return [(None, None, self.units)], [tf.float32]
 
         if self.number_of_dimensions == 4:
-            return [(None, None, None, self.units)], [tf.float32]
+            # 4D tensors were converted into 3D tensors during padding
+            return [(None, None, self.units)], [tf.float32]
 
         return [], []
 
 
 class FeatureSignature(NamedTuple):
-    """Stores the number of units, the type (sparse vs dense), and the number of dimensions of features."""
+    """Stores the number of units, the type (sparse vs dense), and the number of
+    dimensions of features."""
 
     is_sparse: bool
     units: Optional[int]
@@ -1068,8 +1070,15 @@ def _pad_dense_data(array_of_dense: FeatureArray) -> np.ndarray:
     def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray:
         # in case of dialogue data we may have 4 dimensions
         # batch size x dialogue history length x sequence length x number of features
-        data_size = len(array_of_array_of_dense)
-        max_dialogue_len = max(
+
+        # as transformers cannot handle 4D tensors pad and reshape the data
+        # so that the resulting tensor is 3D
+        # the shape is (sum of dialogue history length for all tensors in the
+        # batch x max sequence length x number of features)
+        # the original shape is passed on the model via the data signature, the
+        # original shape can be used to transform the 3D tensor back into 4D
+
+        sum_dialogue_len = sum(
             len(array_of_dense) for array_of_dense in array_of_array_of_dense
         )
         max_seq_len = max(
@@ -1081,18 +1090,15 @@ def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray:
         )
 
         data_padded = np.zeros(
-            [
-                data_size,
-                max_dialogue_len,
-                max_seq_len,
-                array_of_array_of_dense[0][0].shape[-1],
-            ],
+            [sum_dialogue_len, max_seq_len, array_of_array_of_dense[0][0].shape[-1]],
             dtype=array_of_array_of_dense[0][0].dtype,
         )
 
+        current_sum_dialogue_len = 0
         for i, array_of_dense in enumerate(array_of_array_of_dense):
             for j, dense in enumerate(array_of_dense):
-                data_padded[i, j, : dense.shape[0], :] = dense
+                data_padded[current_sum_dialogue_len + j, : dense.shape[0], :] = dense
+            current_sum_dialogue_len += len(array_of_dense)
 
         return data_padded.astype(np.float32)
 
@@ -1136,10 +1142,19 @@ def _scipy_matrix_to_values(array_of_sparse: FeatureArray) -> List[np.ndarray]:
         ]
 
     @staticmethod
-    def _4d_scipy_matrix_to_values(array_of_array_of_sparse: FeatureArray):
+    def _4d_scipy_matrix_to_values(
+        array_of_array_of_sparse: FeatureArray,
+    ) -> List[np.ndarray]:
         # in case of dialogue data we may have 4 dimensions
         # batch size x dialogue history length x sequence length x number of features
 
+        # as transformers cannot handle 4D tensors pad and reshape the data
+        # so that the resulting tensor is 3D
+        # the shape is (sum of dialogue history length for all tensors in the
+        # batch x max sequence length x number of features)
+        # the original shape is passed on the model via the data signature, the
+        # original shape can be used to transform the 3D tensor back into 4D
+
         # we need to make sure that the matrices are coo_matrices otherwise the
         # transformation does not work (e.g. you cannot access x.row, x.col)
         if not isinstance(array_of_array_of_sparse[0][0], scipy.sparse.coo_matrix):
@@ -1148,8 +1163,8 @@ def _4d_scipy_matrix_to_values(array_of_array_of_sparse: FeatureArray):
                 for array_of_sparse in array_of_array_of_sparse
             ]
 
-        max_dialogue_len = max(
-            [len(array_of_sparse) for array_of_sparse in array_of_array_of_sparse]
+        max_dialogue_len = sum(
+            len(array_of_sparse) for array_of_sparse in array_of_array_of_sparse
         )
         max_seq_len = max(
             [
@@ -1162,7 +1177,15 @@ def _4d_scipy_matrix_to_values(array_of_array_of_sparse: FeatureArray):
         indices = np.hstack(
             [
                 np.vstack(
-                    [i * np.ones_like(x.row), j * np.ones_like(x.row), x.row, x.col]
+                    [
+                        sum(
+                            len(array_of_sparse)
+                            for array_of_sparse in array_of_array_of_sparse[:i]
+                        )
+                        + j * np.ones_like(x.row),
+                        x.row,
+                        x.col,
+                    ]
                 )
                 for i, array_of_sparse in enumerate(array_of_array_of_sparse)
                 for j, x in enumerate(array_of_sparse)
@@ -1178,14 +1201,7 @@ def _4d_scipy_matrix_to_values(array_of_array_of_sparse: FeatureArray):
         )
 
         number_of_features = array_of_array_of_sparse[0][0].shape[-1]
-        shape = np.array(
-            (
-                len(array_of_array_of_sparse),
-                max_dialogue_len,
-                max_seq_len,
-                number_of_features,
-            )
-        )
+        shape = np.array((max_dialogue_len, max_seq_len, number_of_features))
 
         return [
             indices.astype(np.int64),
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index b2b105b90d25..b11a38350b6e 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -526,6 +526,9 @@ def batch_to_model_data_format(
         for key, values in data_signature.items():
             for sub_key, signature in values.items():
                 for is_sparse, feature_dimension, number_of_dimensions in signature:
+                    number_of_dimensions = (
+                        number_of_dimensions if number_of_dimensions != 4 else 3
+                    )
                     if is_sparse:
                         # explicitly substitute last dimension in shape with known
                         # static value
diff --git a/tests/utils/tensorflow/test_model_data.py b/tests/utils/tensorflow/test_model_data.py
index d26bac06d722..304179434b13 100644
--- a/tests/utils/tensorflow/test_model_data.py
+++ b/tests/utils/tensorflow/test_model_data.py
@@ -96,7 +96,7 @@ async def model_data() -> RasaModelData:
                                 [
                                     scipy.sparse.csr_matrix(
                                         np.random.randint(5, size=(3, 10))
-                                    ),
+                                    )
                                 ],
                                 [
                                     scipy.sparse.csr_matrix(
@@ -123,13 +123,13 @@ async def model_data() -> RasaModelData:
                                     np.random.rand(1, 14),
                                     np.random.rand(3, 14),
                                 ],
-                                [np.random.rand(5, 14), np.random.rand(2, 14),],
+                                [np.random.rand(5, 14), np.random.rand(2, 14)],
                                 [
                                     np.random.rand(5, 14),
                                     np.random.rand(1, 14),
                                     np.random.rand(3, 14),
                                 ],
-                                [np.random.rand(3, 14),],
+                                [np.random.rand(3, 14)],
                                 [
                                     np.random.rand(3, 14),
                                     np.random.rand(1, 14),
@@ -383,12 +383,12 @@ def test_get_num_of_features(model_data: RasaModelData):
                                 np.random.rand(7, 10),
                             ]
                         ),
-                        np.array([np.random.rand(2, 10),]),
+                        np.array([np.random.rand(2, 10)]),
                     ]
                 ),
                 number_of_dimensions=4,
             ),
-            (3, 4, 7, 10),
+            (8, 7, 10),
         ),
     ],
 )
@@ -466,14 +466,14 @@ def test_pad_dense_data(incoming_data: FeatureArray, expected_shape: np.ndarray)
                             [
                                 scipy.sparse.csr_matrix(
                                     np.random.randint(10, size=(2, 10))
-                                ),
+                                )
                             ]
                         ),
                     ]
                 ),
                 number_of_dimensions=4,
             ),
-            (3, 4, 7, 10),
+            (8, 7, 10),
         ),
     ],
 )

From b5d479b7edb2342b5771cb1ca98b34ee41e806f5 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 14:07:00 +0200
Subject: [PATCH 10/62] mask is 4d now

---
 rasa/utils/tensorflow/model_data_utils.py | 24 ++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py
index 09b54fc3af03..0459cee91b8d 100644
--- a/rasa/utils/tensorflow/model_data_utils.py
+++ b/rasa/utils/tensorflow/model_data_utils.py
@@ -300,9 +300,18 @@ def _features_for_attribute(
                 np.array([v[0] for v in values]), number_of_dimensions=3
             )
 
-    attribute_to_feature_arrays = {
-        MASK: [FeatureArray(np.array(attribute_masks), number_of_dimensions=3)]
-    }
+    if consider_dialogue_dimension:
+        attribute_to_feature_arrays = {
+            MASK: [FeatureArray(np.array(attribute_masks), number_of_dimensions=4)]
+        }
+    else:
+        attribute_to_feature_arrays = {
+            MASK: [
+                FeatureArray(
+                    np.array(np.squeeze(attribute_masks, -1)), number_of_dimensions=3
+                )
+            ]
+        }
 
     feature_types = set()
     feature_types.update(list(dense_features.keys()))
@@ -354,7 +363,7 @@ def _extract_features(
         # create a mask for every state
         # to capture which turn has which input
         attribute_mask = np.expand_dims(
-            np.ones(len(list_of_list_of_features), np.float32), -1
+            np.expand_dims(np.ones(len(list_of_list_of_features), np.float32), -1), -1
         )
 
         for i, list_of_features in enumerate(list_of_list_of_features):
@@ -365,9 +374,10 @@ def _extract_features(
                 list_of_features = zero_features
 
             for features in list_of_features:
-                # in case of ENTITIES, if the attribute type matches either 'entity', 'role', or 'group' the
-                # features correspond to the tag ids of that entity type
-                # in order to distinguish later on between the different tag ids, we use the entity type as key
+                # in case of ENTITIES, if the attribute type matches either 'entity',
+                # 'role', or 'group' the features correspond to the tag ids of that
+                # entity type in order to distinguish later on between the different
+                # tag ids, we use the entity type as key
                 if attribute == ENTITIES and features.attribute in [
                     ENTITY_ATTRIBUTE_TYPE,
                     ENTITY_ATTRIBUTE_GROUP,

From df2ccc3bbfc88cb08a484713d95d90603f987c36 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 14:20:42 +0200
Subject: [PATCH 11/62] bring mask in correct shape before transformer

---
 rasa/core/policies/ted_policy.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 1f75792ec29a..46d967e4153a 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -297,10 +297,6 @@ def _create_model_data(
         """
         model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)
 
-        # TODO:
-        #  pad_data should convert 4D to 3D (sum up batch and dialogue dimension)
-        #  inside batch_loss after the transformer convert 3D back to 4D
-
         if label_ids is not None and encoded_all_labels is not None:
 
             label_ids = np.array(
@@ -733,12 +729,16 @@ def _emebed_dialogue(
         """Create dialogue level embedding and mask."""
 
         mask = self._compute_mask(sequence_lengths)
+        # remove the additional dimensions that were added due to 4D shape
+        mask = tf.squeeze(tf.squeeze(mask, axis=-1), axis=-1)
 
         dialogue_transformed = self._tf_layers[f"transformer.{DIALOGUE}"](
             dialogue_in, 1 - mask, self._training
         )
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
+        # TODO transform back to original 4D shape
+
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(

From 9813014d9a45c61a77d812a40a8d51e1eb633057 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 15:35:55 +0200
Subject: [PATCH 12/62] keep also the orginial dialogue length

---
 rasa/core/policies/ted_policy.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 46d967e4153a..68b8f164c457 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -321,6 +321,14 @@ def _create_model_data(
         model_data.add_lengths(
             DIALOGUE, LENGTH, next(iter(list(attribute_data.keys()))), MASK
         )
+        model_data.data[DIALOGUE][f"3D_{LENGTH}"] = [
+            FeatureArray(
+                np.array(
+                    [np.squeeze(f, -1) for f in model_data.data[DIALOGUE][LENGTH][0]]
+                ),
+                number_of_dimensions=3,
+            )
+        ]
         model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
 
         return model_data
@@ -724,7 +732,10 @@ def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
         return all_label_ids, all_labels_embed
 
     def _emebed_dialogue(
-        self, dialogue_in: tf.Tensor, sequence_lengths: tf.Tensor
+        self,
+        dialogue_in: tf.Tensor,
+        sequence_lengths: tf.Tensor,
+        dialogue_3d_lengths: tf.Tensor,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Create dialogue level embedding and mask."""
 
@@ -738,6 +749,17 @@ def _emebed_dialogue(
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
         # TODO transform back to original 4D shape
+        output = tf.zeros(
+            (
+                dialogue_3d_lengths.shape[0],
+                dialogue_3d_lengths.shape[1],
+                dialogue_transformed.shape[1],
+                dialogue_transformed.shape[2],
+            )
+        )
+
+        # output shape 32, 29, 1, 128
+        # dialogue_transformed shape 647, 1, 128
 
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
@@ -854,6 +876,9 @@ def batch_loss(
                     print(f"     {t.shape}")
 
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+        dialogue_3d_lengths = tf.cast(
+            tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
+        )
 
         all_label_ids, all_labels_embed = self._create_all_labels_embed()
 
@@ -862,7 +887,7 @@ def batch_loss(
 
         dialogue_in = self._process_batch_data(tf_batch_data)
         dialogue_embed, dialogue_mask = self._emebed_dialogue(
-            dialogue_in, dialogue_lengths
+            dialogue_in, dialogue_lengths, dialogue_3d_lengths
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 

From 71c527f927239fa35233b1357a10de5e845b6efb Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 23 Oct 2020 17:25:56 +0200
Subject: [PATCH 13/62] update doc strings

---
 rasa/core/policies/ted_policy.py | 39 ++++++++++++--------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 68b8f164c457..f1db91017424 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -749,17 +749,6 @@ def _emebed_dialogue(
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
         # TODO transform back to original 4D shape
-        output = tf.zeros(
-            (
-                dialogue_3d_lengths.shape[0],
-                dialogue_3d_lengths.shape[1],
-                dialogue_transformed.shape[1],
-                dialogue_transformed.shape[2],
-            )
-        )
-
-        # output shape 32, 29, 1, 128
-        # dialogue_transformed shape 647, 1, 128
 
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
@@ -775,11 +764,13 @@ def _emebed_dialogue(
     def _encode_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
     ) -> Optional[tf.Tensor]:
-        """
-        Encodes features for a given attribute
+        """Encodes features for a given attribute
+
         Args:
             tf_batch_data: dictionary mapping every attribute to its features and masks
-            attribute: the attribute we will encode features for (e.g., ACTION_NAME, INTENT)
+            attribute: the attribute we will encode features for
+            (e.g., ACTION_NAME, INTENT)
+
         Returns:
             A tensor combining  all features for `attribute`
         """
@@ -805,9 +796,12 @@ def _encode_features_per_attribute(
     def _process_batch_data(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
     ) -> tf.Tensor:
-        """Encodes batch data; combines intent and text and action name and action text if both are present
+        """Encodes batch data; combines intent and text and action name and action
+        text if both are present.
+
         Args:
             tf_batch_data: dictionary mapping every attribute to its features and masks
+
         Returns:
              Tensor: encoding of all features in the batch, combined;
         """
@@ -817,7 +811,8 @@ def _process_batch_data(
             for key in tf_batch_data.keys()
             if LABEL_KEY not in key and DIALOGUE not in key
         }
-        # if both action text and action name are present, combine them; otherwise, return the one which is present
+        # if both action text and action name are present, combine them; otherwise,
+        # return the one which is present
 
         if (
             batch_encoded.get(ACTION_TEXT) is not None
@@ -868,13 +863,6 @@ def batch_loss(
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
 
-        for key, values in tf_batch_data.items():
-            print(key)
-            for sub_key, tensors in values.items():
-                print(f"   {sub_key}")
-                for t in tensors:
-                    print(f"     {t.shape}")
-
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
         dialogue_3d_lengths = tf.cast(
             tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
@@ -913,13 +901,16 @@ def batch_predict(
         )
 
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+        dialogue_3d_lengths = tf.cast(
+            tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
+        )
 
         if self.all_labels_embed is None:
             _, self.all_labels_embed = self._create_all_labels_embed()
 
         dialogue_in = self._process_batch_data(tf_batch_data)
         dialogue_embed, dialogue_mask = self._emebed_dialogue(
-            dialogue_in, dialogue_lengths
+            dialogue_in, dialogue_lengths, dialogue_3d_lengths
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 

From f4c119a33ca94ee728e6c95fd1e2e4b22f90bdc4 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 26 Oct 2020 16:39:59 +0100
Subject: [PATCH 14/62] use tf.scatter_nd to transform 3d back to 4d

---
 rasa/core/policies/ted_policy.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index f1db91017424..dda229102ec2 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -748,14 +748,31 @@ def _emebed_dialogue(
         )
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
-        # TODO transform back to original 4D shape
-
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
-                self._last_token(dialogue_transformed, sequence_lengths), 1
+                self._last_token(dialogue_transformed, tf.squeeze(sequence_lengths)), 1
+            )
+            mask = tf.expand_dims(
+                self._last_token(mask, tf.squeeze(sequence_lengths)), 1
             )
-            mask = tf.expand_dims(self._last_token(mask, sequence_lengths), 1)
+
+        # transform dialogue tensor back to original 4D shape
+        indices = []
+        for batch_dim in range(dialogue_3d_lengths.shape[0]):
+            for dialogue_dim in range(dialogue_3d_lengths.shape[1]):
+                if dialogue_3d_lengths[batch_dim][dialogue_dim] > 0:
+                    indices.append([batch_dim, dialogue_dim])
+        indices = tf.constant(indices)
+        shape = tf.constant(
+            [
+                dialogue_3d_lengths.shape[0],
+                dialogue_3d_lengths.shape[1],
+                dialogue_transformed.shape[1],
+                dialogue_transformed.shape[2],
+            ]
+        )
+        dialogue_transformed = tf.scatter_nd(indices, dialogue_transformed, shape)
 
         dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed)
 

From 886ab01110e0c668caf8b2b003e91197fd1aecad Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 27 Oct 2020 13:54:29 +0100
Subject: [PATCH 15/62] move tensor transformation to
 _encode_features_per_attribute

---
 rasa/core/policies/ted_policy.py | 109 ++++++++++---------------------
 1 file changed, 34 insertions(+), 75 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 7828d4e67274..d66d450d2a36 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -762,10 +762,7 @@ def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
         return all_label_ids, all_labels_embed
 
     def _emebed_dialogue(
-        self,
-        dialogue_in: tf.Tensor,
-        sequence_lengths: tf.Tensor,
-        dialogue_3d_lengths: tf.Tensor,
+        self, dialogue_in: tf.Tensor, sequence_lengths: tf.Tensor
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Create dialogue level embedding and mask."""
 
@@ -781,28 +778,9 @@ def _emebed_dialogue(
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
-                self._last_token(dialogue_transformed, tf.squeeze(sequence_lengths)), 1
-            )
-            mask = tf.expand_dims(
-                self._last_token(mask, tf.squeeze(sequence_lengths)), 1
+                self._last_token(dialogue_transformed, sequence_lengths), 1
             )
-
-        # transform dialogue tensor back to original 4D shape
-        indices = []
-        for batch_dim in range(dialogue_3d_lengths.shape[0]):
-            for dialogue_dim in range(dialogue_3d_lengths.shape[1]):
-                if dialogue_3d_lengths[batch_dim][dialogue_dim] > 0:
-                    indices.append([batch_dim, dialogue_dim])
-        indices = tf.constant(indices)
-        shape = tf.constant(
-            [
-                dialogue_3d_lengths.shape[0],
-                dialogue_3d_lengths.shape[1],
-                dialogue_transformed.shape[1],
-                dialogue_transformed.shape[2],
-            ]
-        )
-        dialogue_transformed = tf.scatter_nd(indices, dialogue_transformed, shape)
+            mask = tf.expand_dims(self._last_token(mask, sequence_lengths), 1)
 
         dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed)
 
@@ -825,55 +803,16 @@ def _encode_features_per_attribute(
         attribute_mask = tf_batch_data[attribute][MASK][0]
 
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
-            sequence_shape = [
-                [tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], x.shape[-1]]
-                for x in tf_batch_data[attribute][SEQUENCE]
-            ]
-            sentence_shape = [
-                [tf.shape(x)[0], tf.shape(x)[1], 1, x.shape[-1]]
-                for x in tf_batch_data[attribute][SENTENCE]
-            ]
-
-            sequence = [
-                tf.sparse.reshape(x, (-1, shape[2], shape[-1]))
-                if isinstance(x, tf.SparseTensor)
-                else tf.reshape(x, (-1, shape[2], shape[-1]))
-                for x, shape in zip(tf_batch_data[attribute][SEQUENCE], sequence_shape)
-            ]
-            sentence = [
-                tf.sparse.reshape(x, (-1, shape[2], shape[-1]))
-                if isinstance(x, tf.SparseTensor)
-                else tf.reshape(x, (-1, shape[2], shape[-1]))
-                for x, shape in zip(tf_batch_data[attribute][SENTENCE], sentence_shape)
-            ]
-            sequence = [
-                tf.SparseTensor(
-                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[-1])
-                )
-                if isinstance(x, tf.SparseTensor)
-                else x
-                for x, shape in zip(sequence, sequence_shape)
-            ]
-            sentence = [
-                tf.SparseTensor(
-                    x.indices, x.values, (tf.shape(x)[0], tf.shape(x)[1], shape[-1])
-                )
-                if isinstance(x, tf.SparseTensor)
-                else x
-                for x, shape in zip(sentence, sentence_shape)
-            ]
-
             _sequence_lengths = tf.cast(
                 tf_batch_data[attribute][SEQUENCE_LENGTH][0], dtype=tf.int32
             )
-            _sequence_lengths = tf.reshape(_sequence_lengths, (-1,))
-            mask_sequence_text = self._compute_mask(_sequence_lengths)
+            mask_sequence_text = self._compute_mask(tf.squeeze(_sequence_lengths))
             sequence_lengths = _sequence_lengths + 1
-            mask_text = self._compute_mask(sequence_lengths)
+            mask_text = self._compute_mask(tf.squeeze(sequence_lengths))
 
             attribute_features, _, _, _ = self._create_sequence(
-                sequence,
-                sentence,
+                tf_batch_data[attribute][SEQUENCE],
+                tf_batch_data[attribute][SENTENCE],
                 mask_sequence_text,
                 mask_text,
                 attribute,
@@ -883,9 +822,32 @@ def _encode_features_per_attribute(
                 sequence_ids=False,
             )
             # TODO entities
-            last_token = self._last_token(attribute_features, sequence_lengths)
-            attribute_features = tf.reshape(
-                last_token, (sequence_shape[0][0], sequence_shape[0][1], -1)
+            last_token = self._last_token(
+                attribute_features, tf.squeeze(sequence_lengths)
+            )
+
+            # transform attribute features back to original
+            # batch x dialogue length x units
+            indices = []
+            dialogue_lengths = tf.cast(
+                tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
+            )
+            for batch_dim in range(dialogue_lengths.shape[0]):
+                for dialogue_dim in range(dialogue_lengths.shape[1]):
+                    if dialogue_lengths[batch_dim][dialogue_dim] > 0:
+                        indices.append([batch_dim, dialogue_dim])
+            indices = tf.constant(indices)
+            shape = tf.constant(
+                [
+                    dialogue_lengths.shape[0],
+                    dialogue_lengths.shape[1],
+                    last_token.shape[-1],
+                ]
+            )
+            attribute_features = tf.scatter_nd(indices, last_token, shape)
+
+            attribute_mask = tf.expand_dims(
+                tf.squeeze(self._compute_mask(tf.squeeze(dialogue_lengths))), axis=-1
             )
 
         else:
@@ -979,9 +941,6 @@ def batch_loss(
                     print("    ", __v.shape)
         # exit()
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-        dialogue_3d_lengths = tf.cast(
-            tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
-        )
 
         all_label_ids, all_labels_embed = self._create_all_labels_embed()
 
@@ -990,7 +949,7 @@ def batch_loss(
 
         dialogue_in = self._process_batch_data(tf_batch_data)
         dialogue_embed, dialogue_mask = self._emebed_dialogue(
-            dialogue_in, dialogue_lengths, dialogue_3d_lengths
+            dialogue_in, dialogue_lengths
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 

From 098e441360f8b80e41b375569a32db245ec41dbc Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 27 Oct 2020 14:26:53 +0100
Subject: [PATCH 16/62] fix issues in _encode_features_per_attribute

---
 rasa/core/policies/ted_policy.py | 41 +++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index d66d450d2a36..fe81e59862fc 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -780,7 +780,9 @@ def _emebed_dialogue(
             dialogue_transformed = tf.expand_dims(
                 self._last_token(dialogue_transformed, sequence_lengths), 1
             )
-            mask = tf.expand_dims(self._last_token(mask, sequence_lengths), 1)
+            mask = tf.expand_dims(
+                self._last_token(mask, tf.squeeze(sequence_lengths)), 1
+            )
 
         dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed)
 
@@ -821,11 +823,26 @@ def _encode_features_per_attribute(
                 masked_lm_loss=self.config[MASKED_LM],
                 sequence_ids=False,
             )
+
             # TODO entities
-            last_token = self._last_token(
+
+            attribute_features = self._last_token(
                 attribute_features, tf.squeeze(sequence_lengths)
             )
 
+        else:
+            attribute_features = self._combine_sparse_dense_features(
+                tf_batch_data[attribute][SENTENCE],
+                f"{attribute}_{SENTENCE}",
+                mask=attribute_mask,
+            )
+
+        if attribute in FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
+            attribute_features = self._tf_layers[f"ffnn.{attribute}"](
+                attribute_features
+            )
+
+        if attribute in FEATURES_TO_ENCODE:
             # transform attribute features back to original
             # batch x dialogue length x units
             indices = []
@@ -841,27 +858,17 @@ def _encode_features_per_attribute(
                 [
                     dialogue_lengths.shape[0],
                     dialogue_lengths.shape[1],
-                    last_token.shape[-1],
+                    attribute_features.shape[-1],
                 ]
             )
-            attribute_features = tf.scatter_nd(indices, last_token, shape)
+            attribute_features = tf.scatter_nd(
+                indices, tf.squeeze(attribute_features), shape
+            )
 
             attribute_mask = tf.expand_dims(
                 tf.squeeze(self._compute_mask(tf.squeeze(dialogue_lengths))), axis=-1
             )
 
-        else:
-            attribute_features = self._combine_sparse_dense_features(
-                tf_batch_data[attribute][SENTENCE],
-                f"{attribute}_{SENTENCE}",
-                mask=attribute_mask,
-            )
-
-        if attribute in FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
-            attribute_features = self._tf_layers[f"ffnn.{attribute}"](
-                attribute_features
-            )
-
         return attribute_features * attribute_mask
 
     def _process_batch_data(
@@ -940,7 +947,7 @@ def batch_loss(
                 for __v in _v:
                     print("    ", __v.shape)
         # exit()
-        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32)
 
         all_label_ids, all_labels_embed = self._create_all_labels_embed()
 

From 94e0d8191c50d2683885072f64a5a912f2f22af7 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 27 Oct 2020 14:47:29 +0100
Subject: [PATCH 17/62] use correct dialogue length

---
 rasa/core/policies/ted_policy.py        | 19 +++++++++----------
 rasa/nlu/classifiers/diet_classifier.py | 12 ++++++------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index fe81e59862fc..ea019c5dbf24 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -776,13 +776,12 @@ def _emebed_dialogue(
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
         if self.max_history_tracker_featurizer_used:
+            dialogue_lengths = tf.squeeze(tf.reduce_sum(sequence_lengths, axis=1))
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
-                self._last_token(dialogue_transformed, sequence_lengths), 1
-            )
-            mask = tf.expand_dims(
-                self._last_token(mask, tf.squeeze(sequence_lengths)), 1
+                self._last_token(dialogue_transformed, dialogue_lengths), 1
             )
+            mask = tf.expand_dims(self._last_token(mask, dialogue_lengths), 1)
 
         dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed)
 
@@ -940,12 +939,12 @@ def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-        for k, v in tf_batch_data.items():
-            print(k)
-            for _k, _v in v.items():
-                print("  ", _k)
-                for __v in _v:
-                    print("    ", __v.shape)
+        # for k, v in tf_batch_data.items():
+        #     print(k)
+        #     for _k, _v in v.items():
+        #         print("  ", _k)
+        #         for __v in _v:
+        #             print("    ", __v.shape)
         # exit()
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32)
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index b31717a5ae76..481bce505c38 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1434,12 +1434,12 @@ def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-        for k, v in tf_batch_data.items():
-            print(k)
-            for _k, _v in v.items():
-                print("  ", _k)
-                for __v in _v:
-                    print("    ", __v.shape)
+        # for k, v in tf_batch_data.items():
+        #     print(k)
+        #     for _k, _v in v.items():
+        #         print("  ", _k)
+        #         for __v in _v:
+        #             print("    ", __v.shape)
         batch_dim = self._get_batch_dim(tf_batch_data[TEXT])
         mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT, SEQUENCE_LENGTH)
         sequence_lengths = self._get_sequence_lengths(

From 032666241994ea1f2cfef7f004e5fac7847e2243 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 27 Oct 2020 15:02:00 +0100
Subject: [PATCH 18/62] add comments

---
 rasa/core/policies/ted_policy.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index ea019c5dbf24..bb6179955f37 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -825,11 +825,15 @@ def _encode_features_per_attribute(
 
             # TODO entities
 
+            # resulting attribute features will have shape
+            # combined batch dimension and dialogue length x sequence length x units
             attribute_features = self._last_token(
                 attribute_features, tf.squeeze(sequence_lengths)
             )
 
         else:
+            # resulting attribute features will have shape
+            # combined batch dimension and dialogue length x 1 x units
             attribute_features = self._combine_sparse_dense_features(
                 tf_batch_data[attribute][SENTENCE],
                 f"{attribute}_{SENTENCE}",
@@ -864,6 +868,8 @@ def _encode_features_per_attribute(
                 indices, tf.squeeze(attribute_features), shape
             )
 
+            # create a attribute mask that has the shape
+            # batch x dialogue length
             attribute_mask = tf.expand_dims(
                 tf.squeeze(self._compute_mask(tf.squeeze(dialogue_lengths))), axis=-1
             )

From 03cc881d9b09c31cfa29bc8aa192dd805f389aca Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 28 Oct 2020 15:07:33 +0100
Subject: [PATCH 19/62] clean up

---
 rasa/core/policies/ted_policy.py        | 80 ++++++++++++++-----------
 rasa/nlu/classifiers/diet_classifier.py |  7 +--
 rasa/utils/tensorflow/model_data.py     |  8 +--
 rasa/utils/tensorflow/models.py         |  4 +-
 4 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index bb6179955f37..1792ab30966f 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -95,7 +95,7 @@
 LABEL_SUB_KEY = "ids"
 LENGTH = "length"
 POSSIBLE_FEATURE_TYPES = [SEQUENCE, SENTENCE]
-FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT]
+SENTENCE_FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT]
 SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT]
 LABEL_FEATURES_TO_ENCODE = [f"{LABEL}_{ACTION_NAME}", f"{LABEL}_{ACTION_TEXT}"]
 STATE_LEVEL_FEATURES = [ENTITIES, SLOTS, ACTIVE_LOOP]
@@ -350,6 +350,10 @@ def _create_model_data(
         )
         model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
         model_data.add_lengths(ACTION_TEXT, SEQUENCE_LENGTH, ACTION_TEXT, SEQUENCE)
+        # Add the dialogue in 3D, e.g. batch-size x dialogue-length x 1 to have
+        # the actual dialogue length inside the model
+        # (the 4D dialogue length will be converted into
+        # combined batch size and dialogue length x sequence length x 1)
         model_data.data[DIALOGUE][f"3D_{LENGTH}"] = [
             FeatureArray(
                 np.array(
@@ -590,7 +594,7 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
                 for feature_name, features in model_data_example.items()
                 if feature_name
                 # we need to remove label features for prediction if they are present
-                in STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE]
+                in STATE_LEVEL_FEATURES + SENTENCE_FEATURES_TO_ENCODE + [DIALOGUE]
             },
         )
         model.build_for_predict(predict_data_example)
@@ -623,7 +627,8 @@ def __init__(
         self.predict_data_signature = {
             feature_name: features
             for feature_name, features in data_signature.items()
-            if feature_name in STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE]
+            if feature_name
+            in STATE_LEVEL_FEATURES + SENTENCE_FEATURES_TO_ENCODE + [DIALOGUE]
         }
 
         # optimizer
@@ -714,10 +719,13 @@ def _prepare_encoding_layers(self, name: Text) -> None:
         """
         feature_type = SENTENCE
         # create encoding layers only for the features which should be encoded;
-        if name not in FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
+        if name not in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
             return
         # check that there are SENTENCE features for the attribute name in data
-        if name in FEATURES_TO_ENCODE and feature_type not in self.data_signature[name]:
+        if (
+            name in SENTENCE_FEATURES_TO_ENCODE
+            and feature_type not in self.data_signature[name]
+        ):
             return
         #  same for label_data
         if (
@@ -776,6 +784,7 @@ def _emebed_dialogue(
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
         if self.max_history_tracker_featurizer_used:
+            # get the actual dialogue length in a 1D tensor
             dialogue_lengths = tf.squeeze(tf.reduce_sum(sequence_lengths, axis=1))
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
@@ -840,32 +849,18 @@ def _encode_features_per_attribute(
                 mask=attribute_mask,
             )
 
-        if attribute in FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
+        if attribute in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
             attribute_features = self._tf_layers[f"ffnn.{attribute}"](
                 attribute_features
             )
 
-        if attribute in FEATURES_TO_ENCODE:
-            # transform attribute features back to original
-            # batch x dialogue length x units
-            indices = []
+        if attribute in SENTENCE_FEATURES_TO_ENCODE:
             dialogue_lengths = tf.cast(
                 tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
             )
-            for batch_dim in range(dialogue_lengths.shape[0]):
-                for dialogue_dim in range(dialogue_lengths.shape[1]):
-                    if dialogue_lengths[batch_dim][dialogue_dim] > 0:
-                        indices.append([batch_dim, dialogue_dim])
-            indices = tf.constant(indices)
-            shape = tf.constant(
-                [
-                    dialogue_lengths.shape[0],
-                    dialogue_lengths.shape[1],
-                    attribute_features.shape[-1],
-                ]
-            )
-            attribute_features = tf.scatter_nd(
-                indices, tf.squeeze(attribute_features), shape
+
+            attribute_features = self._convert_to_original_shape(
+                attribute_features, dialogue_lengths
             )
 
             # create a attribute mask that has the shape
@@ -876,6 +871,29 @@ def _encode_features_per_attribute(
 
         return attribute_features * attribute_mask
 
+    @staticmethod
+    def _convert_to_original_shape(
+        attribute_features: tf.Tensor, dialogue_lengths: tf.Tensor
+    ) -> tf.Tensor:
+        # transform attribute features back to original shape:
+        # batch x dialogue length x units
+        indices = []
+        for batch_dim in range(dialogue_lengths.shape[0]):
+            for dialogue_dim in range(dialogue_lengths.shape[1]):
+                if dialogue_lengths[batch_dim][dialogue_dim] > 0:
+                    indices.append([batch_dim, dialogue_dim])
+        indices = tf.constant(indices)
+
+        shape = tf.constant(
+            [
+                dialogue_lengths.shape[0],
+                dialogue_lengths.shape[1],
+                attribute_features.shape[-1],
+            ]
+        )
+
+        return tf.scatter_nd(indices, tf.squeeze(attribute_features), shape)
+
     def _process_batch_data(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
     ) -> tf.Tensor:
@@ -945,13 +963,6 @@ def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-        # for k, v in tf_batch_data.items():
-        #     print(k)
-        #     for _k, _v in v.items():
-        #         print("  ", _k)
-        #         for __v in _v:
-        #             print("    ", __v.shape)
-        # exit()
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32)
 
         all_label_ids, all_labels_embed = self._create_all_labels_embed()
@@ -986,17 +997,14 @@ def batch_predict(
             batch_in, self.predict_data_signature
         )
 
-        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-        dialogue_3d_lengths = tf.cast(
-            tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
-        )
+        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32)
 
         if self.all_labels_embed is None:
             _, self.all_labels_embed = self._create_all_labels_embed()
 
         dialogue_in = self._process_batch_data(tf_batch_data)
         dialogue_embed, dialogue_mask = self._emebed_dialogue(
-            dialogue_in, dialogue_lengths, dialogue_3d_lengths
+            dialogue_in, dialogue_lengths
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index 481bce505c38..22197e83943e 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1434,12 +1434,7 @@ def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
-        # for k, v in tf_batch_data.items():
-        #     print(k)
-        #     for _k, _v in v.items():
-        #         print("  ", _k)
-        #         for __v in _v:
-        #             print("    ", __v.shape)
+
         batch_dim = self._get_batch_dim(tf_batch_data[TEXT])
         mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT, SEQUENCE_LENGTH)
         sequence_lengths = self._get_sequence_lengths(
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index a72abb3831dc..dd6f11db3dfa 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -1077,8 +1077,8 @@ def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray:
         # so that the resulting tensor is 3D
         # the shape is (sum of dialogue history length for all tensors in the
         # batch x max sequence length x number of features)
-        # the original shape is passed on the model via the data signature, the
-        # original shape can be used to transform the 3D tensor back into 4D
+        # the original shape and the original dialogue length is passed on to the model
+        # it can be used to transform the 3D tensor back into 4D
 
         sum_dialogue_len = sum(
             len(array_of_dense) for array_of_dense in array_of_array_of_dense
@@ -1154,8 +1154,8 @@ def _4d_scipy_matrix_to_values(
         # so that the resulting tensor is 3D
         # the shape is (sum of dialogue history length for all tensors in the
         # batch x max sequence length x number of features)
-        # the original shape is passed on the model via the data signature, the
-        # original shape can be used to transform the 3D tensor back into 4D
+        # the original shape and the original dialogue length is passed on to the model
+        # it can be used to transform the 3D tensor back into 4D
 
         # we need to make sure that the matrices are coo_matrices otherwise the
         # transformation does not work (e.g. you cannot access x.row, x.col)
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 0b49b739a453..0938b8710c2f 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -160,7 +160,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = True,
+        eager: bool = False,
     ) -> None:
         """Fit model data"""
 
@@ -292,7 +292,7 @@ def train_on_batch(
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
     def build_for_predict(
-        self, predict_data: RasaModelData, eager: bool = True
+        self, predict_data: RasaModelData, eager: bool = False
     ) -> None:
         self._training = False  # needed for tf graph mode
         self._predict_function = self._get_tf_call_model_function(

From 2cb13f52e32cbb23d3ff65e060e2c3f218934bb0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 28 Oct 2020 15:26:33 +0100
Subject: [PATCH 20/62] update constants

---
 rasa/core/policies/ted_policy.py   | 19 ++++++++++++-------
 rasa/utils/tensorflow/constants.py |  1 -
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index bf01c050af30..e61898782d61 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -17,7 +17,15 @@
     MaxHistoryTrackerFeaturizer,
 )
 from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
-from rasa.shared.nlu.constants import ACTION_TEXT, ACTION_NAME, INTENT, TEXT, ENTITIES
+from rasa.shared.nlu.constants import (
+    ACTION_TEXT,
+    ACTION_NAME,
+    INTENT,
+    TEXT,
+    ENTITIES,
+    VALID_FEATURE_TYPES,
+    FEATURE_TYPE_SENTENCE,
+)
 from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter
 from rasa.core.policies.policy import Policy
 from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE
@@ -71,7 +79,6 @@
     ENCODING_DIMENSION,
     UNIDIRECTIONAL_ENCODER,
     SEQUENCE,
-    SEQUENCE_LENGTH,
     SENTENCE,
     SEQUENCE_LENGTH,
     DENSE_DIMENSION,
@@ -95,7 +102,6 @@
 LABEL_KEY = LABEL
 LABEL_SUB_KEY = "ids"
 LENGTH = "length"
-POSSIBLE_FEATURE_TYPES = [SEQUENCE, SENTENCE]
 SENTENCE_FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT]
 SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT]
 LABEL_FEATURES_TO_ENCODE = [f"{LABEL}_{ACTION_NAME}", f"{LABEL}_{ACTION_TEXT}"]
@@ -702,7 +708,7 @@ def _prepare_sparse_dense_layer_for(
             name: the attribute name
             signature: data signature
         """
-        for feature_type in POSSIBLE_FEATURE_TYPES:
+        for feature_type in VALID_FEATURE_TYPES:
             if name not in signature or feature_type not in signature[name]:
                 # features for feature type are not present
                 continue
@@ -725,20 +731,19 @@ def _prepare_encoding_layers(self, name: Text) -> None:
         Args:
             name: attribute name
         """
-        feature_type = SENTENCE
         # create encoding layers only for the features which should be encoded;
         if name not in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
             return
         # check that there are SENTENCE features for the attribute name in data
         if (
             name in SENTENCE_FEATURES_TO_ENCODE
-            and feature_type not in self.data_signature[name]
+            and FEATURE_TYPE_SENTENCE not in self.data_signature[name]
         ):
             return
         #  same for label_data
         if (
             name in LABEL_FEATURES_TO_ENCODE
-            and feature_type not in self.label_signature[name]
+            and FEATURE_TYPE_SENTENCE not in self.label_signature[name]
         ):
             return
 
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 80497fa261d5..06f81775a673 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -69,7 +69,6 @@
 SEQUENCE = "sequence"
 SEQUENCE_LENGTH = f"{SEQUENCE}_lengths"
 SENTENCE = "sentence"
-SEQUENCE_LENGTH = f"{SEQUENCE}_lengths"
 
 POOLING = "pooling"
 MAX_POOLING = "max"

From 4d2b5a18f92c5c4f3d54e9d38d4926a02ff6aa43 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 28 Oct 2020 16:34:51 +0100
Subject: [PATCH 21/62] review comment

---
 rasa/core/featurizers/single_state_featurizer.py | 2 +-
 rasa/core/policies/ted_policy.py                 | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 6831bca01efa..87779199ff4a 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -87,7 +87,7 @@ def _create_features(
 
         features = np.zeros(len(self._default_feature_states[attribute]), np.float32)
         for state_feature, value in state_features.items():
-            # check that the value is in default_feature_states to be able to assigh
+            # check that the value is in default_feature_states to be able to assign
             # its value
             if state_feature in self._default_feature_states[attribute]:
                 features[self._default_feature_states[attribute][state_feature]] = value
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index e61898782d61..1869abfc7a42 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -862,12 +862,16 @@ def _encode_features_per_attribute(
                 mask=attribute_mask,
             )
 
-        if attribute in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
+        if attribute in set(
+            SENTENCE_FEATURES_TO_ENCODE
+            + SEQUENCE_FEATURES_TO_ENCODE
+            + LABEL_FEATURES_TO_ENCODE
+        ):
             attribute_features = self._tf_layers[f"ffnn.{attribute}"](
                 attribute_features
             )
 
-        if attribute in SENTENCE_FEATURES_TO_ENCODE:
+        if attribute in set(SENTENCE_FEATURES_TO_ENCODE + SEQUENCE_FEATURES_TO_ENCODE):
             dialogue_lengths = tf.cast(
                 tf_batch_data[DIALOGUE][f"3D_{LENGTH}"][0], tf.int32
             )

From 6525b8e7b4c6db7d01c5a4b4e8c4ee0d58476f8e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 28 Oct 2020 17:33:27 +0100
Subject: [PATCH 22/62] keep entity dict

---
 .../featurizers/single_state_featurizer.py    |  1 +
 rasa/shared/core/domain.py                    | 10 +++++++--
 rasa/shared/core/events.py                    |  2 +-
 rasa/shared/core/trackers.py                  | 22 +++++++++++--------
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 87779199ff4a..ce12467e2682 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -207,6 +207,7 @@ def encode_state(
                     self._extract_state_features(sub_state, interpreter, sparse=True)
                 )
                 if sub_state.get(ENTITIES):
+                    # TODO entities is a frozenset
                     state_features[ENTITIES] = self._create_features(
                         sub_state, ENTITIES, sparse=True
                     )
diff --git a/rasa/shared/core/domain.py b/rasa/shared/core/domain.py
index b14a046dda54..cb6a61367756 100644
--- a/rasa/shared/core/domain.py
+++ b/rasa/shared/core/domain.py
@@ -676,7 +676,9 @@ def input_states(self) -> List[Text]:
             + self.form_names
         )
 
-    def _get_featurized_entities(self, latest_message: UserUttered) -> Set[Text]:
+    def _get_featurized_entities(
+        self, latest_message: UserUttered
+    ) -> List[Dict[Text, Any]]:
         intent_name = latest_message.intent.get(
             rasa.shared.nlu.constants.INTENT_NAME_KEY
         )
@@ -688,7 +690,11 @@ def _get_featurized_entities(self, latest_message: UserUttered) -> Set[Text]:
 
         wanted_entities = set(intent_config.get(USED_ENTITIES_KEY, entity_names))
 
-        return entity_names.intersection(wanted_entities)
+        return [
+            entity
+            for entity in latest_message.entities
+            if entity["entity"] in wanted_entities
+        ]
 
     def _get_user_sub_state(
         self, tracker: "DialogueStateTracker"
diff --git a/rasa/shared/core/events.py b/rasa/shared/core/events.py
index fdc2b4fbd690..5ad058bcf124 100644
--- a/rasa/shared/core/events.py
+++ b/rasa/shared/core/events.py
@@ -355,7 +355,7 @@ def as_sub_state(self) -> Dict[Text, Union[None, Text, List[Optional[Text]]]]:
         if self.intent_name and not self.use_text_for_featurization:
             out[INTENT] = self.intent_name
         if entities:
-            out[ENTITIES] = entities
+            out[ENTITIES] = self.entities
 
         return out
 
diff --git a/rasa/shared/core/trackers.py b/rasa/shared/core/trackers.py
index 296f07be5385..60dbdecc3902 100644
--- a/rasa/shared/core/trackers.py
+++ b/rasa/shared/core/trackers.py
@@ -229,15 +229,19 @@ def _events_for_verbosity(
 
     @staticmethod
     def freeze_current_state(state: State) -> FrozenState:
-        frozen_state = frozenset(
-            {
-                key: frozenset(values.items())
-                if isinstance(values, Dict)
-                else frozenset(values)
-                for key, values in state.items()
-            }.items()
-        )
-        return frozen_state
+        state_copy = copy.deepcopy(state)
+        frozen_state = {}
+        for key, values in state_copy.items():
+            if isinstance(values, dict):
+                if "entities" in values and isinstance(values["entities"][0], dict):
+                    values["entities"] = tuple(
+                        [frozenset(e.items()) for e in values["entities"]]
+                    )
+                frozen_state[key] = frozenset(values.items())
+            else:
+                frozen_state[key] = frozenset(values)
+
+        return frozenset(frozen_state.items())
 
     def past_states(self, domain: Domain) -> List[State]:
         """Generate the past states of this tracker based on the history.

From f4aec125a11d80d92c99fd9cc61654e62ddb664d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Wed, 28 Oct 2020 18:05:20 +0100
Subject: [PATCH 23/62] create tag_ids for TED

---
 .../featurizers/single_state_featurizer.py    | 33 +++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index ce12467e2682..53fb48211044 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -5,6 +5,7 @@
 from collections import defaultdict
 
 import rasa.shared.utils.io
+from nlu.constants import TOKENS_NAMES
 from rasa.shared.core.domain import SubState, State, Domain
 from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter
 from rasa.shared.core.constants import PREVIOUS_ACTION, ACTIVE_LOOP, USER, SLOTS
@@ -16,9 +17,12 @@
     ACTION_TEXT,
     ACTION_NAME,
     INTENT,
+    FEATURE_TYPE_SEQUENCE,
+    TEXT,
 )
 from rasa.shared.nlu.training_data.features import Features
 from rasa.shared.nlu.training_data.message import Message
+from utils.tensorflow.model_data_utils import TAG_ID_ORIGIN
 
 logger = logging.getLogger(__name__)
 
@@ -101,6 +105,32 @@ def _create_features(
         )
         return [features]
 
+    def _create_entity_tag_features(
+        self, sub_state: SubState, interpreter: NaturalLanguageInterpreter
+    ) -> List["Features"]:
+        from rasa.nlu.test import determine_token_labels
+
+        # TODO what about roles and groups
+
+        parsed_text = interpreter.featurize_message(Message({TEXT: sub_state[TEXT]}))
+        entities = [dict(entity) for entity in sub_state[ENTITIES]]
+
+        _tags = []
+        for token in parsed_text.get(TOKENS_NAMES[TEXT]):
+            _tag = determine_token_labels(token, entities, attribute_key="entity")
+            if _tag in self._default_feature_states[ENTITIES]:
+                # +1 to keep the 0 for the NO ENTITY TAG
+                _tags.append(self._default_feature_states[ENTITIES][_tag] + 1)
+            else:
+                _tags.append(0)
+
+        # transpose to have seq_len x 1
+        return [
+            Features(
+                np.array([_tags]).T, FEATURE_TYPE_SEQUENCE, "entity", TAG_ID_ORIGIN
+            )
+        ]
+
     @staticmethod
     def _to_sparse_sentence_features(
         sparse_sequence_features: List["Features"],
@@ -207,10 +237,9 @@ def encode_state(
                     self._extract_state_features(sub_state, interpreter, sparse=True)
                 )
                 if sub_state.get(ENTITIES):
-                    # TODO entities is a frozenset
                     state_features[ENTITIES] = self._create_features(
                         sub_state, ENTITIES, sparse=True
-                    )
+                    ) + self._create_entity_tag_features(sub_state, interpreter)
 
             if state_type in {SLOTS, ACTIVE_LOOP}:
                 state_features[state_type] = self._create_features(

From 2fd1c5208afca37365998e98e05500d077ce6101 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 3 Nov 2020 15:40:44 +0100
Subject: [PATCH 24/62] clean up after merge

---
 rasa/core/policies/ted_policy.py        | 3 ---
 rasa/nlu/classifiers/diet_classifier.py | 1 -
 2 files changed, 4 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 169586a9306b..3eaa9399bdb5 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -787,7 +787,6 @@ def _emebed_dialogue(
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         """Create dialogue level embedding and mask."""
-
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
         mask = self._compute_mask(dialogue_lengths)
 
@@ -797,8 +796,6 @@ def _emebed_dialogue(
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
         if self.max_history_tracker_featurizer_used:
-            # get the actual dialogue length in a 1D tensor
-            dialogue_lengths = tf.squeeze(tf.reduce_sum(dialogue_lengths, axis=1))
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
                 self._last_token(dialogue_transformed, dialogue_lengths), 1
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index af864175dc13..fe0890c8a66c 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -89,7 +89,6 @@
     FEATURIZERS,
     CHECKPOINT_MODEL,
     SEQUENCE,
-    SEQUENCE_LENGTH,
     SENTENCE,
     SEQUENCE_LENGTH,
     DENSE_DIMENSION,

From 62d8bab9615372cef7199d73840b00bb2442dbe8 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 09:56:51 +0100
Subject: [PATCH 25/62] add batch_loss_entities (not working)

---
 rasa/core/policies/ted_policy.py          | 85 ++++++++++++++++++++++-
 rasa/utils/tensorflow/model_data_utils.py |  2 +-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 3eaa9399bdb5..eee732fb4f76 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -27,6 +27,7 @@
     ENTITIES,
     VALID_FEATURE_TYPES,
     FEATURE_TYPE_SENTENCE,
+    ENTITY_ATTRIBUTE_TYPE,
 )
 from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter
 from rasa.core.policies.policy import Policy
@@ -656,6 +657,9 @@ def __init__(
 
         self._prepare_layers()
 
+        self.text_seq_transformer_output: Optional[tf.Tensor] = None
+        self.dialogue_transformer_output: Optional[tf.Tensor] = None
+
     def _check_data(self) -> None:
         if not any(key in [INTENT, TEXT] for key in self.data_signature.keys()):
             raise ValueError(
@@ -795,6 +799,8 @@ def _emebed_dialogue(
         )
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
+        self.dialogue_transformer_output = dialogue_transformed
+
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
@@ -844,7 +850,8 @@ def _encode_features_per_attribute(
                 sequence_ids=False,
             )
 
-            # TODO entities
+            if attribute == TEXT:
+                self.text_seq_transformer_output = attribute_features
 
             # resulting attribute features will have shape
             # combined batch dimension and dialogue length x 1 x units
@@ -890,6 +897,76 @@ def _encode_features_per_attribute(
 
         return attribute_features
 
+    def _batch_loss_entities(
+        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
+    ) -> List[tf.Tensor]:
+        _sequence_lengths = tf.cast(
+            tf_batch_data[TEXT][SEQUENCE_LENGTH][0], dtype=tf.int32
+        )
+        _sequence_lengths = tf.squeeze(_sequence_lengths, axis=-1)
+        sequence_lengths = _sequence_lengths + 1
+        mask_text = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
+
+        sequence_lengths -= 1  # remove sentence features
+
+        entity_tags = None
+
+        if ENTITY_ATTRIBUTE_TYPE not in tf_batch_data.get(ENTITIES, {}):
+            return []
+
+        # text_seq_transformer: 1260 x  5 x 128 -> 64 x 28 x 5 x 128
+        # dialogue_transformer:   64 x 28 x 128 -> 64 x 28 x 1 x 128
+        # tag_ids:              1260 x  5 x 1   -> 64 x 28 x 5 x 1
+        # sequence_length:      1260 x  1       -> 64 x 28 x 1
+        # mask:                 1260 x  5 x 1   -> 64 x 28 x 5 x 1
+
+        text_transformed = tf.concat(
+            [self.text_seq_transformer_output, self.dialogue_transformer_output]
+        )
+
+        tag_ids = tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]
+        # add a zero (no entity) for the sentence features to match the shape of
+        # inputs
+        tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
+
+        loss, f1, _logits = self._calculate_entity_loss(
+            text_transformed,
+            tag_ids,
+            mask_text,
+            sequence_lengths,
+            ENTITY_ATTRIBUTE_TYPE,
+            entity_tags,
+        )
+
+        return [loss]
+
+    def _calculate_entity_loss(
+        self,
+        inputs: tf.Tensor,
+        tag_ids: tf.Tensor,
+        mask: tf.Tensor,
+        sequence_lengths: tf.Tensor,
+        tag_name: Text,
+        entity_tags: Optional[tf.Tensor] = None,
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+
+        tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32)
+
+        if entity_tags is not None:
+            _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags)
+            inputs = tf.concat([inputs, _tags], axis=-1)
+
+        logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs)
+
+        # should call first to build weights
+        pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths)
+        loss = self._tf_layers[f"crf.{tag_name}"].loss(
+            logits, tag_ids, sequence_lengths
+        )
+        f1 = self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask)
+
+        return loss, f1, logits
+
     @staticmethod
     def _convert_to_original_shape(
         attribute_features: tf.Tensor,
@@ -1036,6 +1113,12 @@ def batch_loss(
             dialogue_mask,
         )
 
+        if (
+            self.dialogue_transformer_output is not None
+            and self.text_seq_transformer_output is not None
+        ):
+            self._batch_loss_entities(tf_batch_data)
+
         self.action_loss.update_state(loss)
         self.action_acc.update_state(acc)
 
diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py
index ffdf1c21d67b..0f70f5464c77 100644
--- a/rasa/utils/tensorflow/model_data_utils.py
+++ b/rasa/utils/tensorflow/model_data_utils.py
@@ -260,7 +260,7 @@ def convert_to_data_format(
     num_examples = 1
     for _features in attribute_to_features.values():
         num_examples = max(num_examples, len(_features))
-        dialogue_length = max(dialogue_length, len(_features[0]))
+        dialogue_length = max(dialogue_length, max(len(f) for f in _features))
     empty_features = [[None] * dialogue_length] * num_examples
 
     for attribute in attributes:

From e50f4eb97d1ae22d7a47bd1e08b19039118a518c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 12:45:53 +0100
Subject: [PATCH 26/62] concatenate text and dialogue transformer output

---
 .../featurizers/single_state_featurizer.py    |  35 +++-
 rasa/core/policies/ted_policy.py              | 171 ++++++++++++------
 rasa/nlu/classifiers/diet_classifier.py       |  43 -----
 rasa/utils/tensorflow/model_data_utils.py     |   4 +-
 rasa/utils/tensorflow/models.py               |  48 ++++-
 5 files changed, 192 insertions(+), 109 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 3a653fade973..693134343a69 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -5,7 +5,7 @@
 from collections import defaultdict
 
 import rasa.shared.utils.io
-from nlu.constants import TOKENS_NAMES
+from rasa.nlu.constants import TOKENS_NAMES
 from rasa.shared.core.domain import SubState, State, Domain
 from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter
 from rasa.shared.core.constants import PREVIOUS_ACTION, ACTIVE_LOOP, USER, SLOTS
@@ -19,10 +19,11 @@
     INTENT,
     FEATURE_TYPE_SEQUENCE,
     TEXT,
+    NO_ENTITY_TAG,
 )
 from rasa.shared.nlu.training_data.features import Features
 from rasa.shared.nlu.training_data.message import Message
-from utils.tensorflow.model_data_utils import TAG_ID_ORIGIN
+from rasa.utils.tensorflow.model_data_utils import TAG_ID_ORIGIN
 
 logger = logging.getLogger(__name__)
 
@@ -102,24 +103,42 @@ def _create_features(
         )
         return [features]
 
+    def get_entity_tag_ids(self) -> Dict[Text, int]:
+        """Returns the tag to index mapping for entities.
+
+        Returns:
+            Tag to index mapping.
+        """
+        if ENTITIES not in self._default_feature_states:
+            return {}
+
+        tag_ids = {
+            tag: idx + 1  # +1 to keep 0 for the NO_ENTITY_TAG
+            for tag, idx in self._default_feature_states[ENTITIES].items()
+        }
+        tag_ids[NO_ENTITY_TAG] = 0
+        return tag_ids
+
     def _create_entity_tag_features(
         self, sub_state: SubState, interpreter: NaturalLanguageInterpreter
     ) -> List["Features"]:
         from rasa.nlu.test import determine_token_labels
 
-        # TODO what about roles and groups
+        # TODO
+        #  The entity states used to create the tag-idx-mapping contains the
+        #  entities and the concatenated entity and roles/groups. We do not
+        #  distinguish between entities and roles/groups right now.
+        # TODO
+        #  Should we support BILOU tagging?
 
         parsed_text = interpreter.featurize_message(Message({TEXT: sub_state[TEXT]}))
         entities = [dict(entity) for entity in sub_state[ENTITIES]]
+        tag_id_mapping = self.get_entity_tag_ids()
 
         _tags = []
         for token in parsed_text.get(TOKENS_NAMES[TEXT]):
             _tag = determine_token_labels(token, entities, attribute_key="entity")
-            if _tag in self._default_feature_states[ENTITIES]:
-                # +1 to keep the 0 for the NO ENTITY TAG
-                _tags.append(self._default_feature_states[ENTITIES][_tag] + 1)
-            else:
-                _tags.append(0)
+            _tags.append(tag_id_mapping[_tag])
 
         # transpose to have seq_len x 1
         return [
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index eee732fb4f76..b1062beb7484 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -12,6 +12,7 @@
 from typing import Any, List, Optional, Text, Dict, Tuple, Union, TYPE_CHECKING
 
 import rasa.utils.io as io_utils
+from nlu.classifiers.diet_classifier import EntityTagSpec
 from rasa.shared.core.domain import Domain
 from rasa.core.featurizers.tracker_featurizers import (
     TrackerFeaturizer,
@@ -267,6 +268,7 @@ def __init__(
         max_history: Optional[int] = None,
         model: Optional[RasaModel] = None,
         zero_state_features: Optional[Dict[Text, List["Features"]]] = None,
+        entity_tag_specs: Optional[List[EntityTagSpec]] = None,
         **kwargs: Any,
     ) -> None:
         """Declare instance variables with default values."""
@@ -284,6 +286,8 @@ def __init__(
 
         self.model = model
 
+        self._entity_tag_specs = entity_tag_specs
+
         self.zero_state_features = zero_state_features or defaultdict(list)
 
         self._label_data: Optional[RasaModelData] = None
@@ -298,6 +302,28 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
         self.config = train_utils.update_similarity_type(self.config)
         self.config = train_utils.update_evaluation_parameters(self.config)
 
+    def _create_entity_tag_specs(self) -> List[EntityTagSpec]:
+        """Create entity tag specifications with their respective tag id mappings."""
+
+        _tag_specs = []
+
+        # TODO
+        tag_id_index_mapping = {"O": 0, "emotion": 1}
+
+        if tag_id_index_mapping:
+            _tag_specs.append(
+                EntityTagSpec(
+                    tag_name=ENTITY_ATTRIBUTE_TYPE,
+                    tags_to_ids=tag_id_index_mapping,
+                    ids_to_tags={
+                        value: key for key, value in tag_id_index_mapping.items()
+                    },
+                    num_tags=len(tag_id_index_mapping),
+                )
+            )
+
+        return _tag_specs
+
     def _create_label_data(
         self, domain: Domain, interpreter: NaturalLanguageInterpreter
     ) -> Tuple[RasaModelData, List[Dict[Text, List["Features"]]]]:
@@ -418,6 +444,8 @@ def train(
             )
             return
 
+        self._entity_tag_specs = self._create_entity_tag_specs()
+
         # keep one example for persisting and loading
         self.data_example = model_data.first_data_example()
 
@@ -426,6 +454,7 @@ def train(
             self.config,
             isinstance(self.featurizer, MaxHistoryTrackerFeaturizer),
             self._label_data,
+            self._entity_tag_specs,
         )
 
         self.model.fit(
@@ -551,6 +580,16 @@ def persist(self, path: Union[Text, Path]) -> None:
             dict(self._label_data.data),
         )
 
+        entity_tag_specs = (
+            [tag_spec._asdict() for tag_spec in self._entity_tag_specs]
+            if self._entity_tag_specs
+            else []
+        )
+        rasa.shared.utils.io.dump_obj_as_json_to_file(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.entity_tag_specs.json",
+            entity_tag_specs,
+        )
+
     @classmethod
     def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
         """Loads a policy from the storage.
@@ -585,6 +624,22 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
         priority = io_utils.json_unpickle(
             model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl"
         )
+        entity_tag_specs = rasa.shared.utils.io.read_json_file(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.entity_tag_specs.json"
+        )
+        entity_tag_specs = [
+            EntityTagSpec(
+                tag_name=tag_spec["tag_name"],
+                ids_to_tags={
+                    int(key): value for key, value in tag_spec["ids_to_tags"].items()
+                },
+                tags_to_ids={
+                    key: int(value) for key, value in tag_spec["tags_to_ids"].items()
+                },
+                num_tags=tag_spec["num_tags"],
+            )
+            for tag_spec in entity_tag_specs
+        ]
 
         model_data_example = RasaModelData(
             label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data
@@ -600,6 +655,7 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
                 featurizer, MaxHistoryTrackerFeaturizer
             ),
             label_data=label_data,
+            entity_tag_specs=entity_tag_specs,
         )
 
         # build the graph for prediction
@@ -621,6 +677,7 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
             priority=priority,
             model=model,
             zero_state_features=zero_state_features,
+            entity_tag_specs=entity_tag_specs,
             **meta,
         )
 
@@ -632,6 +689,7 @@ def __init__(
         config: Dict[Text, Any],
         max_history_tracker_featurizer_used: bool,
         label_data: RasaModelData,
+        entity_tag_specs: Optional[List[EntityTagSpec]],
     ) -> None:
         super().__init__("TED", config, data_signature, label_data)
 
@@ -644,6 +702,8 @@ def __init__(
             in STATE_LEVEL_FEATURES + SENTENCE_FEATURES_TO_ENCODE + [DIALOGUE]
         }
 
+        self._entity_tag_specs = entity_tag_specs
+
         # optimizer
         self.optimizer = tf.keras.optimizers.Adam()
 
@@ -699,6 +759,7 @@ def _prepare_layers(self) -> None:
         self._prepare_embed_layers(LABEL)
 
         self._prepare_dot_product_loss(LABEL, self.config[SCALE_LOSS])
+        self._prepare_entity_recognition_layers()
 
     def _prepare_sparse_dense_layer_for(
         self, name: Text, signature: Dict[Text, Dict[Text, List[FeatureSignature]]]
@@ -900,77 +961,71 @@ def _encode_features_per_attribute(
     def _batch_loss_entities(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
     ) -> List[tf.Tensor]:
-        _sequence_lengths = tf.cast(
-            tf_batch_data[TEXT][SEQUENCE_LENGTH][0], dtype=tf.int32
-        )
-        _sequence_lengths = tf.squeeze(_sequence_lengths, axis=-1)
-        sequence_lengths = _sequence_lengths + 1
-        mask_text = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
-
-        sequence_lengths -= 1  # remove sentence features
-
-        entity_tags = None
-
         if ENTITY_ATTRIBUTE_TYPE not in tf_batch_data.get(ENTITIES, {}):
             return []
 
-        # text_seq_transformer: 1260 x  5 x 128 -> 64 x 28 x 5 x 128
-        # dialogue_transformer:   64 x 28 x 128 -> 64 x 28 x 1 x 128
-        # tag_ids:              1260 x  5 x 1   -> 64 x 28 x 5 x 1
-        # sequence_length:      1260 x  1       -> 64 x 28 x 1
-        # mask:                 1260 x  5 x 1   -> 64 x 28 x 5 x 1
+        sequence_lengths = tf.cast(
+            tf_batch_data[TEXT][SEQUENCE_LENGTH][0], dtype=tf.int32
+        )
+        sequence_lengths = tf.squeeze(sequence_lengths, axis=-1)
+        sequence_lengths += 1  # add sentence features
+        mask = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
+        sequence_lengths -= 1  # remove sentence features
 
-        text_transformed = tf.concat(
-            [self.text_seq_transformer_output, self.dialogue_transformer_output]
+        # convert from (combined batch and dialogue dimension x 1) to
+        # (batch-dim x dialogue length x 1)
+        sequence_lengths = tf.squeeze(
+            self._convert_to_original_shape(
+                tf.expand_dims(sequence_lengths, axis=-1), tf_batch_data, False
+            ),
+            axis=-1,
         )
+        # convert from (combined batch and dialogue dimension x sequence length x 1) to
+        # (batch-dim x dialogue length x sequence length x 1)
+        mask = self._convert_to_original_shape(mask, tf_batch_data, False)
 
         tag_ids = tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]
         # add a zero (no entity) for the sentence features to match the shape of
         # inputs
         tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
+        # convert from (combined batch and dialogue dimension x sequence length x 1) to
+        # (batch-dim x dialogue length x sequence length x 1)
+        tag_ids = self._convert_to_original_shape(tag_ids, tf_batch_data, False)
+
+        # convert from (combined batch and dialogue dimension x sequence length x units)
+        # to (batch-dim x dialogue length x sequence length x units)
+        text_seq_transformer_output = self._convert_to_original_shape(
+            self.text_seq_transformer_output, tf_batch_data, False
+        )
 
-        loss, f1, _logits = self._calculate_entity_loss(
-            text_transformed,
-            tag_ids,
-            mask_text,
-            sequence_lengths,
-            ENTITY_ATTRIBUTE_TYPE,
-            entity_tags,
+        # repeat the dialogue transformer output sequence-length-times to get the
+        # same shape as the text sequence transformer output
+        dialogue_transformer_output = tf.repeat(
+            tf.expand_dims(self.dialogue_transformer_output, axis=2),
+            text_seq_transformer_output.shape[2],
+            axis=2,
+        )
+        # add the output of the dialogue transformer to the output of the text
+        # sequence transformer (adding context)
+        text_transformed = tf.add(
+            text_seq_transformer_output, dialogue_transformer_output
         )
 
-        return [loss]
+        # TODO get last dialogue if max history
+        # check if this should happen before concat due to performance
+        # TODO CRF is currently failing, is it not compatible with 4D?
 
-    def _calculate_entity_loss(
-        self,
-        inputs: tf.Tensor,
-        tag_ids: tf.Tensor,
-        mask: tf.Tensor,
-        sequence_lengths: tf.Tensor,
-        tag_name: Text,
-        entity_tags: Optional[tf.Tensor] = None,
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32)
-
-        if entity_tags is not None:
-            _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags)
-            inputs = tf.concat([inputs, _tags], axis=-1)
-
-        logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs)
-
-        # should call first to build weights
-        pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths)
-        loss = self._tf_layers[f"crf.{tag_name}"].loss(
-            logits, tag_ids, sequence_lengths
+        loss, f1, _logits = self._calculate_entity_loss(
+            text_transformed, tag_ids, mask, sequence_lengths, ENTITY_ATTRIBUTE_TYPE
         )
-        f1 = self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask)
 
-        return loss, f1, logits
+        return [loss]
 
     @staticmethod
     def _convert_to_original_shape(
         attribute_features: tf.Tensor,
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        squeeze_sequence_dimension: bool = True,
     ) -> tf.Tensor:
         """Transform attribute features back to original shape.
 
@@ -998,6 +1053,7 @@ def _convert_to_original_shape(
 
         batch_dim = tf.size(dialogue_lengths)
         dialogue_dim = tf.reduce_max(dialogue_lengths)
+        sequence_dim = attribute_features.shape[-2]
         units = attribute_features.shape[-1]
 
         batch_indices = tf.repeat(tf.range(batch_dim), dialogue_lengths)
@@ -1010,9 +1066,13 @@ def _convert_to_original_shape(
         ).values
         indices = tf.stack([batch_indices, dialogue_indices], axis=1)
 
-        shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
+        if squeeze_sequence_dimension:
+            attribute_features = tf.squeeze(attribute_features, axis=1)
+            shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
+        else:
+            shape = tf.convert_to_tensor([batch_dim, dialogue_dim, sequence_dim, units])
 
-        return tf.scatter_nd(indices, tf.squeeze(attribute_features, axis=1), shape)
+        return tf.scatter_nd(indices, attribute_features, shape)
 
     def _process_batch_data(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
@@ -1104,6 +1164,8 @@ def batch_loss(
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
+        losses = []
+
         loss, acc = self._tf_layers[f"loss.{LABEL}"](
             dialogue_embed,
             labels_embed,
@@ -1112,17 +1174,18 @@ def batch_loss(
             all_label_ids,
             dialogue_mask,
         )
+        losses.append(loss)
 
         if (
             self.dialogue_transformer_output is not None
             and self.text_seq_transformer_output is not None
         ):
-            self._batch_loss_entities(tf_batch_data)
+            losses.extend(self._batch_loss_entities(tf_batch_data))
 
         self.action_loss.update_state(loss)
         self.action_acc.update_state(acc)
 
-        return loss
+        return tf.math.add_n(losses)
 
     def batch_predict(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index fe0890c8a66c..9cbb4e36a21b 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -1309,22 +1309,6 @@ def _prepare_label_classification_layers(self) -> None:
 
         self._prepare_dot_product_loss(LABEL, self.config[SCALE_LOSS])
 
-    def _prepare_entity_recognition_layers(self) -> None:
-        for tag_spec in self._entity_tag_specs:
-            name = tag_spec.tag_name
-            num_tags = tag_spec.num_tags
-            self._tf_layers[f"embed.{name}.logits"] = layers.Embed(
-                num_tags, self.config[REGULARIZATION_CONSTANT], f"logits.{name}"
-            )
-            self._tf_layers[f"crf.{name}"] = layers.CRF(
-                num_tags, self.config[REGULARIZATION_CONSTANT], self.config[SCALE_LOSS]
-            )
-            self._tf_layers[f"embed.{name}.tags"] = layers.Embed(
-                self.config[EMBEDDING_DIMENSION],
-                self.config[REGULARIZATION_CONSTANT],
-                f"tags.{name}",
-            )
-
     def _create_bow(
         self,
         sequence_features: List[Union[tf.Tensor, tf.SparseTensor]],
@@ -1406,33 +1390,6 @@ def _calculate_label_loss(
             text_embed, label_embed, label_ids, all_labels_embed, all_label_ids
         )
 
-    def _calculate_entity_loss(
-        self,
-        inputs: tf.Tensor,
-        tag_ids: tf.Tensor,
-        mask: tf.Tensor,
-        sequence_lengths: tf.Tensor,
-        tag_name: Text,
-        entity_tags: Optional[tf.Tensor] = None,
-    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
-
-        tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32)
-
-        if entity_tags is not None:
-            _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags)
-            inputs = tf.concat([inputs, _tags], axis=-1)
-
-        logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs)
-
-        # should call first to build weights
-        pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths)
-        loss = self._tf_layers[f"crf.{tag_name}"].loss(
-            logits, tag_ids, sequence_lengths
-        )
-        f1 = self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask)
-
-        return loss, f1, logits
-
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:
diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py
index 0f70f5464c77..ad6af067814e 100644
--- a/rasa/utils/tensorflow/model_data_utils.py
+++ b/rasa/utils/tensorflow/model_data_utils.py
@@ -295,8 +295,8 @@ def _features_for_attribute(
         attribute_to_features: features for every example
         training: boolean indicating whether we are currently in training or not
         zero_features: zero features
-        consider_dialogue_dimension: If set to false the dialogue dimension will be removed from the resulting sequence
-            features.
+        consider_dialogue_dimension: If set to false the dialogue dimension will be
+          removed from the resulting sequence features.
 
     Returns:
         A dictionary of feature type to actual features for the given attribute.
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 0b492058db8b..4d446e16e226 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -54,6 +54,7 @@
     DENSE_DIMENSION,
     CONCAT_DIMENSION,
     DROP_RATE_ATTENTION,
+    SCALE_LOSS,
 )
 from rasa.utils.tensorflow import layers
 from rasa.utils.tensorflow.transformer import TransformerEncoder
@@ -176,7 +177,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = False,
+        eager: bool = True,
     ) -> None:
         """Fit model data"""
 
@@ -308,7 +309,7 @@ def train_on_batch(
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
     def build_for_predict(
-        self, predict_data: RasaModelData, eager: bool = False
+        self, predict_data: RasaModelData, eager: bool = True
     ) -> None:
         self._training = False  # needed for tf graph mode
         self._predict_function = self._get_tf_call_model_function(
@@ -826,6 +827,22 @@ def _prepare_sequence_layers(self, name: Text) -> None:
             name, self.config[DROP_RATE], self.config[DROP_RATE_ATTENTION]
         )
 
+    def _prepare_entity_recognition_layers(self) -> None:
+        for tag_spec in self._entity_tag_specs:
+            name = tag_spec.tag_name
+            num_tags = tag_spec.num_tags
+            self._tf_layers[f"embed.{name}.logits"] = layers.Embed(
+                num_tags, self.config[REGULARIZATION_CONSTANT], f"logits.{name}"
+            )
+            self._tf_layers[f"crf.{name}"] = layers.CRF(
+                num_tags, self.config[REGULARIZATION_CONSTANT], self.config[SCALE_LOSS]
+            )
+            self._tf_layers[f"embed.{name}.tags"] = layers.Embed(
+                self.config[EMBEDDING_DIMENSION],
+                self.config[REGULARIZATION_CONSTANT],
+                f"tags.{name}",
+            )
+
     def _combine_sparse_dense_features(
         self,
         features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]],
@@ -1055,6 +1072,33 @@ def _get_batch_dim(attribute_data: Dict[Text, List[tf.Tensor]]) -> int:
 
         return tf.shape(attribute_data[SENTENCE][0])[0]
 
+    def _calculate_entity_loss(
+        self,
+        inputs: tf.Tensor,
+        tag_ids: tf.Tensor,
+        mask: tf.Tensor,
+        sequence_lengths: tf.Tensor,
+        tag_name: Text,
+        entity_tags: Optional[tf.Tensor] = None,
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+
+        tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32)
+
+        if entity_tags is not None:
+            _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags)
+            inputs = tf.concat([inputs, _tags], axis=-1)
+
+        logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs)
+
+        # should call first to build weights
+        pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths)
+        loss = self._tf_layers[f"crf.{tag_name}"].loss(
+            logits, tag_ids, sequence_lengths
+        )
+        f1 = self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask)
+
+        return loss, f1, logits
+
     def batch_loss(
         self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
     ) -> tf.Tensor:

From 2833ef5206f9c345b536f06c8aab675e3dc693db Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 14:22:37 +0100
Subject: [PATCH 27/62] get last dialogue before CRF

---
 rasa/core/policies/ted_policy.py | 35 ++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index b1062beb7484..e3392dfd362a 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -4,7 +4,6 @@
 from collections import defaultdict
 
 import numpy as np
-from tensorflow import RaggedTensorSpec
 
 import rasa.shared.utils.io
 import tensorflow as tf
@@ -12,7 +11,7 @@
 from typing import Any, List, Optional, Text, Dict, Tuple, Union, TYPE_CHECKING
 
 import rasa.utils.io as io_utils
-from nlu.classifiers.diet_classifier import EntityTagSpec
+from rasa.nlu.classifiers.diet_classifier import EntityTagSpec
 from rasa.shared.core.domain import Domain
 from rasa.core.featurizers.tracker_featurizers import (
     TrackerFeaturizer,
@@ -308,7 +307,7 @@ def _create_entity_tag_specs(self) -> List[EntityTagSpec]:
         _tag_specs = []
 
         # TODO
-        tag_id_index_mapping = {"O": 0, "emotion": 1}
+        tag_id_index_mapping = {"O": 0, "emotion": 1, "account_number": 2, "item": 3}
 
         if tag_id_index_mapping:
             _tag_specs.append(
@@ -1011,9 +1010,33 @@ def _batch_loss_entities(
             text_seq_transformer_output, dialogue_transformer_output
         )
 
-        # TODO get last dialogue if max history
-        # check if this should happen before concat due to performance
-        # TODO CRF is currently failing, is it not compatible with 4D?
+        if self.max_history_tracker_featurizer_used:
+            # get last dialogue turn for every batch example
+            # resulting shapes are
+            # text_transformed (batch-dim x sequence length x units)
+            # mask             (batch-dim x sequence length x 1)
+            # tag_ids          (batch-dim x sequence length x 1)
+            # sequence_lengths (batch-dim)
+            dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+            text_transformed = tf.squeeze(
+                tf.expand_dims(self._last_token(text_transformed, dialogue_lengths), 1),
+                axis=1,
+            )
+            mask = tf.squeeze(
+                tf.expand_dims(self._last_token(mask, dialogue_lengths), 1), axis=1
+            )
+            tag_ids = tf.squeeze(
+                tf.expand_dims(self._last_token(tag_ids, dialogue_lengths), 1), axis=1
+            )
+            sequence_lengths = tf.squeeze(
+                tf.expand_dims(self._last_token(sequence_lengths, dialogue_lengths), 1)
+            )
+
+        else:
+            # TODO
+            #   CRF cannot handle 4D tensors, convert text_transformed back to
+            #   combined batch and dialogue dimenstion x sequence length x untis
+            return []
 
         loss, f1, _logits = self._calculate_entity_loss(
             text_transformed, tag_ids, mask, sequence_lengths, ENTITY_ATTRIBUTE_TYPE

From ff6f002e24ff6adca38e94322ab50052f06a035b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 15:15:25 +0100
Subject: [PATCH 28/62] add predicting entities

---
 rasa/core/policies/ted_policy.py | 105 +++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 5 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index e3392dfd362a..51177651502c 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -93,6 +93,7 @@
     MASK,
     HIDDEN_LAYERS_SIZES,
     FEATURIZERS,
+    ENTITY_RECOGNITION,
 )
 
 
@@ -252,6 +253,10 @@ class TEDPolicy(Policy):
         # Specify what features to use as sequence and sentence features.
         # By default all features in the pipeline are used.
         FEATURIZERS: [],
+        # If set to true, entities are predicted in user utterances.
+        # TODO Do not communicate this option to users yet as we have to run some
+        #   experiments first.
+        ENTITY_RECOGNITION: True,
     }
 
     @staticmethod
@@ -443,7 +448,8 @@ def train(
             )
             return
 
-        self._entity_tag_specs = self._create_entity_tag_specs()
+        if self.config[ENTITY_RECOGNITION]:
+            self._entity_tag_specs = self._create_entity_tag_specs()
 
         # keep one example for persisting and loading
         self.data_example = model_data.first_data_example()
@@ -709,7 +715,11 @@ def __init__(
         # metrics
         self.action_loss = tf.keras.metrics.Mean(name="loss")
         self.action_acc = tf.keras.metrics.Mean(name="acc")
+        self.entity_loss = tf.keras.metrics.Mean(name="e_loss")
+        self.entity_f1 = tf.keras.metrics.Mean(name="e_f1")
         self.metrics_to_log += ["loss", "acc"]
+        if self.config[ENTITY_RECOGNITION]:
+            self.metrics_to_log += ["e_loss", "e_f1"]
 
         # needed for efficient prediction
         self.all_labels_embed: Optional[tf.Tensor] = None
@@ -758,7 +768,9 @@ def _prepare_layers(self) -> None:
         self._prepare_embed_layers(LABEL)
 
         self._prepare_dot_product_loss(LABEL, self.config[SCALE_LOSS])
-        self._prepare_entity_recognition_layers()
+
+        if self.config[ENTITY_RECOGNITION]:
+            self._prepare_entity_recognition_layers()
 
     def _prepare_sparse_dense_layer_for(
         self, name: Text, signature: Dict[Text, Dict[Text, List[FeatureSignature]]]
@@ -963,6 +975,11 @@ def _batch_loss_entities(
         if ENTITY_ATTRIBUTE_TYPE not in tf_batch_data.get(ENTITIES, {}):
             return []
 
+        # if no tags are present at all, we can skip training
+        # TODO is there a better solution?
+        if tf.reduce_max(tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]) == 0.0:
+            return []
+
         sequence_lengths = tf.cast(
             tf_batch_data[TEXT][SEQUENCE_LENGTH][0], dtype=tf.int32
         )
@@ -1038,10 +1055,13 @@ def _batch_loss_entities(
             #   combined batch and dialogue dimenstion x sequence length x untis
             return []
 
-        loss, f1, _logits = self._calculate_entity_loss(
+        loss, f1, _ = self._calculate_entity_loss(
             text_transformed, tag_ids, mask, sequence_lengths, ENTITY_ATTRIBUTE_TYPE
         )
 
+        self.entity_loss.update_state(loss)
+        self.entity_f1.update_state(f1)
+
         return [loss]
 
     @staticmethod
@@ -1200,7 +1220,8 @@ def batch_loss(
         losses.append(loss)
 
         if (
-            self.dialogue_transformer_output is not None
+            self.config[ENTITY_RECOGNITION]
+            and self.dialogue_transformer_output is not None
             and self.text_seq_transformer_output is not None
         ):
             losses.extend(self._batch_loss_entities(tf_batch_data))
@@ -1234,6 +1255,15 @@ def batch_predict(
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
+        predictions = {}
+
+        if (
+            self.config[ENTITY_RECOGNITION]
+            and self.dialogue_transformer_output is not None
+            and self.text_seq_transformer_output is not None
+        ):
+            predictions.update(self._batch_predict_entities(tf_batch_data))
+
         sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
             dialogue_embed[:, :, tf.newaxis, :],
             self.all_labels_embed[tf.newaxis, tf.newaxis, :, :],
@@ -1244,7 +1274,72 @@ def batch_predict(
             sim_all, self.config[SIMILARITY_TYPE]
         )
 
-        return {"action_scores": scores, "similarities": sim_all}
+        predictions.update({"action_scores": scores, "similarities": sim_all})
+
+        return predictions
+
+    def _batch_predict_entities(
+        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
+    ) -> Dict[Text, tf.Tensor]:
+        predictions: Dict[Text, tf.Tensor] = {}
+
+        sequence_lengths = tf.cast(
+            tf_batch_data[TEXT][SEQUENCE_LENGTH][0], dtype=tf.int32
+        )
+        sequence_lengths = tf.squeeze(sequence_lengths, axis=-1)
+
+        # convert from (combined batch and dialogue dimension x sequence length x units)
+        # to (batch-dim x dialogue length x sequence length x units)
+        text_seq_transformer_output = self._convert_to_original_shape(
+            self.text_seq_transformer_output, tf_batch_data, False
+        )
+        # repeat the dialogue transformer output sequence-length-times to get the
+        # same shape as the text sequence transformer output
+        dialogue_transformer_output = tf.repeat(
+            tf.expand_dims(self.dialogue_transformer_output, axis=2),
+            text_seq_transformer_output.shape[2],
+            axis=2,
+        )
+        # add the output of the dialogue transformer to the output of the text
+        # sequence transformer (adding context)
+        text_transformed = tf.add(
+            text_seq_transformer_output, dialogue_transformer_output
+        )
+
+        if self.max_history_tracker_featurizer_used:
+            # get last dialogue turn for every batch example
+            # resulting shapes are
+            # text_transformed (batch-dim x sequence length x units)
+            # mask             (batch-dim x sequence length x 1)
+            # tag_ids          (batch-dim x sequence length x 1)
+            # sequence_lengths (batch-dim)
+            dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+            text_transformed = tf.squeeze(
+                tf.expand_dims(self._last_token(text_transformed, dialogue_lengths), 1),
+                axis=1,
+            )
+            sequence_lengths = tf.squeeze(
+                tf.expand_dims(self._last_token(sequence_lengths, dialogue_lengths), 1)
+            )
+
+        else:
+            # TODO
+            #   CRF cannot handle 4D tensors, convert text_transformed back to
+            #   combined batch and dialogue dimenstion x sequence length x untis
+            return {}
+
+        name = ENTITY_ATTRIBUTE_TYPE
+        _input = text_transformed
+
+        _logits = self._tf_layers[f"embed.{name}.logits"](_input)
+        pred_ids, confidences = self._tf_layers[f"crf.{name}"](
+            _logits, sequence_lengths - 1
+        )
+
+        predictions[f"e_{name}_ids"] = pred_ids
+        predictions[f"e_{name}_scores"] = confidences
+
+        return predictions
 
 
 # pytype: enable=key-error

From 5b46f404d466d9bb28d58e295742f1fc03c098a0 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 15:25:10 +0100
Subject: [PATCH 29/62] clean up

---
 rasa/core/featurizers/single_state_featurizer.py | 10 ++++++++--
 rasa/core/policies/ted_policy.py                 |  5 ++++-
 rasa/shared/core/trackers.py                     | 11 +++++++----
 rasa/utils/tensorflow/model_data_utils.py        |  2 +-
 rasa/utils/tensorflow/models.py                  |  4 ++--
 5 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 693134343a69..68a6f9a72ffd 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -20,6 +20,7 @@
     FEATURE_TYPE_SEQUENCE,
     TEXT,
     NO_ENTITY_TAG,
+    ENTITY_ATTRIBUTE_TYPE,
 )
 from rasa.shared.nlu.training_data.features import Features
 from rasa.shared.nlu.training_data.message import Message
@@ -137,13 +138,18 @@ def _create_entity_tag_features(
 
         _tags = []
         for token in parsed_text.get(TOKENS_NAMES[TEXT]):
-            _tag = determine_token_labels(token, entities, attribute_key="entity")
+            _tag = determine_token_labels(
+                token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE
+            )
             _tags.append(tag_id_mapping[_tag])
 
         # transpose to have seq_len x 1
         return [
             Features(
-                np.array([_tags]).T, FEATURE_TYPE_SEQUENCE, "entity", TAG_ID_ORIGIN
+                np.array([_tags]).T,
+                FEATURE_TYPE_SEQUENCE,
+                ENTITY_ATTRIBUTE_TYPE,
+                TAG_ID_ORIGIN,
             )
         ]
 
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 51177651502c..332f417fc538 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -976,7 +976,10 @@ def _batch_loss_entities(
             return []
 
         # if no tags are present at all, we can skip training
-        # TODO is there a better solution?
+        # check if there is any tag other than 0, which maps to NO_ENTITY_TAG
+        # TODO
+        #  If we remove this check the CRF layer is throwing an error.
+        #  Is there a better solution?
         if tf.reduce_max(tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]) == 0.0:
             return []
 
diff --git a/rasa/shared/core/trackers.py b/rasa/shared/core/trackers.py
index 657ee9366e67..3d2ef8f1a8d9 100644
--- a/rasa/shared/core/trackers.py
+++ b/rasa/shared/core/trackers.py
@@ -29,6 +29,7 @@
     ENTITY_ATTRIBUTE_ROLE,
     ACTION_TEXT,
     ACTION_NAME,
+    ENTITIES,
 )
 from rasa.shared.core import events
 from rasa.shared.core.constants import (
@@ -68,7 +69,9 @@
 logger = logging.getLogger(__name__)
 
 # same as State but with Dict[...] substituted with FrozenSet[Tuple[...]]
-FrozenState = FrozenSet[Tuple[Text, FrozenSet[Tuple[Text, Tuple[Union[float, Text]]]]]]
+FrozenState = FrozenSet[
+    Tuple[Text, FrozenSet[Tuple[Text, Tuple[Union[float, Text, FrozenSet]]]]]
+]
 
 
 class EventVerbosity(Enum):
@@ -235,9 +238,9 @@ def freeze_current_state(state: State) -> FrozenState:
         frozen_state = {}
         for key, values in state_copy.items():
             if isinstance(values, dict):
-                if "entities" in values and isinstance(values["entities"][0], dict):
-                    values["entities"] = tuple(
-                        [frozenset(e.items()) for e in values["entities"]]
+                if ENTITIES in values and isinstance(values[ENTITIES][0], dict):
+                    values[ENTITIES] = tuple(
+                        [frozenset(e.items()) for e in values[ENTITIES]]
                     )
                 frozen_state[key] = frozenset(values.items())
             else:
diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py
index ad6af067814e..887bbb4ed44d 100644
--- a/rasa/utils/tensorflow/model_data_utils.py
+++ b/rasa/utils/tensorflow/model_data_utils.py
@@ -260,7 +260,7 @@ def convert_to_data_format(
     num_examples = 1
     for _features in attribute_to_features.values():
         num_examples = max(num_examples, len(_features))
-        dialogue_length = max(dialogue_length, max(len(f) for f in _features))
+        dialogue_length = max(dialogue_length, len(_features[0]))
     empty_features = [[None] * dialogue_length] * num_examples
 
     for attribute in attributes:
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 4d446e16e226..9e8f67748ea0 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -177,7 +177,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = True,
+        eager: bool = False,
     ) -> None:
         """Fit model data"""
 
@@ -309,7 +309,7 @@ def train_on_batch(
         self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
 
     def build_for_predict(
-        self, predict_data: RasaModelData, eager: bool = True
+        self, predict_data: RasaModelData, eager: bool = False
     ) -> None:
         self._training = False  # needed for tf graph mode
         self._predict_function = self._get_tf_call_model_function(

From 906ff977d6fcae47fa4436002cbfa79bea6a7a2a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 17:07:06 +0100
Subject: [PATCH 30/62] differentiate between max history tracker featurizer
 used or not

---
 rasa/core/policies/ted_policy.py | 180 ++++++++++++++++++-------------
 1 file changed, 107 insertions(+), 73 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 332f417fc538..98e66d679292 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -991,38 +991,25 @@ def _batch_loss_entities(
         mask = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
         sequence_lengths -= 1  # remove sentence features
 
-        # convert from (combined batch and dialogue dimension x 1) to
-        # (batch-dim x dialogue length x 1)
-        sequence_lengths = tf.squeeze(
-            self._convert_to_original_shape(
-                tf.expand_dims(sequence_lengths, axis=-1), tf_batch_data, False
-            ),
-            axis=-1,
-        )
-        # convert from (combined batch and dialogue dimension x sequence length x 1) to
-        # (batch-dim x dialogue length x sequence length x 1)
-        mask = self._convert_to_original_shape(mask, tf_batch_data, False)
+        # +1 for sentence features
+        sequence_dimension = tf.reduce_max(sequence_lengths) + 1
 
         tag_ids = tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]
         # add a zero (no entity) for the sentence features to match the shape of
         # inputs
         tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
-        # convert from (combined batch and dialogue dimension x sequence length x 1) to
-        # (batch-dim x dialogue length x sequence length x 1)
-        tag_ids = self._convert_to_original_shape(tag_ids, tf_batch_data, False)
-
-        # convert from (combined batch and dialogue dimension x sequence length x units)
-        # to (batch-dim x dialogue length x sequence length x units)
-        text_seq_transformer_output = self._convert_to_original_shape(
-            self.text_seq_transformer_output, tf_batch_data, False
+
+        text_seq_transformer_output = self.text_seq_transformer_output
+        dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
+            self.dialogue_transformer_output, tf_batch_data
         )
 
         # repeat the dialogue transformer output sequence-length-times to get the
         # same shape as the text sequence transformer output
         dialogue_transformer_output = tf.repeat(
-            tf.expand_dims(self.dialogue_transformer_output, axis=2),
-            text_seq_transformer_output.shape[2],
-            axis=2,
+            tf.expand_dims(dialogue_transformer_output, axis=1),
+            sequence_dimension,
+            axis=1,
         )
         # add the output of the dialogue transformer to the output of the text
         # sequence transformer (adding context)
@@ -1031,35 +1018,44 @@ def _batch_loss_entities(
         )
 
         if self.max_history_tracker_featurizer_used:
-            # get last dialogue turn for every batch example
-            # resulting shapes are
-            # text_transformed (batch-dim x sequence length x units)
-            # mask             (batch-dim x sequence length x 1)
-            # tag_ids          (batch-dim x sequence length x 1)
-            # sequence_lengths (batch-dim)
             dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-            text_transformed = tf.squeeze(
-                tf.expand_dims(self._last_token(text_transformed, dialogue_lengths), 1),
+
+            batch_dim = tf.size(dialogue_lengths)
+
+            # the first dimension of text transformed is the combined batch and dialogue
+            # dimension, which corresponds to the sum of all dialogue lengths
+            # if the max history tracker featurizer is used we just want the last
+            # dialogues of every batch example
+
+            # get the indices of all last dialogues
+            last_dialogue_indices = tf.cumsum(dialogue_lengths) - 1
+
+            # build up indices to get the last dialogues from text_transformed
+            dialogue_indices = tf.repeat(
+                tf.expand_dims(last_dialogue_indices, axis=1),
+                sequence_dimension,
                 axis=1,
             )
-            mask = tf.squeeze(
-                tf.expand_dims(self._last_token(mask, dialogue_lengths), 1), axis=1
-            )
-            tag_ids = tf.squeeze(
-                tf.expand_dims(self._last_token(tag_ids, dialogue_lengths), 1), axis=1
+            sequence_indices = tf.repeat(
+                tf.expand_dims(tf.range(sequence_dimension), axis=0), batch_dim, axis=0
             )
-            sequence_lengths = tf.squeeze(
-                tf.expand_dims(self._last_token(sequence_lengths, dialogue_lengths), 1)
+            indices = tf.stack([dialogue_indices, sequence_indices], axis=2)
+
+            # get all last dialogues from text_transformed using the above indices
+            text_transformed = tf.gather_nd(text_transformed, indices)
+            # do the same for the other tensors
+            tag_ids = tf.gather_nd(tag_ids, indices)
+            mask = tf.gather_nd(mask, indices)
+            sequence_lengths = tf.gather(
+                tf.squeeze(sequence_lengths), last_dialogue_indices
             )
 
-        else:
-            # TODO
-            #   CRF cannot handle 4D tensors, convert text_transformed back to
-            #   combined batch and dialogue dimenstion x sequence length x untis
-            return []
-
         loss, f1, _ = self._calculate_entity_loss(
-            text_transformed, tag_ids, mask, sequence_lengths, ENTITY_ATTRIBUTE_TYPE
+            text_transformed,
+            tag_ids,
+            mask,
+            tf.squeeze(sequence_lengths),
+            ENTITY_ATTRIBUTE_TYPE,
         )
 
         self.entity_loss.update_state(loss)
@@ -1067,11 +1063,42 @@ def _batch_loss_entities(
 
         return [loss]
 
+    @staticmethod
+    def _combine_batch_and_dialogue_dimension(
+        tensor: tf.Tensor, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
+    ):
+        """Combines the batch and dialogue dimension of the given tensor.
+
+        Before the tensor has shape (batch-size x dialogue-length x ...).
+        Afterwards the tensor will have shape
+        (combined batch and dialogue dimension x ...).
+
+        Args:
+            tensor: The tensor
+            tf_batch_data: the batch data
+
+        Returns:
+            The converted tensor
+        """
+        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+
+        batch_dim = tf.size(dialogue_lengths)
+        batch_indices = tf.repeat(tf.range(batch_dim), dialogue_lengths)
+        dialogue_indices = (
+            tf.map_fn(
+                tf.range,
+                dialogue_lengths,
+                fn_output_signature=tf.RaggedTensorSpec(shape=[None], dtype=tf.int32),
+            )
+        ).values
+        indices = tf.stack([batch_indices, dialogue_indices], axis=1)
+
+        return tf.gather_nd(tensor, indices)
+
     @staticmethod
     def _convert_to_original_shape(
         attribute_features: tf.Tensor,
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
-        squeeze_sequence_dimension: bool = True,
     ) -> tf.Tensor:
         """Transform attribute features back to original shape.
 
@@ -1099,7 +1126,6 @@ def _convert_to_original_shape(
 
         batch_dim = tf.size(dialogue_lengths)
         dialogue_dim = tf.reduce_max(dialogue_lengths)
-        sequence_dim = attribute_features.shape[-2]
         units = attribute_features.shape[-1]
 
         batch_indices = tf.repeat(tf.range(batch_dim), dialogue_lengths)
@@ -1112,12 +1138,9 @@ def _convert_to_original_shape(
         ).values
         indices = tf.stack([batch_indices, dialogue_indices], axis=1)
 
-        if squeeze_sequence_dimension:
-            attribute_features = tf.squeeze(attribute_features, axis=1)
-            shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
-        else:
-            shape = tf.convert_to_tensor([batch_dim, dialogue_dim, sequence_dim, units])
+        shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
 
+        attribute_features = tf.squeeze(attribute_features, axis=1)
         return tf.scatter_nd(indices, attribute_features, shape)
 
     def _process_batch_data(
@@ -1291,17 +1314,17 @@ def _batch_predict_entities(
         )
         sequence_lengths = tf.squeeze(sequence_lengths, axis=-1)
 
-        # convert from (combined batch and dialogue dimension x sequence length x units)
-        # to (batch-dim x dialogue length x sequence length x units)
-        text_seq_transformer_output = self._convert_to_original_shape(
-            self.text_seq_transformer_output, tf_batch_data, False
+        text_seq_transformer_output = self.text_seq_transformer_output
+        dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
+            self.dialogue_transformer_output, tf_batch_data
         )
+
         # repeat the dialogue transformer output sequence-length-times to get the
         # same shape as the text sequence transformer output
         dialogue_transformer_output = tf.repeat(
-            tf.expand_dims(self.dialogue_transformer_output, axis=2),
-            text_seq_transformer_output.shape[2],
-            axis=2,
+            tf.expand_dims(dialogue_transformer_output, axis=1),
+            text_seq_transformer_output.shape[1],
+            axis=1,
         )
         # add the output of the dialogue transformer to the output of the text
         # sequence transformer (adding context)
@@ -1310,26 +1333,37 @@ def _batch_predict_entities(
         )
 
         if self.max_history_tracker_featurizer_used:
-            # get last dialogue turn for every batch example
-            # resulting shapes are
-            # text_transformed (batch-dim x sequence length x units)
-            # mask             (batch-dim x sequence length x 1)
-            # tag_ids          (batch-dim x sequence length x 1)
-            # sequence_lengths (batch-dim)
             dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-            text_transformed = tf.squeeze(
-                tf.expand_dims(self._last_token(text_transformed, dialogue_lengths), 1),
+
+            batch_dim = tf.size(dialogue_lengths)
+            # +1 for sentence features
+            sequence_dimension = tf.reduce_max(sequence_lengths) + 1
+
+            # the first dimension of text transformed is the combined batch and dialogue
+            # dimension, which corresponds to the sum of all dialogue lengths
+            # if the max history tracker featurizer is used we just want the last
+            # dialogues of every batch example
+
+            # get the indices of all last dialogues
+            last_dialogue_indices = tf.cumsum(dialogue_lengths) - 1
+
+            # build up indices to get the last dialogues from text_transformed
+            dialogue_indices = tf.repeat(
+                tf.expand_dims(last_dialogue_indices, axis=1),
+                sequence_dimension,
                 axis=1,
             )
-            sequence_lengths = tf.squeeze(
-                tf.expand_dims(self._last_token(sequence_lengths, dialogue_lengths), 1)
+            sequence_indices = tf.repeat(
+                tf.expand_dims(tf.range(sequence_dimension), axis=0), batch_dim, axis=0
             )
+            indices = tf.stack([dialogue_indices, sequence_indices], axis=2)
 
-        else:
-            # TODO
-            #   CRF cannot handle 4D tensors, convert text_transformed back to
-            #   combined batch and dialogue dimenstion x sequence length x untis
-            return {}
+            # get all last dialogues from text_transformed using the above indices
+            text_transformed = tf.gather_nd(text_transformed, indices)
+            # do the same for the other tensors
+            sequence_lengths = tf.gather(
+                tf.squeeze(sequence_lengths), last_dialogue_indices
+            )
 
         name = ENTITY_ATTRIBUTE_TYPE
         _input = text_transformed

From c0eaa70e808d7c8b67a042581b76470231df375b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 17:32:10 +0100
Subject: [PATCH 31/62] add todo

---
 rasa/core/policies/ted_policy.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 98e66d679292..719d2aff975a 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -1013,6 +1013,8 @@ def _batch_loss_entities(
         )
         # add the output of the dialogue transformer to the output of the text
         # sequence transformer (adding context)
+        # resulting shape
+        # (combined batch and dialogue dimension x sequence length x units)
         text_transformed = tf.add(
             text_seq_transformer_output, dialogue_transformer_output
         )
@@ -1042,13 +1044,17 @@ def _batch_loss_entities(
             indices = tf.stack([dialogue_indices, sequence_indices], axis=2)
 
             # get all last dialogues from text_transformed using the above indices
+            # resulting shape (batch size x sequence length x units)
             text_transformed = tf.gather_nd(text_transformed, indices)
             # do the same for the other tensors
             tag_ids = tf.gather_nd(tag_ids, indices)
             mask = tf.gather_nd(mask, indices)
-            sequence_lengths = tf.gather(
-                tf.squeeze(sequence_lengths), last_dialogue_indices
+            sequence_lengths = tf.gather_nd(
+                sequence_lengths, tf.expand_dims(last_dialogue_indices, axis=1)
             )
+            # TODO
+            #  inside the LSTM of the CRF layer the check len(mask.shape) == 2
+            #  fails. mask is created from the sequence length.
 
         loss, f1, _ = self._calculate_entity_loss(
             text_transformed,

From 9239bfa5c9b5bb5edee0ce2bb9ab87245c71b1ae Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 17:36:49 +0100
Subject: [PATCH 32/62] add comments

---
 rasa/core/policies/ted_policy.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 719d2aff975a..cdc03f66fcd6 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -999,6 +999,9 @@ def _batch_loss_entities(
         # inputs
         tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
 
+        # shape of the following two tensors
+        # (combined batch and dialogue dimension x sequence length x units)
+        # in case of dialogue_transformer_output sequence length is 1
         text_seq_transformer_output = self.text_seq_transformer_output
         dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
             self.dialogue_transformer_output, tf_batch_data
@@ -1049,8 +1052,8 @@ def _batch_loss_entities(
             # do the same for the other tensors
             tag_ids = tf.gather_nd(tag_ids, indices)
             mask = tf.gather_nd(mask, indices)
-            sequence_lengths = tf.gather_nd(
-                sequence_lengths, tf.expand_dims(last_dialogue_indices, axis=1)
+            sequence_lengths = tf.gather(
+                tf.squeeze(sequence_lengths), last_dialogue_indices
             )
             # TODO
             #  inside the LSTM of the CRF layer the check len(mask.shape) == 2

From 76b41ee1f83cbda63d3dfea86d3c35483cac1261 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Thu, 5 Nov 2020 17:42:44 +0100
Subject: [PATCH 33/62] use correct tag id mapping

---
 rasa/core/policies/ted_policy.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index cdc03f66fcd6..5e3c1bc26de9 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -311,8 +311,7 @@ def _create_entity_tag_specs(self) -> List[EntityTagSpec]:
 
         _tag_specs = []
 
-        # TODO
-        tag_id_index_mapping = {"O": 0, "emotion": 1, "account_number": 2, "item": 3}
+        tag_id_index_mapping = self.featurizer.state_featurizer.get_entity_tag_ids()
 
         if tag_id_index_mapping:
             _tag_specs.append(

From 58fc4ad951948a9dc2cf54c01a8bc9f7f8000b4f Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 6 Nov 2020 08:45:07 +0100
Subject: [PATCH 34/62] check if text exists

---
 rasa/core/featurizers/single_state_featurizer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 68a6f9a72ffd..671549c7658c 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -132,6 +132,9 @@ def _create_entity_tag_features(
         # TODO
         #  Should we support BILOU tagging?
 
+        if TEXT not in sub_state:
+            return []
+
         parsed_text = interpreter.featurize_message(Message({TEXT: sub_state[TEXT]}))
         entities = [dict(entity) for entity in sub_state[ENTITIES]]
         tag_id_mapping = self.get_entity_tag_ids()

From 74be41076a0a4834db1f6250ce0cef969a4119ff Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 6 Nov 2020 13:55:06 +0100
Subject: [PATCH 35/62] fix frozenset issues

---
 .../core/featurizers/single_state_featurizer.py |  2 +-
 rasa/shared/core/domain.py                      |  2 +-
 rasa/shared/core/generator.py                   | 17 +++++++++++++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 671549c7658c..7607b37427a9 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -68,7 +68,7 @@ def _state_features_for_attribute(
         if attribute in {INTENT, ACTION_NAME}:
             return {sub_state[attribute]: 1}
         elif attribute == ENTITIES:
-            return {entity: 1 for entity in sub_state.get(ENTITIES, [])}
+            return {entity["entity"]: 1 for entity in sub_state.get(ENTITIES, [])}
         elif attribute == ACTIVE_LOOP:
             return {sub_state["name"]: 1}
         elif attribute == SLOTS:
diff --git a/rasa/shared/core/domain.py b/rasa/shared/core/domain.py
index 3aa818fdadc0..0d37ec7b643e 100644
--- a/rasa/shared/core/domain.py
+++ b/rasa/shared/core/domain.py
@@ -67,7 +67,7 @@
 # State is a dictionary with keys (USER, PREVIOUS_ACTION, SLOTS, ACTIVE_LOOP)
 # representing the origin of a SubState;
 # the values are SubStates, that contain the information needed for featurization
-SubState = Dict[Text, Union[Text, Tuple[Union[float, Text]]]]
+SubState = Dict[Text, Union[Text, Tuple[Union[float, Text, Dict]]]]
 State = Dict[Text, SubState]
 
 logger = logging.getLogger(__name__)
diff --git a/rasa/shared/core/generator.py b/rasa/shared/core/generator.py
index 994ee52fedaf..11ddfd9fb146 100644
--- a/rasa/shared/core/generator.py
+++ b/rasa/shared/core/generator.py
@@ -31,6 +31,7 @@
 )
 from rasa.shared.utils.io import is_logging_disabled
 import rasa.shared.utils.io
+from shared.nlu.constants import ENTITIES
 
 logger = logging.getLogger(__name__)
 
@@ -102,10 +103,18 @@ def past_states_for_hashing(self, domain: Domain) -> Deque[FrozenState]:
 
     @staticmethod
     def _unfreeze_states(frozen_states: Deque[FrozenState]) -> List[State]:
-        return [
-            {key: dict(value) for key, value in dict(frozen_state).items()}
-            for frozen_state in frozen_states
-        ]
+        states = []
+        for frozen_state in frozen_states:
+            state_dict = {}
+            for key, value in dict(frozen_state).items():
+                _value = dict(value)
+                if ENTITIES in _value:
+                    _value[ENTITIES] = [
+                        dict(frozen_entity) for frozen_entity in _value[ENTITIES]
+                    ]
+                state_dict[key] = _value
+            states.append(state_dict)
+        return states
 
     def past_states(self, domain: Domain) -> List[State]:
         states_for_hashing = self.past_states_for_hashing(domain)

From 90feabeb095585427fcedfd0414b782b231bff19 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Fri, 6 Nov 2020 14:38:12 +0100
Subject: [PATCH 36/62] ignore actual entity value in MemoizationPolicy

---
 rasa/core/policies/memoization.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py
index e511f35563ac..33b39a8864a4 100644
--- a/rasa/core/policies/memoization.py
+++ b/rasa/core/policies/memoization.py
@@ -1,3 +1,4 @@
+import copy
 import zlib
 
 import base64
@@ -22,6 +23,8 @@
 from rasa.shared.core.generator import TrackerWithCachedStates
 from rasa.shared.utils.io import is_logging_disabled
 from rasa.core.constants import MEMOIZATION_POLICY_PRIORITY
+from shared.core.constants import USER
+from shared.nlu.constants import ENTITIES, ENTITY_ATTRIBUTE_TYPE
 
 logger = logging.getLogger(__name__)
 
@@ -158,7 +161,22 @@ def _create_feature_key(self, states: List[State]) -> Text:
         # we sort keys to make sure that the same states
         # represented as dictionaries have the same json strings
         # quotes are removed for aesthetic reasons
-        feature_str = json.dumps(states, sort_keys=True).replace('"', "")
+
+        # Ignore the actual values of entities
+        # We are just interested whether an entity of a certain type was detected or not
+        _states = []
+        for state in states:
+            _state = {}
+            for key, value in state.items():
+                _state[key] = copy.deepcopy(value)
+                if USER == key and ENTITIES in _state[USER]:
+                    _state[USER][ENTITIES] = [
+                        entity[ENTITY_ATTRIBUTE_TYPE]
+                        for entity in _state[USER][ENTITIES]
+                    ]
+            _states.append(_state)
+
+        feature_str = json.dumps(_states, sort_keys=True).replace('"', "")
         if self.ENABLE_FEATURE_STRING_COMPRESSION:
             compressed = zlib.compress(
                 bytes(feature_str, rasa.shared.utils.io.DEFAULT_ENCODING)

From 6a5efc3321dce58c189f6fb7423484f9df21192e Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 9 Nov 2020 08:42:33 +0100
Subject: [PATCH 37/62] fix import

---
 rasa/core/policies/memoization.py | 4 ++--
 rasa/shared/core/generator.py     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py
index 33b39a8864a4..7170071d17ad 100644
--- a/rasa/core/policies/memoization.py
+++ b/rasa/core/policies/memoization.py
@@ -23,8 +23,8 @@
 from rasa.shared.core.generator import TrackerWithCachedStates
 from rasa.shared.utils.io import is_logging_disabled
 from rasa.core.constants import MEMOIZATION_POLICY_PRIORITY
-from shared.core.constants import USER
-from shared.nlu.constants import ENTITIES, ENTITY_ATTRIBUTE_TYPE
+from rasa.shared.core.constants import USER
+from rasa.shared.nlu.constants import ENTITIES, ENTITY_ATTRIBUTE_TYPE
 
 logger = logging.getLogger(__name__)
 
diff --git a/rasa/shared/core/generator.py b/rasa/shared/core/generator.py
index 11ddfd9fb146..dcb69f20cea4 100644
--- a/rasa/shared/core/generator.py
+++ b/rasa/shared/core/generator.py
@@ -31,7 +31,7 @@
 )
 from rasa.shared.utils.io import is_logging_disabled
 import rasa.shared.utils.io
-from shared.nlu.constants import ENTITIES
+from rasa.shared.nlu.constants import ENTITIES
 
 logger = logging.getLogger(__name__)
 

From ccd93d14a13ff5550904229a779ba6d0a7470d80 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Mon, 9 Nov 2020 11:25:08 +0100
Subject: [PATCH 38/62] fix some tests

---
 .../test_single_state_featurizers.py          | 101 ++++++++++++++----
 tests/shared/core/test_domain.py              |  22 ++--
 .../story_writer/test_yaml_story_writer.py    |   6 +-
 tests/test_test.py                            |   2 -
 4 files changed, 93 insertions(+), 38 deletions(-)

diff --git a/tests/core/featurizers/test_single_state_featurizers.py b/tests/core/featurizers/test_single_state_featurizers.py
index 85e689d23a34..4ea08b9c773e 100644
--- a/tests/core/featurizers/test_single_state_featurizers.py
+++ b/tests/core/featurizers/test_single_state_featurizers.py
@@ -15,6 +15,10 @@
     INTENT,
     FEATURE_TYPE_SEQUENCE,
     FEATURE_TYPE_SENTENCE,
+    ENTITY_ATTRIBUTE_TYPE,
+    ENTITY_ATTRIBUTE_VALUE,
+    ENTITY_ATTRIBUTE_START,
+    ENTITY_ATTRIBUTE_END,
 )
 from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS
 from rasa.shared.nlu.interpreter import RegexInterpreter
@@ -184,28 +188,51 @@ def test_single_state_featurizer_with_entity_roles_and_groups(
     interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
 
     f = SingleStateFeaturizer()
-    f._default_feature_states[INTENT] = {"a": 0, "b": 1}
+    f._default_feature_states[INTENT] = {"inform": 0, "greet": 1}
     f._default_feature_states[ENTITIES] = {
-        "c": 0,
-        "d": 1,
-        f"d{ENTITY_LABEL_SEPARATOR}e": 2,
+        "city": 0,
+        "name": 1,
+        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
+        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
+    }
+    f._default_feature_states[ACTION_NAME] = {
+        "utter_ask_where_to": 0,
+        "utter_greet": 1,
+        "action_listen": 2,
+    }
+    f._default_feature_states[SLOTS] = {"slot_1": 0, "slot_2": 1, "slot_3": 2}
+    f._default_feature_states[ACTIVE_LOOP] = {
+        "active_loop_1": 0,
+        "active_loop_2": 1,
+        "active_loop_3": 2,
+        "active_loop_4": 3,
     }
-    f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2}
-    f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
-    f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
     encoded = f.encode_state(
         {
             "user": {
-                "text": "a ball",
-                "intent": "b",
-                "entities": ["c", f"d{ENTITY_LABEL_SEPARATOR}e"],
+                "text": "I am flying from London to Paris",
+                "intent": "inform",
+                "entities": [
+                    {
+                        ENTITY_ATTRIBUTE_TYPE: "city",
+                        ENTITY_ATTRIBUTE_VALUE: "London",
+                        ENTITY_ATTRIBUTE_START: 17,
+                        ENTITY_ATTRIBUTE_END: 23,
+                    },
+                    {
+                        ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
+                        ENTITY_ATTRIBUTE_VALUE: "Paris",
+                        ENTITY_ATTRIBUTE_START: 27,
+                        ENTITY_ATTRIBUTE_END: 32,
+                    },
+                ],
             },
             "prev_action": {
                 "action_name": "action_listen",
                 "action_text": "throw a ball",
             },
-            "active_loop": {"name": "k"},
-            "slots": {"e": (1.0,)},
+            "active_loop": {"name": "active_loop_4"},
+            "slots": {"slot_1": (1.0,)},
         },
         interpreter=interpreter,
     )
@@ -213,7 +240,7 @@ def test_single_state_featurizer_with_entity_roles_and_groups(
     assert sorted(list(encoded.keys())) == sorted(
         [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT]
     )
-    assert np.all(encoded[ENTITIES][0].features.toarray() == [1, 0, 1])
+    assert np.all(encoded[ENTITIES][0].features.toarray() == [1, 0, 1, 0])
 
 
 def test_single_state_featurizer_uses_dtype_float():
@@ -241,21 +268,51 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen(
     interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
 
     f = SingleStateFeaturizer()
-    f._default_feature_states[INTENT] = {"a": 0, "b": 1}
-    f._default_feature_states[ENTITIES] = {"c": 0}
-    f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2}
-    f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2}
-    f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3}
-
+    f._default_feature_states[INTENT] = {"inform": 0, "greet": 1}
+    f._default_feature_states[ENTITIES] = {
+        "city": 0,
+        "name": 1,
+        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
+        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
+    }
+    f._default_feature_states[ACTION_NAME] = {
+        "utter_ask_where_to": 0,
+        "utter_greet": 1,
+        "action_listen": 2,
+    }
+    f._default_feature_states[SLOTS] = {"slot_1": 0, "slot_2": 1, "slot_3": 2}
+    f._default_feature_states[ACTIVE_LOOP] = {
+        "active_loop_1": 0,
+        "active_loop_2": 1,
+        "active_loop_3": 2,
+        "active_loop_4": 3,
+    }
     encoded = f.encode_state(
         {
-            "user": {"text": "a ball", "intent": "b", "entities": ["c"]},
+            "user": {
+                "text": "I am flying from London to Paris",
+                "intent": "inform",
+                "entities": [
+                    {
+                        ENTITY_ATTRIBUTE_TYPE: "city",
+                        ENTITY_ATTRIBUTE_VALUE: "London",
+                        ENTITY_ATTRIBUTE_START: 17,
+                        ENTITY_ATTRIBUTE_END: 23,
+                    },
+                    {
+                        ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
+                        ENTITY_ATTRIBUTE_VALUE: "Paris",
+                        ENTITY_ATTRIBUTE_START: 27,
+                        ENTITY_ATTRIBUTE_END: 32,
+                    },
+                ],
+            },
             "prev_action": {
                 "action_name": "action_listen",
                 "action_text": "throw a ball",
             },
-            "active_loop": {"name": "k"},
-            "slots": {"e": (1.0,)},
+            "active_loop": {"name": "active_loop_4"},
+            "slots": {"slot_1": (1.0,)},
         },
         interpreter=interpreter,
     )
diff --git a/tests/shared/core/test_domain.py b/tests/shared/core/test_domain.py
index 32e6a83cc1f2..c94b20da97ee 100644
--- a/tests/shared/core/test_domain.py
+++ b/tests/shared/core/test_domain.py
@@ -75,7 +75,7 @@ async def test_create_train_data_no_history(default_domain):
     assert hashed == [
         "[{}]",
         '[{"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}]',
-        '[{"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
+        '[{"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
         '[{"prev_action": {"action_name": "utter_goodbye"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
@@ -83,7 +83,7 @@ async def test_create_train_data_no_history(default_domain):
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
-        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
+        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
     ]
 
 
@@ -104,13 +104,13 @@ async def test_create_train_data_with_history(default_domain):
     hashed = sorted(hashed)
 
     assert hashed == [
-        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
+        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}, {"prev_action": {"action_name": "utter_goodbye"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}]',
-        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
-        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
-        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
+        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
+        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
+        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
         '[{}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}]',
         '[{}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}]',
         '[{}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}]',
@@ -158,7 +158,7 @@ async def test_create_train_data_unfeaturized_entities():
     assert hashed == [
         "[{}]",
         '[{"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}]',
-        '[{"prev_action": {"action_name": "utter_greet"}, "user": {"entities": ["name"], "intent": "greet"}}]',
+        '[{"prev_action": {"action_name": "utter_greet"}, "user": {"entities": [{"end": 81, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
         '[{"prev_action": {"action_name": "utter_goodbye"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "user": {"intent": "why"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "user": {"intent": "thank"}}]',
@@ -168,9 +168,9 @@ async def test_create_train_data_unfeaturized_entities():
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "thank"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}]',
+        '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": [{"end": 81, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": [], "intent": "default"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": [], "intent": "ask"}}]',
-        '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": ["name"], "intent": "greet"}}]',
     ]
 
 
@@ -1057,7 +1057,7 @@ def test_get_featurized_entities():
 
     featurized_entities = domain._get_featurized_entities(user_uttered)
 
-    assert featurized_entities == set()
+    assert featurized_entities == []
 
     user_uttered = UserUttered(
         text="I am going to London",
@@ -1067,4 +1067,6 @@ def test_get_featurized_entities():
 
     featurized_entities = domain._get_featurized_entities(user_uttered)
 
-    assert featurized_entities == {"GPE", f"GPE{ENTITY_LABEL_SEPARATOR}destination"}
+    assert featurized_entities == [
+        {"entity": "GPE", "role": "destination", "value": "London"}
+    ]
diff --git a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py
index fa746263b082..4e48ea67d793 100644
--- a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py
+++ b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py
@@ -108,8 +108,6 @@ def test_yaml_writer_dumps_user_messages():
         - story: default
           steps:
           - intent: greet
-            user: |-
-              Hello
           - action: utter_greet
 
     """
@@ -139,10 +137,10 @@ def test_yaml_writer_avoids_dumping_not_existing_user_messages():
 
 
 @pytest.mark.parametrize(
-    "input_yaml_file", ["data/test_yaml_stories/rules_with_stories_sorted.yaml",],
+    "input_yaml_file", ["data/test_yaml_stories/rules_with_stories_sorted.yaml"]
 )
 def test_yaml_writer_dumps_rules(
-    input_yaml_file: Text, tmpdir: Path, default_domain: Domain,
+    input_yaml_file: Text, tmpdir: Path, default_domain: Domain
 ):
     original_yaml_reader = YAMLStoryReader(default_domain, None, False)
     original_yaml_story_steps = original_yaml_reader.read_from_file(input_yaml_file)
diff --git a/tests/test_test.py b/tests/test_test.py
index b279fee01231..8bbc45bececb 100644
--- a/tests/test_test.py
+++ b/tests/test_test.py
@@ -197,8 +197,6 @@ def test_write_classification_errors():
         - story: default
           steps:
           - intent: greet  # predicted: goodbye: Hello
-            user: |-
-              Hello
           - action: utter_greet  # predicted: utter_goodbye
 
     """

From f3e2b8918ffc1b622b8e2021a13eb74db652892c Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 10 Nov 2020 09:40:58 +0100
Subject: [PATCH 39/62] update after merge

---
 rasa/core/policies/ted_policy.py    | 176 +++++++++++++++++++++-------
 rasa/utils/tensorflow/model_data.py |   7 +-
 2 files changed, 138 insertions(+), 45 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index b02634b746bd..685af9d3b6b3 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -401,6 +401,9 @@ def _create_model_data(
 
         model_data.add_data(attribute_data)
         model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
+        model_data.add_lengths(
+            ENTITIES, SEQUENCE_LENGTH, ENTITIES, ENTITY_ATTRIBUTE_TYPE
+        )
         model_data.add_lengths(ACTION_TEXT, SEQUENCE_LENGTH, ACTION_TEXT, SEQUENCE)
 
         # add the dialogue lengths
@@ -996,7 +999,7 @@ def _encode_real_features_per_attribute(
             # resulting attribute features will have shape
             # combined batch dimension and dialogue length x 1 x units
             attribute_features = self._combine_sparse_dense_features(
-                tf_batch_data[attribute][SENTENCE], f"{attribute}_{SENTENCE}",
+                tf_batch_data[attribute][SENTENCE], f"{attribute}_{SENTENCE}"
             )
 
         if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE):
@@ -1029,40 +1032,56 @@ def _batch_loss_entities(
         if ENTITY_ATTRIBUTE_TYPE not in tf_batch_data.get(ENTITIES, {}):
             return []
 
-        # if no tags are present at all, we can skip training
-        # check if there is any tag other than 0, which maps to NO_ENTITY_TAG
-        # TODO
-        #  If we remove this check the CRF layer is throwing an error.
-        #  Is there a better solution?
-        if tf.reduce_max(tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]) == 0.0:
-            return []
-
-        sequence_lengths = tf.cast(
-            tf_batch_data[TEXT][SEQUENCE_LENGTH][0], dtype=tf.int32
-        )
-        sequence_lengths = tf.squeeze(sequence_lengths, axis=-1)
-        sequence_lengths += 1  # add sentence features
-        mask = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
-        sequence_lengths -= 1  # remove sentence features
+        # To calculate the loss for entities we need the output of the text
+        # sequence transformer (shape: combined batch dialogue dimension x
+        # sequence length x units), the output of the dialogue transformer
+        # (shape: batch size x dialogue length x units) and the tag ids for the
+        # entities (shape: combined batch dialogue dimension x sequence length x units)
+        # As the combined batch dialogue dimension for the output of the text sequence
+        # transformer and the tag ids differ, all tensors have different shapes.
+        # In order to process the tensors, they need to have the same shape.
+        # Convert all tensors to the same
+        #   combined batch dialogue dimension x sequence length x units
+        # shape.
+        # Note: The CRF layer cannot handle 4D tensors. E.g. we cannot use the shape
+        # batch size x dialogue length x sequence length x units
 
-        # +1 for sentence features
-        sequence_dimension = tf.reduce_max(sequence_lengths) + 1
+        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
 
         tag_ids = tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]
         # add a zero (no entity) for the sentence features to match the shape of
         # inputs
         tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
+        # convert tag ids to shape batch-size x dialogue length x sequence length x 1
+        tag_ids = self._convert_to_original_shape(
+            tag_ids, tf_batch_data[ENTITIES][MASK][0], dialogue_lengths
+        )
+        # convert tag ids to shape
+        # combined batch dialogue dimension x sequence length x 1
+        tag_ids = self._combine_batch_and_dialogue_dimension(tag_ids, tf_batch_data)
+
+        # convert the output of the text sequence transformer to shape
+        # batch-size x dialogue length x sequence length x 1
+        text_seq_transformer_output = self._convert_to_original_shape(
+            self.text_seq_transformer_output,
+            tf_batch_data[TEXT][MASK][0],
+            dialogue_lengths,
+        )
+        # convert the output of the text sequence transformer to shape
+        # combined batch dialogue dimension x sequence length x units
+        text_seq_transformer_output = self._combine_batch_and_dialogue_dimension(
+            text_seq_transformer_output, tf_batch_data
+        )
 
-        # shape of the following two tensors
-        # (combined batch and dialogue dimension x sequence length x units)
-        # in case of dialogue_transformer_output sequence length is 1
-        text_seq_transformer_output = self.text_seq_transformer_output
+        # convert the output of the dialogue transformer to shape
+        # combined batch dialogue dimension x sequence length x units
         dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
             self.dialogue_transformer_output, tf_batch_data
         )
 
         # repeat the dialogue transformer output sequence-length-times to get the
         # same shape as the text sequence transformer output
+        sequence_dimension = tf.shape(tag_ids)[1]
         dialogue_transformer_output = tf.repeat(
             tf.expand_dims(dialogue_transformer_output, axis=1),
             sequence_dimension,
@@ -1076,9 +1095,33 @@ def _batch_loss_entities(
             text_seq_transformer_output, dialogue_transformer_output
         )
 
-        if self.max_history_tracker_featurizer_used:
-            dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+        # we need the sequence length and the mask for the CRF layer
+        _sequence_lengths = tf_batch_data[TEXT][SEQUENCE_LENGTH][0]
+        # extract only nonzero lengths and cast to int
+        _sequence_lengths = tf.cast(
+            tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32
+        )
+        # boolean mask returns flat tensor
+        _sequence_lengths = tf.expand_dims(_sequence_lengths, axis=-1)
+        # + 1 for sentence features
+        sequence_lengths = _sequence_lengths + 1
+        mask = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
 
+        # convert mask and sequence length to correct shape
+        mask = self._convert_to_original_shape(
+            mask, tf_batch_data[TEXT][MASK][0], dialogue_lengths
+        )
+        mask = self._combine_batch_and_dialogue_dimension(mask, tf_batch_data)
+        sequence_lengths = self._convert_to_original_shape(
+            tf.expand_dims(sequence_lengths, axis=-1),
+            tf_batch_data[TEXT][MASK][0],
+            dialogue_lengths,
+        )
+        sequence_lengths = self._combine_batch_and_dialogue_dimension(
+            sequence_lengths, tf_batch_data
+        )
+
+        if self.max_history_tracker_featurizer_used:
             batch_dim = tf.size(dialogue_lengths)
 
             # the first dimension of text transformed is the combined batch and dialogue
@@ -1086,6 +1129,8 @@ def _batch_loss_entities(
             # if the max history tracker featurizer is used we just want the last
             # dialogues of every batch example
 
+            # TODO the last dialogue turn might not contain any entities
+
             # get the indices of all last dialogues
             last_dialogue_indices = tf.cumsum(dialogue_lengths) - 1
 
@@ -1109,6 +1154,7 @@ def _batch_loss_entities(
             sequence_lengths = tf.gather(
                 tf.squeeze(sequence_lengths), last_dialogue_indices
             )
+
             # TODO
             #  inside the LSTM of the CRF layer the check len(mask.shape) == 2
             #  fails. mask is created from the sequence length.
@@ -1166,8 +1212,8 @@ def _convert_to_original_shape(
     ) -> tf.Tensor:
         """Transform attribute features back to original shape.
 
-        Given shape: combined batch and dialogue dimension x 1 x units
-        Original shape: batch x dialogue length x units
+        Given shape: combined batch and dialogue dimension x sequence length x units
+        Original shape: batch x dialogue length x sequence length x units
 
         Args:
             attribute_features: the "real" features to convert
@@ -1181,21 +1227,22 @@ def _convert_to_original_shape(
         """
 
         # in order to convert the attribute features with shape
-        # combined batch-size and dialogue length x 1 x units
-        # to a shape of batch-size x dialogue length x units
-        # we use tf.scatter_nd. Therefore, we need to the target shape and the indices
+        # combined batch-size and dialogue length x sequence length x units
+        # to a shape of batch-size x dialogue length x sequence length x units
+        # we use tf.scatter_nd. Therefore, we need the target shape and the indices
         # mapping the values of attribute features to the position in the resulting
         # tensor.
 
         batch_dim = tf.shape(attribute_mask)[0]
         dialogue_dim = tf.shape(attribute_mask)[1]
+        sequence_length = tf.shape(attribute_features)[1]
         units = attribute_features.shape[-1]
 
         # attribute_mask has shape (batch x dialogue_len x 1), remove last dimension
         attribute_mask = tf.cast(tf.squeeze(attribute_mask, axis=-1), dtype=tf.int32)
         # sum of attribute mask contains number of dialogue turns with "real" features
         non_fake_dialogue_lengths = tf.reduce_sum(attribute_mask, axis=-1)
-
+        # create the batch indices
         batch_indices = tf.repeat(tf.range(batch_dim), non_fake_dialogue_lengths)
 
         dialogue_indices = (
@@ -1218,9 +1265,19 @@ def _convert_to_original_shape(
 
         indices = tf.stack([batch_indices, dialogue_indices], axis=1)
 
-        shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
+        shape = tf.cond(
+            sequence_length == 1,
+            lambda: tf.convert_to_tensor([batch_dim, dialogue_dim, units]),
+            lambda: tf.convert_to_tensor(
+                [batch_dim, dialogue_dim, sequence_length, units]
+            ),
+        )
+        attribute_features = tf.cond(
+            sequence_length == 1,
+            lambda: tf.squeeze(attribute_features, axis=1),
+            lambda: attribute_features,
+        )
 
-        attribute_features = tf.squeeze(attribute_features, axis=1)
         return tf.scatter_nd(indices, attribute_features, shape)
 
     def _process_batch_data(
@@ -1395,41 +1452,74 @@ def _batch_predict_entities(
     ) -> Dict[Text, tf.Tensor]:
         predictions: Dict[Text, tf.Tensor] = {}
 
-        sequence_lengths = tf.cast(
-            tf_batch_data[TEXT][SEQUENCE_LENGTH][0], dtype=tf.int32
+        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+
+        # convert the output of the text sequence transformer to shape
+        # batch-size x dialogue length x sequence length x 1
+        text_seq_transformer_output = self._convert_to_original_shape(
+            self.text_seq_transformer_output,
+            tf_batch_data[TEXT][MASK][0],
+            dialogue_lengths,
+        )
+        # convert the output of the text sequence transformer to shape
+        # combined batch dialogue dimension x sequence length x units
+        text_seq_transformer_output = self._combine_batch_and_dialogue_dimension(
+            text_seq_transformer_output, tf_batch_data
         )
-        sequence_lengths = tf.squeeze(sequence_lengths, axis=-1)
 
-        text_seq_transformer_output = self.text_seq_transformer_output
+        # convert the output of the dialogue transformer to shape
+        # combined batch dialogue dimension x sequence length x units
         dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
             self.dialogue_transformer_output, tf_batch_data
         )
 
         # repeat the dialogue transformer output sequence-length-times to get the
         # same shape as the text sequence transformer output
+        sequence_dimension = tf.shape(text_seq_transformer_output)[1]
         dialogue_transformer_output = tf.repeat(
             tf.expand_dims(dialogue_transformer_output, axis=1),
-            text_seq_transformer_output.shape[1],
+            sequence_dimension,
             axis=1,
         )
         # add the output of the dialogue transformer to the output of the text
         # sequence transformer (adding context)
+        # resulting shape
+        # (combined batch and dialogue dimension x sequence length x units)
         text_transformed = tf.add(
             text_seq_transformer_output, dialogue_transformer_output
         )
 
-        if self.max_history_tracker_featurizer_used:
-            dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+        # we need the sequence length and the mask for the CRF layer
+        _sequence_lengths = tf_batch_data[TEXT][SEQUENCE_LENGTH][0]
+        # extract only nonzero lengths and cast to int
+        _sequence_lengths = tf.cast(
+            tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32
+        )
+        # boolean mask returns flat tensor
+        _sequence_lengths = tf.expand_dims(_sequence_lengths, axis=-1)
+        # + 1 for sentence features
+        sequence_lengths = _sequence_lengths + 1
+
+        # convert mask and sequence length to correct shape
+        sequence_lengths = self._convert_to_original_shape(
+            tf.expand_dims(sequence_lengths, axis=-1),
+            tf_batch_data[TEXT][MASK][0],
+            dialogue_lengths,
+        )
+        sequence_lengths = self._combine_batch_and_dialogue_dimension(
+            sequence_lengths, tf_batch_data
+        )
 
+        if self.max_history_tracker_featurizer_used:
             batch_dim = tf.size(dialogue_lengths)
-            # +1 for sentence features
-            sequence_dimension = tf.reduce_max(sequence_lengths) + 1
 
             # the first dimension of text transformed is the combined batch and dialogue
             # dimension, which corresponds to the sum of all dialogue lengths
             # if the max history tracker featurizer is used we just want the last
             # dialogues of every batch example
 
+            # TODO the last dialogue turn might not contain any entities
+
             # get the indices of all last dialogues
             last_dialogue_indices = tf.cumsum(dialogue_lengths) - 1
 
@@ -1445,6 +1535,7 @@ def _batch_predict_entities(
             indices = tf.stack([dialogue_indices, sequence_indices], axis=2)
 
             # get all last dialogues from text_transformed using the above indices
+            # resulting shape (batch size x sequence length x units)
             text_transformed = tf.gather_nd(text_transformed, indices)
             # do the same for the other tensors
             sequence_lengths = tf.gather(
@@ -1452,9 +1543,8 @@ def _batch_predict_entities(
             )
 
         name = ENTITY_ATTRIBUTE_TYPE
-        _input = text_transformed
 
-        _logits = self._tf_layers[f"embed.{name}.logits"](_input)
+        _logits = self._tf_layers[f"embed.{name}.logits"](text_transformed)
         pred_ids, confidences = self._tf_layers[f"crf.{name}"](
             _logits, sequence_lengths - 1
         )
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 46cf8fd5bd66..3c8012f024ba 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -140,6 +140,9 @@ def _validate_number_of_dimensions(
             if isinstance(_sub_array, scipy.sparse.spmatrix):
                 dim = i
                 break
+            if isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0:
+                # sequence dimension is 0, we are dealing with "fake" features
+                return
 
         # If the resulting sub_array is sparse, the remaining number of dimensions
         # should be at least 2
@@ -1122,7 +1125,7 @@ def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray:
         )
 
         data_padded = np.zeros(
-            [combined_dialogue_len, max_seq_len, number_of_features,],
+            [combined_dialogue_len, max_seq_len, number_of_features],
             dtype=array_of_array_of_dense[0][0].dtype,
         )
 
@@ -1225,7 +1228,7 @@ def _4d_scipy_matrix_to_values(
         indices = np.hstack(
             [
                 np.vstack(
-                    [sum(dialogue_len[:i]) + j * np.ones_like(x.row), x.row, x.col,]
+                    [sum(dialogue_len[:i]) + j * np.ones_like(x.row), x.row, x.col]
                 )
                 for i, array_of_sparse in enumerate(array_of_array_of_sparse)
                 for j, x in enumerate(array_of_sparse)

From adea49e8ad50532c6634acae681a44a70e93d8a9 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 10 Nov 2020 10:14:52 +0100
Subject: [PATCH 40/62] use python if instead of tf.cond

---
 rasa/core/policies/ted_policy.py | 137 +++++++++++++++++++------------
 1 file changed, 83 insertions(+), 54 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 685af9d3b6b3..41c29231242b 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -734,9 +734,6 @@ def __init__(
 
         self._prepare_layers()
 
-        self.text_seq_transformer_output: Optional[tf.Tensor] = None
-        self.dialogue_transformer_output: Optional[tf.Tensor] = None
-
     def _check_data(self) -> None:
         if not any(key in [INTENT, TEXT] for key in self.data_signature.keys()):
             raise ValueError(
@@ -841,11 +838,13 @@ def _prepare_encoding_layers(self, name: Text) -> None:
     def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0]
         # labels cannot have all features "fake"
-        all_labels_encoded = {
-            key: self._encode_real_features_per_attribute(self.tf_label_data, key)
-            for key in self.tf_label_data.keys()
-            if key != LABEL_KEY
-        }
+        all_labels_encoded = {}
+        for key in self.tf_label_data.keys():
+            if key != LABEL_KEY:
+                attribute_features, _ = self._encode_real_features_per_attribute(
+                    self.tf_label_data, key
+                )
+                all_labels_encoded[key] = attribute_features
 
         if (
             all_labels_encoded.get(f"{LABEL_KEY}_{ACTION_TEXT}") is not None
@@ -871,7 +870,7 @@ def _emebed_dialogue(
         self,
         dialogue_in: tf.Tensor,
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
-    ) -> Tuple[tf.Tensor, tf.Tensor]:
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """Create dialogue level embedding and mask."""
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
         mask = self._compute_mask(dialogue_lengths)
@@ -881,7 +880,7 @@ def _emebed_dialogue(
         )
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
-        self.dialogue_transformer_output = dialogue_transformed
+        dialogue_transformer_output = dialogue_transformed
 
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
@@ -892,11 +891,11 @@ def _emebed_dialogue(
 
         dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed)
 
-        return dialogue_embed, mask
+        return dialogue_embed, mask, dialogue_transformer_output
 
     def _encode_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
-    ) -> tf.Tensor:
+    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
         # The input is a representation of 4d tensor of
         # shape (batch-size x dialogue-len x sequence-len x units) in 3d of shape
         # (sum of dialogue history length for all tensors in the batch x
@@ -922,7 +921,7 @@ def _encode_features_per_attribute(
 
     def _encode_fake_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
-    ) -> tf.Tensor:
+    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
         attribute_features_list = tf_batch_data[attribute][SENTENCE]
         attribute_mask = tf_batch_data[attribute][MASK][0]
 
@@ -939,11 +938,14 @@ def _encode_fake_features_per_attribute(
                 else:
                     units += f.shape[-1]
 
-        return tf.zeros((batch_dim, dialogue_dim, units), dtype=tf.float32)
+        attribute_features = tf.zeros(
+            (batch_dim, dialogue_dim, units), dtype=tf.float32
+        )
+        return attribute_features, None
 
     def _encode_real_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
-    ) -> tf.Tensor:
+    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
         """Encodes features for a given attribute.
 
         Args:
@@ -954,6 +956,8 @@ def _encode_real_features_per_attribute(
         Returns:
             A tensor combining  all features for `attribute`
         """
+        text_transformer_output = None
+
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
             # sequence_lengths contain `0` for "fake" features, while
             # tf_batch_data[attribute] contain only "real" features
@@ -984,7 +988,7 @@ def _encode_real_features_per_attribute(
             )
 
             if attribute == TEXT:
-                self.text_seq_transformer_output = attribute_features
+                text_transformer_output = attribute_features
 
             # resulting attribute features will have shape
             # combined batch dimension and dialogue length x 1 x units
@@ -1022,12 +1026,17 @@ def _encode_real_features_per_attribute(
         # (combined batch dimension and dialogue length x 1 x units)
         # convert them back to their original shape of
         # batch size x dialogue length x units
-        return self._convert_to_original_shape(
-            attribute_features, attribute_mask, dialogue_lengths
+        attribute_features = self._convert_to_original_shape(
+            attribute_features, attribute_mask, dialogue_lengths, False
         )
 
+        return attribute_features, text_transformer_output
+
     def _batch_loss_entities(
-        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
+        self,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        dialogue_transformer_output: tf.Tensor,
+        text_transformer_output: tf.Tensor,
     ) -> List[tf.Tensor]:
         if ENTITY_ATTRIBUTE_TYPE not in tf_batch_data.get(ENTITIES, {}):
             return []
@@ -1054,7 +1063,7 @@ def _batch_loss_entities(
         tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
         # convert tag ids to shape batch-size x dialogue length x sequence length x 1
         tag_ids = self._convert_to_original_shape(
-            tag_ids, tf_batch_data[ENTITIES][MASK][0], dialogue_lengths
+            tag_ids, tf_batch_data[ENTITIES][MASK][0], dialogue_lengths, True
         )
         # convert tag ids to shape
         # combined batch dialogue dimension x sequence length x 1
@@ -1063,9 +1072,10 @@ def _batch_loss_entities(
         # convert the output of the text sequence transformer to shape
         # batch-size x dialogue length x sequence length x 1
         text_seq_transformer_output = self._convert_to_original_shape(
-            self.text_seq_transformer_output,
+            text_transformer_output,
             tf_batch_data[TEXT][MASK][0],
             dialogue_lengths,
+            True,
         )
         # convert the output of the text sequence transformer to shape
         # combined batch dialogue dimension x sequence length x units
@@ -1076,7 +1086,7 @@ def _batch_loss_entities(
         # convert the output of the dialogue transformer to shape
         # combined batch dialogue dimension x sequence length x units
         dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
-            self.dialogue_transformer_output, tf_batch_data
+            dialogue_transformer_output, tf_batch_data
         )
 
         # repeat the dialogue transformer output sequence-length-times to get the
@@ -1109,13 +1119,14 @@ def _batch_loss_entities(
 
         # convert mask and sequence length to correct shape
         mask = self._convert_to_original_shape(
-            mask, tf_batch_data[TEXT][MASK][0], dialogue_lengths
+            mask, tf_batch_data[TEXT][MASK][0], dialogue_lengths, True
         )
         mask = self._combine_batch_and_dialogue_dimension(mask, tf_batch_data)
         sequence_lengths = self._convert_to_original_shape(
             tf.expand_dims(sequence_lengths, axis=-1),
             tf_batch_data[TEXT][MASK][0],
             dialogue_lengths,
+            True,
         )
         sequence_lengths = self._combine_batch_and_dialogue_dimension(
             sequence_lengths, tf_batch_data
@@ -1209,6 +1220,7 @@ def _convert_to_original_shape(
         attribute_features: tf.Tensor,
         attribute_mask: tf.Tensor,
         dialogue_lengths: tf.Tensor,
+        consider_sequence_dimension: bool,
     ) -> tf.Tensor:
         """Transform attribute features back to original shape.
 
@@ -1265,24 +1277,20 @@ def _convert_to_original_shape(
 
         indices = tf.stack([batch_indices, dialogue_indices], axis=1)
 
-        shape = tf.cond(
-            sequence_length == 1,
-            lambda: tf.convert_to_tensor([batch_dim, dialogue_dim, units]),
-            lambda: tf.convert_to_tensor(
+        if consider_sequence_dimension:
+            shape = tf.convert_to_tensor(
                 [batch_dim, dialogue_dim, sequence_length, units]
-            ),
-        )
-        attribute_features = tf.cond(
-            sequence_length == 1,
-            lambda: tf.squeeze(attribute_features, axis=1),
-            lambda: attribute_features,
-        )
+            )
+            return tf.scatter_nd(indices, attribute_features, shape)
+
+        shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
+        attribute_features = tf.squeeze(attribute_features, axis=1)
 
         return tf.scatter_nd(indices, attribute_features, shape)
 
     def _process_batch_data(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
-    ) -> tf.Tensor:
+    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
         """Encodes batch data.
 
         Combines intent and text and action name and action text if both are present.
@@ -1294,11 +1302,19 @@ def _process_batch_data(
              Tensor: encoding of all features in the batch, combined;
         """
         # encode each attribute present in tf_batch_data
-        batch_encoded = {
-            key: self._encode_features_per_attribute(tf_batch_data, key)
-            for key in tf_batch_data.keys()
-            if LABEL_KEY not in key and DIALOGUE not in key
-        }
+        text_transformer_output = None
+
+        batch_encoded = {}
+        for key in tf_batch_data.keys():
+            if LABEL_KEY not in key and DIALOGUE not in key:
+                attribute_features, _text_transformer_output = self._encode_features_per_attribute(
+                    tf_batch_data, key
+                )
+
+                batch_encoded[key] = attribute_features
+                if _text_transformer_output is not None:
+                    text_transformer_output = _text_transformer_output
+
         # if both action text and action name are present, combine them; otherwise,
         # return the one which is present
 
@@ -1332,7 +1348,7 @@ def _process_batch_data(
 
         batch_features = tf.concat(batch_features, axis=-1)
 
-        return batch_features
+        return batch_features, text_transformer_output
 
     @staticmethod
     def _get_labels_embed(
@@ -1364,8 +1380,8 @@ def batch_loss(
         label_ids = tf_batch_data[LABEL_KEY][LABEL_SUB_KEY][0]
         labels_embed = self._get_labels_embed(label_ids, all_labels_embed)
 
-        dialogue_in = self._process_batch_data(tf_batch_data)
-        dialogue_embed, dialogue_mask = self._emebed_dialogue(
+        dialogue_in, text_transformer_output = self._process_batch_data(tf_batch_data)
+        dialogue_embed, dialogue_mask, dialogue_transformer_output = self._emebed_dialogue(
             dialogue_in, tf_batch_data
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
@@ -1384,10 +1400,14 @@ def batch_loss(
 
         if (
             self.config[ENTITY_RECOGNITION]
-            and self.dialogue_transformer_output is not None
-            and self.text_seq_transformer_output is not None
+            and dialogue_transformer_output is not None
+            and text_transformer_output is not None
         ):
-            losses.extend(self._batch_loss_entities(tf_batch_data))
+            losses.extend(
+                self._batch_loss_entities(
+                    tf_batch_data, dialogue_transformer_output, text_transformer_output
+                )
+            )
 
         self.action_loss.update_state(loss)
         self.action_acc.update_state(acc)
@@ -1418,8 +1438,8 @@ def batch_predict(
             batch_in, self.predict_data_signature
         )
 
-        dialogue_in = self._process_batch_data(tf_batch_data)
-        dialogue_embed, dialogue_mask = self._emebed_dialogue(
+        dialogue_in, text_transformer_output = self._process_batch_data(tf_batch_data)
+        dialogue_embed, dialogue_mask, dialogue_transformer_output = self._emebed_dialogue(
             dialogue_in, tf_batch_data
         )
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
@@ -1428,10 +1448,14 @@ def batch_predict(
 
         if (
             self.config[ENTITY_RECOGNITION]
-            and self.dialogue_transformer_output is not None
-            and self.text_seq_transformer_output is not None
+            and dialogue_transformer_output is not None
+            and text_transformer_output is not None
         ):
-            predictions.update(self._batch_predict_entities(tf_batch_data))
+            predictions.update(
+                self._batch_predict_entities(
+                    tf_batch_data, dialogue_transformer_output, text_transformer_output
+                )
+            )
 
         sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
             dialogue_embed[:, :, tf.newaxis, :],
@@ -1448,7 +1472,10 @@ def batch_predict(
         return predictions
 
     def _batch_predict_entities(
-        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
+        self,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        dialogue_transformer_output: tf.Tensor,
+        text_transformer_output: tf.Tensor,
     ) -> Dict[Text, tf.Tensor]:
         predictions: Dict[Text, tf.Tensor] = {}
 
@@ -1457,9 +1484,10 @@ def _batch_predict_entities(
         # convert the output of the text sequence transformer to shape
         # batch-size x dialogue length x sequence length x 1
         text_seq_transformer_output = self._convert_to_original_shape(
-            self.text_seq_transformer_output,
+            text_transformer_output,
             tf_batch_data[TEXT][MASK][0],
             dialogue_lengths,
+            True,
         )
         # convert the output of the text sequence transformer to shape
         # combined batch dialogue dimension x sequence length x units
@@ -1470,7 +1498,7 @@ def _batch_predict_entities(
         # convert the output of the dialogue transformer to shape
         # combined batch dialogue dimension x sequence length x units
         dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
-            self.dialogue_transformer_output, tf_batch_data
+            dialogue_transformer_output, tf_batch_data
         )
 
         # repeat the dialogue transformer output sequence-length-times to get the
@@ -1505,6 +1533,7 @@ def _batch_predict_entities(
             tf.expand_dims(sequence_lengths, axis=-1),
             tf_batch_data[TEXT][MASK][0],
             dialogue_lengths,
+            True,
         )
         sequence_lengths = self._combine_batch_and_dialogue_dimension(
             sequence_lengths, tf_batch_data

From d3bd22db898131825ac9f2fec7e3b8c907779a0b Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 10 Nov 2020 10:39:45 +0100
Subject: [PATCH 41/62] we need to return a tensor in tf.cond instead of None

---
 rasa/core/policies/ted_policy.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 41c29231242b..698114e06251 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -941,7 +941,7 @@ def _encode_fake_features_per_attribute(
         attribute_features = tf.zeros(
             (batch_dim, dialogue_dim, units), dtype=tf.float32
         )
-        return attribute_features, None
+        return attribute_features, tf.zeros(([1]))
 
     def _encode_real_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
@@ -956,7 +956,7 @@ def _encode_real_features_per_attribute(
         Returns:
             A tensor combining  all features for `attribute`
         """
-        text_transformer_output = None
+        text_transformer_output = tf.zeros([1])
 
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
             # sequence_lengths contain `0` for "fake" features, while
@@ -1071,6 +1071,7 @@ def _batch_loss_entities(
 
         # convert the output of the text sequence transformer to shape
         # batch-size x dialogue length x sequence length x 1
+        # TODO text_transformer_output shape is unknown in non-eager mode
         text_seq_transformer_output = self._convert_to_original_shape(
             text_transformer_output,
             tf_batch_data[TEXT][MASK][0],
@@ -1247,7 +1248,6 @@ def _convert_to_original_shape(
 
         batch_dim = tf.shape(attribute_mask)[0]
         dialogue_dim = tf.shape(attribute_mask)[1]
-        sequence_length = tf.shape(attribute_features)[1]
         units = attribute_features.shape[-1]
 
         # attribute_mask has shape (batch x dialogue_len x 1), remove last dimension
@@ -1278,6 +1278,7 @@ def _convert_to_original_shape(
         indices = tf.stack([batch_indices, dialogue_indices], axis=1)
 
         if consider_sequence_dimension:
+            sequence_length = tf.shape(attribute_features)[1]
             shape = tf.convert_to_tensor(
                 [batch_dim, dialogue_dim, sequence_length, units]
             )
@@ -1302,7 +1303,7 @@ def _process_batch_data(
              Tensor: encoding of all features in the batch, combined;
         """
         # encode each attribute present in tf_batch_data
-        text_transformer_output = None
+        text_transformer_output = tf.zeros([1])
 
         batch_encoded = {}
         for key in tf_batch_data.keys():
@@ -1312,7 +1313,7 @@ def _process_batch_data(
                 )
 
                 batch_encoded[key] = attribute_features
-                if _text_transformer_output is not None:
+                if tf.reduce_max(_text_transformer_output) > 0:
                     text_transformer_output = _text_transformer_output
 
         # if both action text and action name are present, combine them; otherwise,

From cd69de9a3eec71191801c9901275d9fa92507236 Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 10 Nov 2020 12:34:06 +0100
Subject: [PATCH 42/62] create entity tags for all texts

---
 rasa/core/featurizers/single_state_featurizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 7607b37427a9..a39f161faba6 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -136,7 +136,7 @@ def _create_entity_tag_features(
             return []
 
         parsed_text = interpreter.featurize_message(Message({TEXT: sub_state[TEXT]}))
-        entities = [dict(entity) for entity in sub_state[ENTITIES]]
+        entities = sub_state.get(ENTITIES, [])
         tag_id_mapping = self.get_entity_tag_ids()
 
         _tags = []
@@ -261,7 +261,7 @@ def encode_state(
                 state_features.update(
                     self._extract_state_features(sub_state, interpreter, sparse=True)
                 )
-                if sub_state.get(ENTITIES):
+                if sub_state.get(TEXT):
                     state_features[ENTITIES] = self._create_features(
                         sub_state, ENTITIES, sparse=True
                     ) + self._create_entity_tag_features(sub_state, interpreter)

From 8e8af875e990115f87ad1f48515ce99cadd7d2bd Mon Sep 17 00:00:00 2001
From: Tanja Bergmann <tabergma@gmail.com>
Date: Tue, 10 Nov 2020 17:22:07 +0100
Subject: [PATCH 43/62] update batch loss entities (not yet working)

---
 rasa/core/policies/ted_policy.py | 179 ++++++++++++++-----------------
 1 file changed, 83 insertions(+), 96 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 698114e06251..a7cac2b10880 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -941,7 +941,7 @@ def _encode_fake_features_per_attribute(
         attribute_features = tf.zeros(
             (batch_dim, dialogue_dim, units), dtype=tf.float32
         )
-        return attribute_features, tf.zeros(([1]))
+        return attribute_features, tf.zeros(([0, 0, units]))
 
     def _encode_real_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
@@ -956,7 +956,7 @@ def _encode_real_features_per_attribute(
         Returns:
             A tensor combining  all features for `attribute`
         """
-        text_transformer_output = tf.zeros([1])
+        text_transformer_output = tf.zeros([0, 0, 0])
 
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
             # sequence_lengths contain `0` for "fake" features, while
@@ -1037,62 +1037,47 @@ def _batch_loss_entities(
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
         dialogue_transformer_output: tf.Tensor,
         text_transformer_output: tf.Tensor,
-    ) -> List[tf.Tensor]:
+    ) -> tf.Tensor:
         if ENTITY_ATTRIBUTE_TYPE not in tf_batch_data.get(ENTITIES, {}):
-            return []
+            return tf.constant(0)
+
+        # TODO tf.cond
+        if tf.shape(text_transformer_output)[0] == 0:
+            return tf.constant(0)
 
         # To calculate the loss for entities we need the output of the text
         # sequence transformer (shape: combined batch dialogue dimension x
         # sequence length x units), the output of the dialogue transformer
         # (shape: batch size x dialogue length x units) and the tag ids for the
         # entities (shape: combined batch dialogue dimension x sequence length x units)
-        # As the combined batch dialogue dimension for the output of the text sequence
-        # transformer and the tag ids differ, all tensors have different shapes.
+        # The combined batch dialogue dimension for the text sequence transformer
+        # and the tag ids matches.
         # In order to process the tensors, they need to have the same shape.
-        # Convert all tensors to the same
-        #   combined batch dialogue dimension x sequence length x units
-        # shape.
+        # Convert the output of the dialogue transformer to shape
+        # (combined batch dialogue dimension x sequence length x units).
         # Note: The CRF layer cannot handle 4D tensors. E.g. we cannot use the shape
         # batch size x dialogue length x sequence length x units
 
-        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-
         tag_ids = tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]
         # add a zero (no entity) for the sentence features to match the shape of
         # inputs
         tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
-        # convert tag ids to shape batch-size x dialogue length x sequence length x 1
-        tag_ids = self._convert_to_original_shape(
-            tag_ids, tf_batch_data[ENTITIES][MASK][0], dialogue_lengths, True
-        )
-        # convert tag ids to shape
-        # combined batch dialogue dimension x sequence length x 1
-        tag_ids = self._combine_batch_and_dialogue_dimension(tag_ids, tf_batch_data)
-
-        # convert the output of the text sequence transformer to shape
-        # batch-size x dialogue length x sequence length x 1
-        # TODO text_transformer_output shape is unknown in non-eager mode
-        text_seq_transformer_output = self._convert_to_original_shape(
-            text_transformer_output,
-            tf_batch_data[TEXT][MASK][0],
-            dialogue_lengths,
-            True,
-        )
-        # convert the output of the text sequence transformer to shape
-        # combined batch dialogue dimension x sequence length x units
-        text_seq_transformer_output = self._combine_batch_and_dialogue_dimension(
-            text_seq_transformer_output, tf_batch_data
-        )
 
         # convert the output of the dialogue transformer to shape
         # combined batch dialogue dimension x sequence length x units
+        batch_dim = tf.shape(dialogue_transformer_output)[0]
         dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
             dialogue_transformer_output, tf_batch_data
         )
+        # get only the dialogues that contain a user utterance
+        dialogue_transformer_output = tf.boolean_mask(
+            dialogue_transformer_output,
+            tf.squeeze(tf_batch_data[TEXT][SEQUENCE_LENGTH][0], axis=-1),
+        )
 
         # repeat the dialogue transformer output sequence-length-times to get the
         # same shape as the text sequence transformer output
-        sequence_dimension = tf.shape(tag_ids)[1]
+        sequence_dimension = tf.shape(text_transformer_output)[1]
         dialogue_transformer_output = tf.repeat(
             tf.expand_dims(dialogue_transformer_output, axis=1),
             sequence_dimension,
@@ -1102,9 +1087,7 @@ def _batch_loss_entities(
         # sequence transformer (adding context)
         # resulting shape
         # (combined batch and dialogue dimension x sequence length x units)
-        text_transformed = tf.add(
-            text_seq_transformer_output, dialogue_transformer_output
-        )
+        text_transformed = tf.add(text_transformer_output, dialogue_transformer_output)
 
         # we need the sequence length and the mask for the CRF layer
         _sequence_lengths = tf_batch_data[TEXT][SEQUENCE_LENGTH][0]
@@ -1118,35 +1101,48 @@ def _batch_loss_entities(
         sequence_lengths = _sequence_lengths + 1
         mask = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
 
-        # convert mask and sequence length to correct shape
-        mask = self._convert_to_original_shape(
-            mask, tf_batch_data[TEXT][MASK][0], dialogue_lengths, True
-        )
-        mask = self._combine_batch_and_dialogue_dimension(mask, tf_batch_data)
-        sequence_lengths = self._convert_to_original_shape(
-            tf.expand_dims(sequence_lengths, axis=-1),
-            tf_batch_data[TEXT][MASK][0],
-            dialogue_lengths,
-            True,
-        )
-        sequence_lengths = self._combine_batch_and_dialogue_dimension(
-            sequence_lengths, tf_batch_data
-        )
-
         if self.max_history_tracker_featurizer_used:
-            batch_dim = tf.size(dialogue_lengths)
-
-            # the first dimension of text transformed is the combined batch and dialogue
-            # dimension, which corresponds to the sum of all dialogue lengths
             # if the max history tracker featurizer is used we just want the last
-            # dialogues of every batch example
-
-            # TODO the last dialogue turn might not contain any entities
+            # dialogues that contain a user utterance for every batch example
+
+            # the attribute mask indicates which dialogue contains a user utterance
+            attribute_mask = tf_batch_data[TEXT][MASK][0]
+            # get indices of all dialogues that contain a user utterance
+            # shape: (combined batch dialogue dimension x 2)
+            # TODO it seems like there are sometimes dialogues that do not have any
+            #  text features, but that should not be
+            indices_of_text_dialogues = tf.where(
+                tf.not_equal(tf.squeeze(attribute_mask), 0)
+            )
+            # get the index of the last dialogues indices for every batch example
+            indices_of_last_text_dialogue_indices = (
+                tf.cumsum(
+                    tf.squeeze(
+                        tf.cast(tf.reduce_sum(attribute_mask, axis=1), dtype=tf.int32)
+                    )
+                )
+                - 1
+            )
+            # keep only the indices_of_text_dialogues of the last dialogues
+            # resulting shape of indices (batch size x 2)
+            indices_of_text_dialogues = tf.gather(
+                indices_of_text_dialogues, indices_of_last_text_dialogue_indices
+            )
 
-            # get the indices of all last dialogues
-            last_dialogue_indices = tf.cumsum(dialogue_lengths) - 1
+            # We now have the indices of the relevant dialogues. However,
+            # text_transformed has a different shape (first dimension is the combined
+            # batch dialogue dimension). Thus we need to map the
+            # indices_of_text_dialogues into this shape.
+            cumsum_sequence_length = tf.squeeze(
+                tf.cast(tf.cumsum(sequence_lengths, axis=0), dtype=tf.int32)
+            )
+            last_dialogue_indices = tf.map_fn(
+                lambda x: cumsum_sequence_length[x[0]] + x[1],
+                tf.cast(indices_of_text_dialogues, dtype=tf.int32),
+            )
 
-            # build up indices to get the last dialogues from text_transformed
+            # build up indices to get the last dialogues from text_transformed and the
+            # other tensors
             dialogue_indices = tf.repeat(
                 tf.expand_dims(last_dialogue_indices, axis=1),
                 sequence_dimension,
@@ -1163,13 +1159,8 @@ def _batch_loss_entities(
             # do the same for the other tensors
             tag_ids = tf.gather_nd(tag_ids, indices)
             mask = tf.gather_nd(mask, indices)
-            sequence_lengths = tf.gather(
-                tf.squeeze(sequence_lengths), last_dialogue_indices
-            )
-
-            # TODO
-            #  inside the LSTM of the CRF layer the check len(mask.shape) == 2
-            #  fails. mask is created from the sequence length.
+            # as sequence_lengths is a 1D tensor use tf.gather instead of tf.gather_nd
+            sequence_lengths = tf.gather(sequence_lengths, last_dialogue_indices)
 
         loss, f1, _ = self._calculate_entity_loss(
             text_transformed,
@@ -1182,7 +1173,7 @@ def _batch_loss_entities(
         self.entity_loss.update_state(loss)
         self.entity_f1.update_state(f1)
 
-        return [loss]
+        return loss
 
     @staticmethod
     def _combine_batch_and_dialogue_dimension(
@@ -1203,7 +1194,7 @@ def _combine_batch_and_dialogue_dimension(
         """
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
 
-        batch_dim = tf.size(dialogue_lengths)
+        batch_dim = tf.shape(dialogue_lengths)[0]
         batch_indices = tf.repeat(tf.range(batch_dim), dialogue_lengths)
         dialogue_indices = (
             tf.map_fn(
@@ -1303,17 +1294,17 @@ def _process_batch_data(
              Tensor: encoding of all features in the batch, combined;
         """
         # encode each attribute present in tf_batch_data
-        text_transformer_output = tf.zeros([1])
+        text_transformer_output = tf.zeros([0, 0, 0])
 
         batch_encoded = {}
         for key in tf_batch_data.keys():
             if LABEL_KEY not in key and DIALOGUE not in key:
-                attribute_features, _text_transformer_output = self._encode_features_per_attribute(
-                    tf_batch_data, key
-                )
-
+                (
+                    attribute_features,
+                    _text_transformer_output,
+                ) = self._encode_features_per_attribute(tf_batch_data, key)
                 batch_encoded[key] = attribute_features
-                if tf.reduce_max(_text_transformer_output) > 0:
+                if tf.shape(_text_transformer_output)[0] > 0:
                     text_transformer_output = _text_transformer_output
 
         # if both action text and action name are present, combine them; otherwise,
@@ -1382,9 +1373,11 @@ def batch_loss(
         labels_embed = self._get_labels_embed(label_ids, all_labels_embed)
 
         dialogue_in, text_transformer_output = self._process_batch_data(tf_batch_data)
-        dialogue_embed, dialogue_mask, dialogue_transformer_output = self._emebed_dialogue(
-            dialogue_in, tf_batch_data
-        )
+        (
+            dialogue_embed,
+            dialogue_mask,
+            dialogue_transformer_output,
+        ) = self._emebed_dialogue(dialogue_in, tf_batch_data)
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
         losses = []
@@ -1399,12 +1392,8 @@ def batch_loss(
         )
         losses.append(loss)
 
-        if (
-            self.config[ENTITY_RECOGNITION]
-            and dialogue_transformer_output is not None
-            and text_transformer_output is not None
-        ):
-            losses.extend(
+        if self.config[ENTITY_RECOGNITION]:
+            losses.append(
                 self._batch_loss_entities(
                     tf_batch_data, dialogue_transformer_output, text_transformer_output
                 )
@@ -1440,18 +1429,16 @@ def batch_predict(
         )
 
         dialogue_in, text_transformer_output = self._process_batch_data(tf_batch_data)
-        dialogue_embed, dialogue_mask, dialogue_transformer_output = self._emebed_dialogue(
-            dialogue_in, tf_batch_data
-        )
+        (
+            dialogue_embed,
+            dialogue_mask,
+            dialogue_transformer_output,
+        ) = self._emebed_dialogue(dialogue_in, tf_batch_data)
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
         predictions = {}
 
-        if (
-            self.config[ENTITY_RECOGNITION]
-            and dialogue_transformer_output is not None
-            and text_transformer_output is not None
-        ):
+        if self.config[ENTITY_RECOGNITION]:
             predictions.update(
                 self._batch_predict_entities(
                     tf_batch_data, dialogue_transformer_output, text_transformer_output
@@ -1480,6 +1467,8 @@ def _batch_predict_entities(
     ) -> Dict[Text, tf.Tensor]:
         predictions: Dict[Text, tf.Tensor] = {}
 
+        # TODO Update according to batch loss entities
+
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
 
         # convert the output of the text sequence transformer to shape
@@ -1548,8 +1537,6 @@ def _batch_predict_entities(
             # if the max history tracker featurizer is used we just want the last
             # dialogues of every batch example
 
-            # TODO the last dialogue turn might not contain any entities
-
             # get the indices of all last dialogues
             last_dialogue_indices = tf.cumsum(dialogue_lengths) - 1
 

From d1f7e978fb76d420f1cd0493ce167b62be890a80 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Wed, 11 Nov 2020 21:54:09 +0100
Subject: [PATCH 44/62] input to entity loss

---
 .../featurizers/single_state_featurizer.py    | 115 ++---
 rasa/core/featurizers/tracker_featurizers.py  |  92 +++-
 rasa/core/policies/memoization.py             |  19 +-
 rasa/core/policies/policy.py                  |  11 +-
 rasa/core/policies/sklearn_policy.py          |   2 +-
 rasa/core/policies/ted_policy.py              | 466 +++++++++---------
 rasa/nlu/classifiers/diet_classifier.py       |   4 +-
 rasa/shared/core/domain.py                    |  12 +-
 rasa/shared/core/events.py                    |   2 +-
 rasa/shared/core/generator.py                 |  17 +-
 rasa/shared/core/trackers.py                  |  26 +-
 rasa/shared/nlu/constants.py                  |   1 +
 rasa/shared/nlu/training_data/features.py     |  10 -
 rasa/utils/tensorflow/constants.py            |   1 +
 rasa/utils/tensorflow/models.py               |  12 +-
 15 files changed, 411 insertions(+), 379 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index a39f161faba6..e3ed3712ad76 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -1,7 +1,7 @@
 import logging
 import numpy as np
 import scipy.sparse
-from typing import List, Optional, Dict, Text, Set
+from typing import List, Optional, Dict, Text, Set, Any
 from collections import defaultdict
 
 import rasa.shared.utils.io
@@ -17,14 +17,15 @@
     ACTION_TEXT,
     ACTION_NAME,
     INTENT,
-    FEATURE_TYPE_SEQUENCE,
     TEXT,
     NO_ENTITY_TAG,
     ENTITY_ATTRIBUTE_TYPE,
+    ENTITY_TAGS,
 )
 from rasa.shared.nlu.training_data.features import Features
 from rasa.shared.nlu.training_data.message import Message
 from rasa.utils.tensorflow.model_data_utils import TAG_ID_ORIGIN
+from rasa.utils.tensorflow.constants import IDS
 
 logger = logging.getLogger(__name__)
 
@@ -42,6 +43,23 @@ class SingleStateFeaturizer:
     def __init__(self) -> None:
         self._default_feature_states = {}
         self.action_texts = []
+        self.tag_id_mapping = {}
+
+    def get_entity_tag_ids(self) -> Dict[Text, int]:
+        """Returns the tag to index mapping for entities.
+
+        Returns:
+            Tag to index mapping.
+        """
+        if ENTITIES not in self._default_feature_states:
+            return {}
+
+        tag_ids = {
+            tag: idx + 1  # +1 to keep 0 for the NO_ENTITY_TAG
+            for tag, idx in self._default_feature_states[ENTITIES].items()
+        }
+        tag_ids[NO_ENTITY_TAG] = 0
+        return tag_ids
 
     def prepare_from_domain(self, domain: Domain) -> None:
         """Gets necessary information for featurization from domain.
@@ -61,6 +79,7 @@ def convert_to_dict(feature_states: List[Text]) -> Dict[Text, int]:
         self._default_feature_states[SLOTS] = convert_to_dict(domain.slot_states)
         self._default_feature_states[ACTIVE_LOOP] = convert_to_dict(domain.form_names)
         self.action_texts = domain.action_texts
+        self.tag_id_mapping = self.get_entity_tag_ids()
 
     def _state_features_for_attribute(
         self, sub_state: SubState, attribute: Text
@@ -68,7 +87,7 @@ def _state_features_for_attribute(
         if attribute in {INTENT, ACTION_NAME}:
             return {sub_state[attribute]: 1}
         elif attribute == ENTITIES:
-            return {entity["entity"]: 1 for entity in sub_state.get(ENTITIES, [])}
+            return {entity: 1 for entity in sub_state.get(ENTITIES, [])}
         elif attribute == ACTIVE_LOOP:
             return {sub_state["name"]: 1}
         elif attribute == SLOTS:
@@ -104,58 +123,6 @@ def _create_features(
         )
         return [features]
 
-    def get_entity_tag_ids(self) -> Dict[Text, int]:
-        """Returns the tag to index mapping for entities.
-
-        Returns:
-            Tag to index mapping.
-        """
-        if ENTITIES not in self._default_feature_states:
-            return {}
-
-        tag_ids = {
-            tag: idx + 1  # +1 to keep 0 for the NO_ENTITY_TAG
-            for tag, idx in self._default_feature_states[ENTITIES].items()
-        }
-        tag_ids[NO_ENTITY_TAG] = 0
-        return tag_ids
-
-    def _create_entity_tag_features(
-        self, sub_state: SubState, interpreter: NaturalLanguageInterpreter
-    ) -> List["Features"]:
-        from rasa.nlu.test import determine_token_labels
-
-        # TODO
-        #  The entity states used to create the tag-idx-mapping contains the
-        #  entities and the concatenated entity and roles/groups. We do not
-        #  distinguish between entities and roles/groups right now.
-        # TODO
-        #  Should we support BILOU tagging?
-
-        if TEXT not in sub_state:
-            return []
-
-        parsed_text = interpreter.featurize_message(Message({TEXT: sub_state[TEXT]}))
-        entities = sub_state.get(ENTITIES, [])
-        tag_id_mapping = self.get_entity_tag_ids()
-
-        _tags = []
-        for token in parsed_text.get(TOKENS_NAMES[TEXT]):
-            _tag = determine_token_labels(
-                token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE
-            )
-            _tags.append(tag_id_mapping[_tag])
-
-        # transpose to have seq_len x 1
-        return [
-            Features(
-                np.array([_tags]).T,
-                FEATURE_TYPE_SEQUENCE,
-                ENTITY_ATTRIBUTE_TYPE,
-                TAG_ID_ORIGIN,
-            )
-        ]
-
     @staticmethod
     def _to_sparse_sentence_features(
         sparse_sequence_features: List["Features"],
@@ -261,10 +228,10 @@ def encode_state(
                 state_features.update(
                     self._extract_state_features(sub_state, interpreter, sparse=True)
                 )
-                if sub_state.get(TEXT):
+                if sub_state.get(ENTITIES):
                     state_features[ENTITIES] = self._create_features(
                         sub_state, ENTITIES, sparse=True
-                    ) + self._create_entity_tag_features(sub_state, interpreter)
+                    )
 
             if state_type in {SLOTS, ACTIVE_LOOP}:
                 state_features[state_type] = self._create_features(
@@ -273,6 +240,40 @@ def encode_state(
 
         return state_features
 
+    def encode_entity(
+        self, entity_data: Dict[Text, Any], interpreter: NaturalLanguageInterpreter
+    ) -> Dict[Text, List["Features"]]:
+        from rasa.nlu.test import determine_token_labels
+
+        # TODO
+        #  The entity states used to create the tag-idx-mapping contains the
+        #  entities and the concatenated entity and roles/groups. We do not
+        #  distinguish between entities and roles/groups right now.
+        # TODO
+        #  Should we support BILOU tagging?
+
+        if TEXT not in entity_data or len(self.tag_id_mapping) < 2:
+            # we cannot build a classifier if there are fewer than 2 classes
+            return {}
+
+        parsed_text = interpreter.featurize_message(Message({TEXT: entity_data[TEXT]}))
+        entities = entity_data.get(ENTITIES, [])
+
+        _tags = []
+        for token in parsed_text.get(TOKENS_NAMES[TEXT]):
+            _tag = determine_token_labels(
+                token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE
+            )
+            # TODO handle if tag is not in mapping
+            _tags.append(self.tag_id_mapping[_tag])
+
+        # transpose to have seq_len x 1
+        return {
+            ENTITY_TAGS: [
+                Features(np.array([_tags]).T, IDS, ENTITY_TAGS, TAG_ID_ORIGIN,)
+            ]
+        }
+
     def _encode_action(
         self, action: Text, interpreter: NaturalLanguageInterpreter
     ) -> Dict[Text, List["Features"]]:
diff --git a/rasa/core/featurizers/tracker_featurizers.py b/rasa/core/featurizers/tracker_featurizers.py
index 4f250c535ea8..4973b1da98a7 100644
--- a/rasa/core/featurizers/tracker_featurizers.py
+++ b/rasa/core/featurizers/tracker_featurizers.py
@@ -3,15 +3,15 @@
 import jsonpickle
 import logging
 
-from rasa.shared.nlu.constants import TEXT, INTENT
+from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES
 from rasa.shared.exceptions import RasaException
 from tqdm import tqdm
-from typing import Tuple, List, Optional, Dict, Text, Union
+from typing import Tuple, List, Optional, Dict, Text, Union, Any
 import numpy as np
 
 from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
 from rasa.shared.core.domain import State, Domain
-from rasa.shared.core.events import ActionExecuted
+from rasa.shared.core.events import ActionExecuted, UserUttered
 from rasa.shared.core.trackers import (
     DialogueStateTracker,
     is_prev_action_listen_in_state,
@@ -91,6 +91,34 @@ def _convert_labels_to_ids(
             ]
         )
 
+    def _create_entity_tags(
+        self,
+        trackers_as_entities: List[List[Dict[Text, Any]]],
+        interpreter: NaturalLanguageInterpreter,
+    ) -> List[List[Dict[Text, List["Features"]]]]:
+        return [
+            [
+                self.state_featurizer.encode_entity(entity_data, interpreter)
+                for entity_data in trackers_entities
+            ]
+            for trackers_entities in trackers_as_entities
+        ]
+
+    @staticmethod
+    def _entity_data(event: UserUttered) -> Dict[Text, Any]:
+        if event.text:
+            return {TEXT: event.text, ENTITIES: event.entities}
+
+        # input is not textual, so add empty dict
+        return {}
+
+    def training_states_actions_and_entities(
+        self, trackers: List[DialogueStateTracker], domain: Domain
+    ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
+        raise NotImplementedError(
+            "Featurizer must have the capacity to encode trackers to feature vectors"
+        )
+
     def training_states_and_actions(
         self, trackers: List[DialogueStateTracker], domain: Domain
     ) -> Tuple[List[List[State]], List[List[Text]]]:
@@ -103,16 +131,23 @@ def training_states_and_actions(
         Returns:
             A tuple of list of states and list of actions.
         """
-        raise NotImplementedError(
-            "Featurizer must have the capacity to encode trackers to feature vectors"
-        )
+        (
+            trackers_as_states,
+            trackers_as_actions,
+            _,
+        ) = self.training_states_actions_and_entities(trackers, domain)
+        return trackers_as_states, trackers_as_actions
 
     def featurize_trackers(
         self,
         trackers: List[DialogueStateTracker],
         domain: Domain,
         interpreter: NaturalLanguageInterpreter,
-    ) -> Tuple[List[List[Dict[Text, List["Features"]]]], np.ndarray]:
+    ) -> Tuple[
+        List[List[Dict[Text, List["Features"]]]],
+        np.ndarray,
+        List[List[Dict[Text, List["Features"]]]],
+    ]:
         """Featurize the training trackers.
 
         Args:
@@ -137,14 +172,17 @@ def featurize_trackers(
 
         self.state_featurizer.prepare_from_domain(domain)
 
-        trackers_as_states, trackers_as_actions = self.training_states_and_actions(
-            trackers, domain
-        )
+        (
+            trackers_as_states,
+            trackers_as_actions,
+            trackers_as_entities,
+        ) = self.training_states_actions_and_entities(trackers, domain)
 
         tracker_state_features = self._featurize_states(trackers_as_states, interpreter)
         label_ids = self._convert_labels_to_ids(trackers_as_actions, domain)
+        entity_tags = self._create_entity_tags(trackers_as_entities, interpreter)
 
-        return tracker_state_features, label_ids
+        return tracker_state_features, label_ids, entity_tags
 
     @staticmethod
     def _choose_last_user_input(
@@ -252,9 +290,9 @@ class FullDialogueTrackerFeaturizer(TrackerFeaturizer):
     Training data is padded up to the length of the longest dialogue with -1.
     """
 
-    def training_states_and_actions(
+    def training_states_actions_and_entities(
         self, trackers: List[DialogueStateTracker], domain: Domain
-    ) -> Tuple[List[List[State]], List[List[Text]]]:
+    ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
         """Transforms list of trackers to lists of states and actions.
 
         Training data is padded up to the length of the longest dialogue with -1.
@@ -269,6 +307,7 @@ def training_states_and_actions(
 
         trackers_as_states = []
         trackers_as_actions = []
+        trackers_as_entities = []
 
         logger.debug(
             "Creating states and action examples from "
@@ -285,7 +324,12 @@ def training_states_and_actions(
 
             delete_first_state = False
             actions = []
+            entities = []
+            entity_data = {}
             for event in tracker.applied_events():
+                if isinstance(event, UserUttered):
+                    entity_data = self._entity_data(event)
+
                 if not isinstance(event, ActionExecuted):
                     continue
 
@@ -293,6 +337,7 @@ def training_states_and_actions(
                     # only actions which can be
                     # predicted at a stories start
                     actions.append(event.action_name or event.action_text)
+                    entities.append(entity_data)
                 else:
                     # unpredictable actions can be
                     # only the first in the story
@@ -303,13 +348,17 @@ def training_states_and_actions(
                         )
                     delete_first_state = True
 
+                # reset entity_data for the next turn
+                entity_data = {}
+
             if delete_first_state:
                 states = states[1:]
 
             trackers_as_states.append(states[:-1])
             trackers_as_actions.append(actions)
+            trackers_as_entities.append(entities)
 
-        return trackers_as_states, trackers_as_actions
+        return trackers_as_states, trackers_as_actions, trackers_as_entities
 
     def prediction_states(
         self,
@@ -386,9 +435,9 @@ def _hash_example(
         frozen_actions = (action,)
         return hash((frozen_states, frozen_actions))
 
-    def training_states_and_actions(
+    def training_states_actions_and_entities(
         self, trackers: List[DialogueStateTracker], domain: Domain
-    ) -> Tuple[List[List[State]], List[List[Text]]]:
+    ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
         """Transforms list of trackers to lists of states and actions.
 
         Training data is padded up to the length of the longest dialogue with -1.
@@ -403,6 +452,7 @@ def training_states_and_actions(
 
         trackers_as_states = []
         trackers_as_actions = []
+        trackers_as_entities = []
 
         # from multiple states that create equal featurizations
         # we only need to keep one.
@@ -422,7 +472,11 @@ def training_states_and_actions(
             states = self._create_states(tracker, domain)
 
             states_length_for_action = 0
+            entity_data = {}
             for event in tracker.applied_events():
+                if isinstance(event, UserUttered):
+                    entity_data = self._entity_data(event)
+
                 if not isinstance(event, ActionExecuted):
                     continue
 
@@ -448,15 +502,19 @@ def training_states_and_actions(
                         trackers_as_actions.append(
                             [event.action_name or event.action_text]
                         )
+                        trackers_as_entities.append([entity_data])
                 else:
                     trackers_as_states.append(sliced_states)
                     trackers_as_actions.append([event.action_name or event.action_text])
+                    trackers_as_entities.append([entity_data])
 
+                # reset entity_data for the next turn
+                entity_data = {}
                 pbar.set_postfix({"# actions": "{:d}".format(len(trackers_as_actions))})
 
         logger.debug("Created {} action examples.".format(len(trackers_as_actions)))
 
-        return trackers_as_states, trackers_as_actions
+        return trackers_as_states, trackers_as_actions, trackers_as_entities
 
     def prediction_states(
         self,
diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py
index 7170071d17ad..8510ab9c6852 100644
--- a/rasa/core/policies/memoization.py
+++ b/rasa/core/policies/memoization.py
@@ -23,8 +23,6 @@
 from rasa.shared.core.generator import TrackerWithCachedStates
 from rasa.shared.utils.io import is_logging_disabled
 from rasa.core.constants import MEMOIZATION_POLICY_PRIORITY
-from rasa.shared.core.constants import USER
-from rasa.shared.nlu.constants import ENTITIES, ENTITY_ATTRIBUTE_TYPE
 
 logger = logging.getLogger(__name__)
 
@@ -161,22 +159,7 @@ def _create_feature_key(self, states: List[State]) -> Text:
         # we sort keys to make sure that the same states
         # represented as dictionaries have the same json strings
         # quotes are removed for aesthetic reasons
-
-        # Ignore the actual values of entities
-        # We are just interested whether an entity of a certain type was detected or not
-        _states = []
-        for state in states:
-            _state = {}
-            for key, value in state.items():
-                _state[key] = copy.deepcopy(value)
-                if USER == key and ENTITIES in _state[USER]:
-                    _state[USER][ENTITIES] = [
-                        entity[ENTITY_ATTRIBUTE_TYPE]
-                        for entity in _state[USER][ENTITIES]
-                    ]
-            _states.append(_state)
-
-        feature_str = json.dumps(_states, sort_keys=True).replace('"', "")
+        feature_str = json.dumps(states, sort_keys=True).replace('"', "")
         if self.ENABLE_FEATURE_STRING_COMPRESSION:
             compressed = zlib.compress(
                 bytes(feature_str, rasa.shared.utils.io.DEFAULT_ENCODING)
diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py
index 0d5db5a6c01b..6517b8e3362b 100644
--- a/rasa/core/policies/policy.py
+++ b/rasa/core/policies/policy.py
@@ -143,7 +143,11 @@ def featurize_for_training(
         domain: Domain,
         interpreter: NaturalLanguageInterpreter,
         **kwargs: Any,
-    ) -> Tuple[List[List[Dict[Text, List["Features"]]]], np.ndarray]:
+    ) -> Tuple[
+        List[List[Dict[Text, List["Features"]]]],
+        np.ndarray,
+        List[List[Dict[Text, List["Features"]]]],
+    ]:
         """Transform training trackers into a vector representation.
 
         The trackers, consisting of multiple turns, will be transformed
@@ -163,7 +167,7 @@ def featurize_for_training(
               trackers
         """
 
-        state_features, label_ids = self.featurizer.featurize_trackers(
+        state_features, label_ids, entity_tags = self.featurizer.featurize_trackers(
             training_trackers, domain, interpreter
         )
 
@@ -175,8 +179,9 @@ def featurize_for_training(
             )
             state_features = state_features[:max_training_samples]
             label_ids = label_ids[:max_training_samples]
+            entity_tags = entity_tags[:max_training_samples]
 
-        return state_features, label_ids
+        return state_features, label_ids, entity_tags
 
     def train(
         self,
diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py
index 93abf244d931..0126e60e15f7 100644
--- a/rasa/core/policies/sklearn_policy.py
+++ b/rasa/core/policies/sklearn_policy.py
@@ -233,7 +233,7 @@ def train(
         interpreter: NaturalLanguageInterpreter,
         **kwargs: Any,
     ) -> None:
-        tracker_state_features, label_ids = self.featurize_for_training(
+        tracker_state_features, label_ids, _ = self.featurize_for_training(
             training_trackers, domain, interpreter, **kwargs
         )
         training_data, zero_state_features = model_data_utils.convert_to_data_format(
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index a7cac2b10880..65a6c376825e 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -28,6 +28,7 @@
     VALID_FEATURE_TYPES,
     FEATURE_TYPE_SENTENCE,
     ENTITY_ATTRIBUTE_TYPE,
+    ENTITY_TAGS,
 )
 from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter
 from rasa.core.policies.policy import Policy
@@ -45,6 +46,7 @@
 from rasa.utils.tensorflow.model_data_utils import convert_to_data_format
 from rasa.utils.tensorflow.constants import (
     LABEL,
+    IDS,
     TRANSFORMER_SIZE,
     NUM_TRANSFORMER_LAYERS,
     NUM_HEADS,
@@ -104,7 +106,7 @@
 logger = logging.getLogger(__name__)
 
 LABEL_KEY = LABEL
-LABEL_SUB_KEY = "ids"
+LABEL_SUB_KEY = IDS
 LENGTH = "length"
 SENTENCE_FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT]
 SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT, f"{LABEL}_{ACTION_TEXT}"]
@@ -138,7 +140,7 @@ class TEDPolicy(Policy):
         # Hidden layer sizes for layers before the dialogue and label embedding layers.
         # The number of hidden layers is equal to the length of the corresponding
         # list.
-        # TODO add 2 parallel NNs: transformer for text and ffnn for names
+
         # Hidden layer sizes for layers before the embedding layers for user message
         # and labels.
         # The number of hidden layers is equal to the length of the corresponding
@@ -157,10 +159,14 @@ class TEDPolicy(Policy):
         },
         CONCAT_DIMENSION: {TEXT: 128, ACTION_TEXT: 128, f"{LABEL}_{ACTION_TEXT}": 128},
         ENCODING_DIMENSION: 50,
-        # Number of units in transformer
+        # Number of units in sequence transformer
         TRANSFORMER_SIZE: 128,
-        # Number of transformer layers
+        # Number of sequence transformer layers
         NUM_TRANSFORMER_LAYERS: 1,
+        # Number of units in dialogue transformer
+        f"{DIALOGUE}_{TRANSFORMER_SIZE}": 128,
+        # Number of dialogue transformer layers
+        f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}": 1,
         # Number of attention heads in transformer
         NUM_HEADS: 4,
         # If 'True' use key relative embeddings in attention
@@ -360,6 +366,7 @@ def _create_model_data(
         self,
         tracker_state_features: List[List[Dict[Text, List["Features"]]]],
         label_ids: Optional[np.ndarray] = None,
+        entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None,
         encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None,
     ) -> RasaModelData:
         """Combine all model related data into RasaModelData.
@@ -377,7 +384,11 @@ def _create_model_data(
         """
         model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY)
 
-        if label_ids is not None and encoded_all_labels is not None:
+        if (
+            label_ids is not None
+            and entity_tags is not None
+            and encoded_all_labels is not None
+        ):
 
             label_ids = np.array(
                 [np.expand_dims(seq_label_ids, -1) for seq_label_ids in label_ids]
@@ -391,6 +402,19 @@ def _create_model_data(
             attribute_data, self.zero_state_features = convert_to_data_format(
                 tracker_state_features, featurizers=self.config[FEATURIZERS]
             )
+            if self.config[ENTITY_RECOGNITION]:
+                # check that there are real entity tags
+                if any([any(turn_tags) for turn_tags in entity_tags]):
+                    entity_tags_data, _ = convert_to_data_format(entity_tags)
+                    model_data.add_data(entity_tags_data)
+                else:
+                    # there are no "real" entity tags
+                    logger.debug(
+                        f"Entity recognition cannot be performed,"
+                        f"set {ENTITY_RECOGNITION} to False"
+                    )
+                    self.config[ENTITY_RECOGNITION] = False
+
         else:
             # method is called during prediction
             attribute_data, _ = convert_to_data_format(
@@ -401,9 +425,6 @@ def _create_model_data(
 
         model_data.add_data(attribute_data)
         model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE)
-        model_data.add_lengths(
-            ENTITIES, SEQUENCE_LENGTH, ENTITIES, ENTITY_ATTRIBUTE_TYPE
-        )
         model_data.add_lengths(ACTION_TEXT, SEQUENCE_LENGTH, ACTION_TEXT, SEQUENCE)
 
         # add the dialogue lengths
@@ -437,7 +458,7 @@ def train(
             return
 
         # dealing with training data
-        tracker_state_features, label_ids = self.featurize_for_training(
+        tracker_state_features, label_ids, entity_tags = self.featurize_for_training(
             training_trackers, domain, interpreter, **kwargs
         )
 
@@ -447,7 +468,7 @@ def train(
 
         # extract actual training data to feed to model
         model_data = self._create_model_data(
-            tracker_state_features, label_ids, encoded_all_labels
+            tracker_state_features, label_ids, entity_tags, encoded_all_labels
         )
         if model_data.is_empty():
             logger.error(
@@ -768,7 +789,11 @@ def _prepare_layers(self) -> None:
             self._prepare_encoding_layers(name)
 
         self._prepare_transformer_layer(
-            DIALOGUE, self.config[DROP_RATE_DIALOGUE], self.config[DROP_RATE_ATTENTION]
+            DIALOGUE,
+            self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"],
+            self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"],
+            self.config[DROP_RATE_DIALOGUE],
+            self.config[DROP_RATE_ATTENTION],
         )
 
         self._prepare_embed_layers(DIALOGUE)
@@ -841,7 +866,7 @@ def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_labels_encoded = {}
         for key in self.tf_label_data.keys():
             if key != LABEL_KEY:
-                attribute_features, _ = self._encode_real_features_per_attribute(
+                attribute_features, _, _ = self._encode_real_features_per_attribute(
                     self.tf_label_data, key
                 )
                 all_labels_encoded[key] = attribute_features
@@ -880,8 +905,6 @@ def _emebed_dialogue(
         )
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
-        dialogue_transformer_output = dialogue_transformed
-
         if self.max_history_tracker_featurizer_used:
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
@@ -891,11 +914,11 @@ def _emebed_dialogue(
 
         dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed)
 
-        return dialogue_embed, mask, dialogue_transformer_output
+        return dialogue_embed, mask, dialogue_transformed
 
     def _encode_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
-    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         # The input is a representation of 4d tensor of
         # shape (batch-size x dialogue-len x sequence-len x units) in 3d of shape
         # (sum of dialogue history length for all tensors in the batch x
@@ -921,7 +944,7 @@ def _encode_features_per_attribute(
 
     def _encode_fake_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
-    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         attribute_features_list = tf_batch_data[attribute][SENTENCE]
         attribute_mask = tf_batch_data[attribute][MASK][0]
 
@@ -941,11 +964,33 @@ def _encode_fake_features_per_attribute(
         attribute_features = tf.zeros(
             (batch_dim, dialogue_dim, units), dtype=tf.float32
         )
-        return attribute_features, tf.zeros(([0, 0, units]))
+        if attribute == TEXT:
+            # TODO handle the case if transformer is not created
+            # if self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"] > 0:
+            #     units = self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"]
+            # elif self.config[HIDDEN_LAYERS_SIZES][TEXT]:
+            #     units = self.config[HIDDEN_LAYERS_SIZES][TEXT]
+            # else:
+            #     for f in attribute_features_list:
+            #         if isinstance(f, tf.SparseTensor):
+            #             units += self.config[DENSE_DIMENSION][attribute]
+            #         else:
+            #             units += f.shape[-1]
+
+            text_transformer_output = tf.zeros(
+                (0, 0, self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"]), dtype=tf.float32
+            )
+            text_sequence_lengths = tf.zeros((0, 1), dtype=tf.int32)
+        else:
+            # simulate None with empty tensor of zeros
+            text_transformer_output = tf.zeros((0,))
+            text_sequence_lengths = tf.zeros((0,))
+
+        return attribute_features, text_transformer_output, text_sequence_lengths
 
     def _encode_real_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
-    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         """Encodes features for a given attribute.
 
         Args:
@@ -956,23 +1001,25 @@ def _encode_real_features_per_attribute(
         Returns:
             A tensor combining  all features for `attribute`
         """
-        text_transformer_output = tf.zeros([0, 0, 0])
+        # simulate None with empty tensor of zeros
+        text_transformer_output = tf.zeros((0,))
+        text_sequence_lengths = tf.zeros((0,))
 
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
             # sequence_lengths contain `0` for "fake" features, while
             # tf_batch_data[attribute] contain only "real" features
             _sequence_lengths = tf_batch_data[attribute][SEQUENCE_LENGTH][0]
             # extract only nonzero lengths and cast to int
-            _sequence_lengths = tf.cast(
+            sequence_lengths = tf.cast(
                 tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32
             )
             # boolean mask returns flat tensor
-            _sequence_lengths = tf.expand_dims(_sequence_lengths, axis=-1)
+            sequence_lengths = tf.expand_dims(sequence_lengths, axis=-1)
 
             mask_sequence_text = tf.squeeze(
-                self._compute_mask(_sequence_lengths), axis=1
+                self._compute_mask(sequence_lengths), axis=1
             )
-            sequence_lengths = _sequence_lengths + 1
+            sequence_lengths = sequence_lengths + 1
             mask_text = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
 
             attribute_features, _, _, _ = self._create_sequence(
@@ -989,6 +1036,44 @@ def _encode_real_features_per_attribute(
 
             if attribute == TEXT:
                 text_transformer_output = attribute_features
+                text_sequence_lengths = sequence_lengths
+
+                if self.max_history_tracker_featurizer_used:
+                    # get the location of all last dialogue inputs
+                    dialogue_lengths = tf.cast(
+                        tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
+                    )
+                    # TODO precompute dialogue_indices after creation of tf_batch_data
+                    dialogue_indices = (
+                        tf.map_fn(
+                            tf.range,
+                            dialogue_lengths,
+                            fn_output_signature=tf.RaggedTensorSpec(
+                                shape=[None], dtype=tf.int32
+                            ),
+                        )
+                    ).values
+                    last_dialogue_mask = tf.math.logical_not(
+                        tf.cast(
+                            tf.concat(
+                                [dialogue_indices, tf.zeros((1,), dtype=tf.int32)],
+                                axis=0,
+                            )[1:],
+                            dtype=tf.bool,
+                        )
+                    )
+
+                    # get only the indices of real text inputs
+                    last_dialogue_mask = tf.boolean_mask(
+                        last_dialogue_mask, tf.reshape(_sequence_lengths, (-1,))
+                    )
+                    # pick last vector if max history featurizer is used
+                    text_transformer_output = tf.boolean_mask(
+                        text_transformer_output, last_dialogue_mask
+                    )
+                    text_sequence_lengths = tf.boolean_mask(
+                        text_sequence_lengths, last_dialogue_mask
+                    )
 
             # resulting attribute features will have shape
             # combined batch dimension and dialogue length x 1 x units
@@ -1006,7 +1091,7 @@ def _encode_real_features_per_attribute(
                 tf_batch_data[attribute][SENTENCE], f"{attribute}_{SENTENCE}"
             )
 
-        if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE):
+        if attribute in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
             attribute_features = self._tf_layers[f"ffnn.{attribute}"](
                 attribute_features
             )
@@ -1014,7 +1099,7 @@ def _encode_real_features_per_attribute(
         # attribute_mask has shape batch x dialogue_len x 1
         attribute_mask = tf_batch_data[attribute][MASK][0]
 
-        if attribute in set(SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES):
+        if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES:
             dialogue_lengths = tf.cast(
                 tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
             )
@@ -1027,197 +1112,21 @@ def _encode_real_features_per_attribute(
         # convert them back to their original shape of
         # batch size x dialogue length x units
         attribute_features = self._convert_to_original_shape(
-            attribute_features, attribute_mask, dialogue_lengths, False
-        )
-
-        return attribute_features, text_transformer_output
-
-    def _batch_loss_entities(
-        self,
-        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
-        dialogue_transformer_output: tf.Tensor,
-        text_transformer_output: tf.Tensor,
-    ) -> tf.Tensor:
-        if ENTITY_ATTRIBUTE_TYPE not in tf_batch_data.get(ENTITIES, {}):
-            return tf.constant(0)
-
-        # TODO tf.cond
-        if tf.shape(text_transformer_output)[0] == 0:
-            return tf.constant(0)
-
-        # To calculate the loss for entities we need the output of the text
-        # sequence transformer (shape: combined batch dialogue dimension x
-        # sequence length x units), the output of the dialogue transformer
-        # (shape: batch size x dialogue length x units) and the tag ids for the
-        # entities (shape: combined batch dialogue dimension x sequence length x units)
-        # The combined batch dialogue dimension for the text sequence transformer
-        # and the tag ids matches.
-        # In order to process the tensors, they need to have the same shape.
-        # Convert the output of the dialogue transformer to shape
-        # (combined batch dialogue dimension x sequence length x units).
-        # Note: The CRF layer cannot handle 4D tensors. E.g. we cannot use the shape
-        # batch size x dialogue length x sequence length x units
-
-        tag_ids = tf_batch_data[ENTITIES][ENTITY_ATTRIBUTE_TYPE][0]
-        # add a zero (no entity) for the sentence features to match the shape of
-        # inputs
-        tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
-
-        # convert the output of the dialogue transformer to shape
-        # combined batch dialogue dimension x sequence length x units
-        batch_dim = tf.shape(dialogue_transformer_output)[0]
-        dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
-            dialogue_transformer_output, tf_batch_data
-        )
-        # get only the dialogues that contain a user utterance
-        dialogue_transformer_output = tf.boolean_mask(
-            dialogue_transformer_output,
-            tf.squeeze(tf_batch_data[TEXT][SEQUENCE_LENGTH][0], axis=-1),
-        )
-
-        # repeat the dialogue transformer output sequence-length-times to get the
-        # same shape as the text sequence transformer output
-        sequence_dimension = tf.shape(text_transformer_output)[1]
-        dialogue_transformer_output = tf.repeat(
-            tf.expand_dims(dialogue_transformer_output, axis=1),
-            sequence_dimension,
-            axis=1,
-        )
-        # add the output of the dialogue transformer to the output of the text
-        # sequence transformer (adding context)
-        # resulting shape
-        # (combined batch and dialogue dimension x sequence length x units)
-        text_transformed = tf.add(text_transformer_output, dialogue_transformer_output)
-
-        # we need the sequence length and the mask for the CRF layer
-        _sequence_lengths = tf_batch_data[TEXT][SEQUENCE_LENGTH][0]
-        # extract only nonzero lengths and cast to int
-        _sequence_lengths = tf.cast(
-            tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32
-        )
-        # boolean mask returns flat tensor
-        _sequence_lengths = tf.expand_dims(_sequence_lengths, axis=-1)
-        # + 1 for sentence features
-        sequence_lengths = _sequence_lengths + 1
-        mask = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
-
-        if self.max_history_tracker_featurizer_used:
-            # if the max history tracker featurizer is used we just want the last
-            # dialogues that contain a user utterance for every batch example
-
-            # the attribute mask indicates which dialogue contains a user utterance
-            attribute_mask = tf_batch_data[TEXT][MASK][0]
-            # get indices of all dialogues that contain a user utterance
-            # shape: (combined batch dialogue dimension x 2)
-            # TODO it seems like there are sometimes dialogues that do not have any
-            #  text features, but that should not be
-            indices_of_text_dialogues = tf.where(
-                tf.not_equal(tf.squeeze(attribute_mask), 0)
-            )
-            # get the index of the last dialogues indices for every batch example
-            indices_of_last_text_dialogue_indices = (
-                tf.cumsum(
-                    tf.squeeze(
-                        tf.cast(tf.reduce_sum(attribute_mask, axis=1), dtype=tf.int32)
-                    )
-                )
-                - 1
-            )
-            # get only those the indices_of_text_dialogues of the last dialogues
-            # resulting shape of indices (batch size x 2)
-            indices_of_text_dialogues = tf.gather(
-                indices_of_text_dialogues, indices_of_last_text_dialogue_indices
-            )
-
-            # We now hove the indices of the relevant dialogues. However,
-            # text_transformed has a different shape (first dimension is the combined
-            # batch dialogue dimension). Thus we need to map the
-            # indices_of_text_dialogues into this shape.
-            cumsum_sequence_length = tf.squeeze(
-                tf.cast(tf.cumsum(sequence_lengths, axis=0), dtype=tf.int32)
-            )
-            last_dialogue_indices = tf.map_fn(
-                lambda x: cumsum_sequence_length[x[0]] + x[1],
-                tf.cast(indices_of_text_dialogues, dtype=tf.int32),
-            )
-
-            # build up indices to get the last dialogues from text_transformed and the
-            # other tensors
-            dialogue_indices = tf.repeat(
-                tf.expand_dims(last_dialogue_indices, axis=1),
-                sequence_dimension,
-                axis=1,
-            )
-            sequence_indices = tf.repeat(
-                tf.expand_dims(tf.range(sequence_dimension), axis=0), batch_dim, axis=0
-            )
-            indices = tf.stack([dialogue_indices, sequence_indices], axis=2)
-
-            # get all last dialogues from text_transformed using the above indices
-            # resulting shape (batch size x sequence length x units)
-            text_transformed = tf.gather_nd(text_transformed, indices)
-            # do the same for the other tensors
-            tag_ids = tf.gather_nd(tag_ids, indices)
-            mask = tf.gather_nd(mask, indices)
-            # as sequence_lengths is a 1D tensor use tf.gather instead of tf.gather_nd
-            sequence_lengths = tf.gather(sequence_lengths, last_dialogue_indices)
-
-        loss, f1, _ = self._calculate_entity_loss(
-            text_transformed,
-            tag_ids,
-            mask,
-            tf.squeeze(sequence_lengths),
-            ENTITY_ATTRIBUTE_TYPE,
+            attribute_features, attribute_mask, dialogue_lengths
         )
 
-        self.entity_loss.update_state(loss)
-        self.entity_f1.update_state(f1)
-
-        return loss
-
-    @staticmethod
-    def _combine_batch_and_dialogue_dimension(
-        tensor: tf.Tensor, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
-    ):
-        """Combines the batch and dialogue dimension of the given tensor.
-
-        Before the tensor has shape (batch-size x dialogue-length x ...).
-        Afterwards the tensor will have shape
-        (combined batch and dialogue dimension x ...).
-
-        Args:
-            tensor: The tensor
-            tf_batch_data: the batch data
-
-        Returns:
-            The converted tensor
-        """
-        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-
-        batch_dim = tf.shape(dialogue_lengths)[0]
-        batch_indices = tf.repeat(tf.range(batch_dim), dialogue_lengths)
-        dialogue_indices = (
-            tf.map_fn(
-                tf.range,
-                dialogue_lengths,
-                fn_output_signature=tf.RaggedTensorSpec(shape=[None], dtype=tf.int32),
-            )
-        ).values
-        indices = tf.stack([batch_indices, dialogue_indices], axis=1)
-
-        return tf.gather_nd(tensor, indices)
+        return attribute_features, text_transformer_output, text_sequence_lengths
 
     @staticmethod
     def _convert_to_original_shape(
         attribute_features: tf.Tensor,
         attribute_mask: tf.Tensor,
         dialogue_lengths: tf.Tensor,
-        consider_sequence_dimension: bool,
     ) -> tf.Tensor:
         """Transform attribute features back to original shape.
 
-        Given shape: combined batch and dialogue dimension x sequence length x units
-        Original shape: batch x dialogue length x sequence length x units
+        Given shape: (combined batch and dialogue dimension x 1 x units)
+        Original shape: (batch x dialogue length x units)
 
         Args:
             attribute_features: the "real" features to convert
@@ -1231,8 +1140,8 @@ def _convert_to_original_shape(
         """
 
         # in order to convert the attribute features with shape
-        # combined batch-size and dialogue length x sequence length x units
-        # to a shape of batch-size x dialogue length x sequence length x units
+        # (combined batch-size and dialogue length x 1 x units)
+        # to a shape of (batch-size x dialogue length x units)
         # we use tf.scatter_nd. Therefore, we need the target shape and the indices
         # mapping the values of attribute features to the position in the resulting
         # tensor.
@@ -1247,7 +1156,7 @@ def _convert_to_original_shape(
         non_fake_dialogue_lengths = tf.reduce_sum(attribute_mask, axis=-1)
         # create the batch indices
         batch_indices = tf.repeat(tf.range(batch_dim), non_fake_dialogue_lengths)
-
+        # TODO precompute dialogue_indices after creation of tf_batch_data
         dialogue_indices = (
             tf.map_fn(
                 tf.range,
@@ -1268,21 +1177,108 @@ def _convert_to_original_shape(
 
         indices = tf.stack([batch_indices, dialogue_indices], axis=1)
 
-        if consider_sequence_dimension:
-            sequence_length = tf.shape(attribute_features)[1]
-            shape = tf.convert_to_tensor(
-                [batch_dim, dialogue_dim, sequence_length, units]
-            )
-            return tf.scatter_nd(indices, attribute_features, shape)
-
         shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units])
         attribute_features = tf.squeeze(attribute_features, axis=1)
 
         return tf.scatter_nd(indices, attribute_features, shape)
 
+    def _batch_loss_entities(
+        self,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        dialogue_transformer_output: tf.Tensor,
+        text_transformer_output: tf.Tensor,
+        text_sequence_lengths: tf.Tensor,
+    ) -> tf.Tensor:
+
+        return tf.cond(
+            tf.shape(text_transformer_output)[0] > 0,
+            lambda: self._real_batch_loss_entities(
+                tf_batch_data,
+                dialogue_transformer_output,
+                text_transformer_output,
+                text_sequence_lengths,
+            ),
+            lambda: tf.constant(0.0),
+        )
+
+    def _real_batch_loss_entities(
+        self,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        dialogue_transformer_output: tf.Tensor,
+        text_transformer_output: tf.Tensor,
+        text_sequence_lengths: tf.Tensor,
+    ) -> tf.Tensor:
+        # To calculate the loss for entities we need the output of the text
+        # sequence transformer (shape: real entity dim x
+        # sequence length x units), the output of the dialogue transformer
+        # (shape: batch size x dialogue length x units) and the tag ids for the
+        # entities (shape: real entity dim x sequence length - 1 x units)
+        # The real entity dimension for the text sequence transformer
+        # and the tag ids matches.
+        # In order to process the tensors, they need to have the same shape.
+        # Convert the output of the dialogue transformer to shape
+        # (real entity dim x 1 x units).
+        # Note: The CRF layer cannot handle 4D tensors. E.g. we cannot use the shape
+        # batch size x dialogue length x sequence length x units
+
+        # convert the output of the dialogue transformer
+        # to shape (real entity dim x 1 x units)
+        attribute_mask = tf_batch_data[TEXT][MASK][0]
+        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
+
+        if self.max_history_tracker_featurizer_used:
+            # pick last vector if max history featurizer is used
+            attribute_mask = tf.expand_dims(
+                self._last_token(attribute_mask, dialogue_lengths), axis=1
+            )
+        dialogue_transformer_output = tf.boolean_mask(
+            dialogue_transformer_output, tf.squeeze(attribute_mask, axis=-1)
+        )
+
+        # boolean mask removed axis=1, add it back
+        dialogue_transformer_output = tf.expand_dims(
+            dialogue_transformer_output, axis=1
+        )
+
+        # broadcast the dialogue transformer output sequence-length-times to get the
+        # same shape as the text sequence transformer output
+        dialogue_transformer_output = tf.broadcast_to(
+            dialogue_transformer_output, tf.shape(text_transformer_output)
+        )
+
+        # concat the output of the dialogue transformer to the output of the text
+        # sequence transformer (adding context)
+        # resulting shape
+        # (real entity dim x sequence length x 2 units)
+        text_transformed = tf.concat(
+            [text_transformer_output, dialogue_transformer_output], axis=-1
+        )
+
+        mask = tf.squeeze(self._compute_mask(text_sequence_lengths), axis=1)
+        # remove additional dims and sentence features
+        text_sequence_lengths = tf.reshape(text_sequence_lengths, (-1,)) - 1
+
+        tag_ids = tf_batch_data[ENTITY_TAGS][IDS][0]
+        # add a zero (no entity) for the sentence features to match the shape of
+        # inputs
+        tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
+
+        loss, f1, _ = self._calculate_entity_loss(
+            text_transformed,
+            tag_ids,
+            mask,
+            text_sequence_lengths,
+            ENTITY_ATTRIBUTE_TYPE,
+        )
+
+        self.entity_loss.update_state(loss)
+        self.entity_f1.update_state(f1)
+
+        return loss
+
     def _process_batch_data(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
-    ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]:
+    ) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]:
         """Encodes batch data.
 
         Combines intent and text and action name and action text if both are present.
@@ -1294,18 +1290,21 @@ def _process_batch_data(
              Tensor: encoding of all features in the batch, combined;
         """
         # encode each attribute present in tf_batch_data
-        text_transformer_output = tf.zeros([0, 0, 0])
-
+        text_transformer_output = None
+        text_sequence_lengths = None
         batch_encoded = {}
-        for key in tf_batch_data.keys():
-            if LABEL_KEY not in key and DIALOGUE not in key:
+        for attribute in tf_batch_data.keys():
+            if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES:
                 (
                     attribute_features,
                     _text_transformer_output,
-                ) = self._encode_features_per_attribute(tf_batch_data, key)
-                batch_encoded[key] = attribute_features
-                if tf.shape(_text_transformer_output)[0] > 0:
+                    _text_sequence_lengths,
+                ) = self._encode_features_per_attribute(tf_batch_data, attribute)
+
+                batch_encoded[attribute] = attribute_features
+                if attribute == TEXT:
                     text_transformer_output = _text_transformer_output
+                    text_sequence_lengths = _text_sequence_lengths
 
         # if both action text and action name are present, combine them; otherwise,
         # return the one which is present
@@ -1340,7 +1339,7 @@ def _process_batch_data(
 
         batch_features = tf.concat(batch_features, axis=-1)
 
-        return batch_features, text_transformer_output
+        return batch_features, text_transformer_output, text_sequence_lengths
 
     @staticmethod
     def _get_labels_embed(
@@ -1372,7 +1371,11 @@ def batch_loss(
         label_ids = tf_batch_data[LABEL_KEY][LABEL_SUB_KEY][0]
         labels_embed = self._get_labels_embed(label_ids, all_labels_embed)
 
-        dialogue_in, text_transformer_output = self._process_batch_data(tf_batch_data)
+        (
+            dialogue_in,
+            text_transformer_output,
+            text_sequence_lengths,
+        ) = self._process_batch_data(tf_batch_data)
         (
             dialogue_embed,
             dialogue_mask,
@@ -1392,10 +1395,17 @@ def batch_loss(
         )
         losses.append(loss)
 
-        if self.config[ENTITY_RECOGNITION]:
+        if (
+            self.config[ENTITY_RECOGNITION]
+            and text_transformer_output is not None
+            and text_sequence_lengths is not None
+        ):
             losses.append(
                 self._batch_loss_entities(
-                    tf_batch_data, dialogue_transformer_output, text_transformer_output
+                    tf_batch_data,
+                    dialogue_transformer_output,
+                    text_transformer_output,
+                    text_sequence_lengths,
                 )
             )
 
diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py
index d6d74c63eaec..eb9268c02887 100644
--- a/rasa/nlu/classifiers/diet_classifier.py
+++ b/rasa/nlu/classifiers/diet_classifier.py
@@ -44,6 +44,7 @@
 from rasa.nlu.model import Metadata
 from rasa.utils.tensorflow.constants import (
     LABEL,
+    IDS,
     HIDDEN_LAYERS_SIZES,
     SHARE_HIDDEN_LAYERS,
     TRANSFORMER_SIZE,
@@ -101,8 +102,7 @@
 SPARSE = "sparse"
 DENSE = "dense"
 LABEL_KEY = LABEL
-LABEL_SUB_KEY = "ids"
-TAG_IDS = "tag_ids"
+LABEL_SUB_KEY = IDS
 
 POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP]
 
diff --git a/rasa/shared/core/domain.py b/rasa/shared/core/domain.py
index d2afb3f84c81..0072a1e4f975 100644
--- a/rasa/shared/core/domain.py
+++ b/rasa/shared/core/domain.py
@@ -67,7 +67,7 @@
 # State is a dictionary with keys (USER, PREVIOUS_ACTION, SLOTS, ACTIVE_LOOP)
 # representing the origin of a SubState;
 # the values are SubStates, that contain the information needed for featurization
-SubState = Dict[Text, Union[Text, Tuple[Union[float, Text, Dict]]]]
+SubState = Dict[Text, Union[Text, Tuple[Union[float, Text]]]]
 State = Dict[Text, SubState]
 
 logger = logging.getLogger(__name__)
@@ -822,9 +822,7 @@ def input_states(self) -> List[Text]:
             + self.form_names
         )
 
-    def _get_featurized_entities(
-        self, latest_message: UserUttered
-    ) -> List[Dict[Text, Any]]:
+    def _get_featurized_entities(self, latest_message: UserUttered) -> Set[Text]:
         intent_name = latest_message.intent.get(
             rasa.shared.nlu.constants.INTENT_NAME_KEY
         )
@@ -855,11 +853,7 @@ def _get_featurized_entities(
         # concatenated entity labels with their corresponding roles and groups labels
         wanted_entities = set(intent_config.get(USED_ENTITIES_KEY, entity_names))
 
-        return [
-            entity
-            for entity in latest_message.entities
-            if entity["entity"] in entity_names & wanted_entities
-        ]
+        return entity_names & wanted_entities
 
     def _get_user_sub_state(
         self, tracker: "DialogueStateTracker"
diff --git a/rasa/shared/core/events.py b/rasa/shared/core/events.py
index 1f3f94240eaf..c254649a9870 100644
--- a/rasa/shared/core/events.py
+++ b/rasa/shared/core/events.py
@@ -458,7 +458,7 @@ def as_sub_state(self) -> Dict[Text, Union[None, Text, List[Optional[Text]]]]:
         if self.intent_name and not self.use_text_for_featurization:
             out[INTENT] = self.intent_name
         if entities:
-            out[ENTITIES] = self.entities
+            out[ENTITIES] = entities
 
         return out
 
diff --git a/rasa/shared/core/generator.py b/rasa/shared/core/generator.py
index dcb69f20cea4..994ee52fedaf 100644
--- a/rasa/shared/core/generator.py
+++ b/rasa/shared/core/generator.py
@@ -31,7 +31,6 @@
 )
 from rasa.shared.utils.io import is_logging_disabled
 import rasa.shared.utils.io
-from rasa.shared.nlu.constants import ENTITIES
 
 logger = logging.getLogger(__name__)
 
@@ -103,18 +102,10 @@ def past_states_for_hashing(self, domain: Domain) -> Deque[FrozenState]:
 
     @staticmethod
     def _unfreeze_states(frozen_states: Deque[FrozenState]) -> List[State]:
-        states = []
-        for frozen_state in frozen_states:
-            state_dict = {}
-            for key, value in dict(frozen_state).items():
-                _value = dict(value)
-                if ENTITIES in _value:
-                    _value[ENTITIES] = [
-                        dict(frozen_entity) for frozen_entity in _value[ENTITIES]
-                    ]
-                state_dict[key] = _value
-            states.append(state_dict)
-        return states
+        return [
+            {key: dict(value) for key, value in dict(frozen_state).items()}
+            for frozen_state in frozen_states
+        ]
 
     def past_states(self, domain: Domain) -> List[State]:
         states_for_hashing = self.past_states_for_hashing(domain)
diff --git a/rasa/shared/core/trackers.py b/rasa/shared/core/trackers.py
index 247c6825ba65..47c1c7744362 100644
--- a/rasa/shared/core/trackers.py
+++ b/rasa/shared/core/trackers.py
@@ -29,7 +29,6 @@
     ENTITY_ATTRIBUTE_ROLE,
     ACTION_TEXT,
     ACTION_NAME,
-    ENTITIES,
 )
 from rasa.shared.core import events
 from rasa.shared.core.constants import (
@@ -69,9 +68,7 @@
 logger = logging.getLogger(__name__)
 
 # same as State but with Dict[...] substituted with FrozenSet[Tuple[...]]
-FrozenState = FrozenSet[
-    Tuple[Text, FrozenSet[Tuple[Text, Tuple[Union[float, Text, FrozenSet]]]]]
-]
+FrozenState = FrozenSet[Tuple[Text, FrozenSet[Tuple[Text, Tuple[Union[float, Text]]]]]]
 
 
 class EventVerbosity(Enum):
@@ -234,19 +231,14 @@ def _events_for_verbosity(
 
     @staticmethod
     def freeze_current_state(state: State) -> FrozenState:
-        state_copy = copy.deepcopy(state)
-        frozen_state = {}
-        for key, values in state_copy.items():
-            if isinstance(values, dict):
-                if ENTITIES in values and isinstance(values[ENTITIES][0], dict):
-                    values[ENTITIES] = tuple(
-                        [frozenset(e.items()) for e in values[ENTITIES]]
-                    )
-                frozen_state[key] = frozenset(values.items())
-            else:
-                frozen_state[key] = frozenset(values)
-
-        return frozenset(frozen_state.items())
+        return frozenset(
+            {
+                key: frozenset(values.items())
+                if isinstance(values, Dict)
+                else frozenset(values)
+                for key, values in state.items()
+            }.items()
+        )
 
     def past_states(self, domain: Domain) -> List[State]:
         """Generate the past states of this tracker based on the history.
diff --git a/rasa/shared/nlu/constants.py b/rasa/shared/nlu/constants.py
index ee85a005f935..53040f0d4c53 100644
--- a/rasa/shared/nlu/constants.py
+++ b/rasa/shared/nlu/constants.py
@@ -26,6 +26,7 @@
 TRAINABLE_EXTRACTORS = {"MitieEntityExtractor", "CRFEntityExtractor", "DIETClassifier"}
 
 ENTITIES = "entities"
+ENTITY_TAGS = "entity_tags"
 ENTITY_ATTRIBUTE_TYPE = "entity"
 ENTITY_ATTRIBUTE_GROUP = "group"
 ENTITY_ATTRIBUTE_ROLE = "role"
diff --git a/rasa/shared/nlu/training_data/features.py b/rasa/shared/nlu/training_data/features.py
index c556d6e6c3ff..755215fae35e 100644
--- a/rasa/shared/nlu/training_data/features.py
+++ b/rasa/shared/nlu/training_data/features.py
@@ -16,21 +16,11 @@ def __init__(
         attribute: Text,
         origin: Union[Text, List[Text]],
     ) -> None:
-        self._validate_feature_type(feature_type)
-
         self.features = features
         self.type = feature_type
         self.origin = origin
         self.attribute = attribute
 
-    @staticmethod
-    def _validate_feature_type(feature_type: Text) -> None:
-        if feature_type not in VALID_FEATURE_TYPES:
-            raise ValueError(
-                f"Invalid feature type '{feature_type}' used. Valid feature types are: "
-                f"{VALID_FEATURE_TYPES}."
-            )
-
     def is_sparse(self) -> bool:
         """Checks if features are sparse or not.
 
diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py
index 06f81775a673..7957e84f8351 100644
--- a/rasa/utils/tensorflow/constants.py
+++ b/rasa/utils/tensorflow/constants.py
@@ -1,6 +1,7 @@
 # constants for configuration parameters of our tensorflow models
 
 LABEL = "label"
+IDS = "ids"
 HIDDEN_LAYERS_SIZES = "hidden_layers_sizes"
 SHARE_HIDDEN_LAYERS = "share_hidden_layers"
 
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index bfe483ebffef..6b1242d90a0a 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -732,14 +732,16 @@ def _prepare_ffnn_layer(
     def _prepare_transformer_layer(
         self,
         name: Text,
+        num_layers: int,
+        units: int,
         drop_rate: float,
         drop_rate_attention: float,
         prefix: Text = "transformer",
     ):
         if self.config[NUM_TRANSFORMER_LAYERS] > 0:
             self._tf_layers[f"{prefix}.{name}"] = TransformerEncoder(
-                self.config[NUM_TRANSFORMER_LAYERS],
-                self.config[TRANSFORMER_SIZE],
+                num_layers,
+                units,
                 self.config[NUM_HEADS],
                 self.config[TRANSFORMER_SIZE] * 4,
                 self.config[REGULARIZATION_CONSTANT],
@@ -834,7 +836,11 @@ def _prepare_input_layers(self, name: Text) -> None:
     def _prepare_sequence_layers(self, name: Text) -> None:
         self._prepare_input_layers(name)
         self._prepare_transformer_layer(
-            name, self.config[DROP_RATE], self.config[DROP_RATE_ATTENTION]
+            name,
+            self.config[NUM_TRANSFORMER_LAYERS],
+            self.config[TRANSFORMER_SIZE],
+            self.config[DROP_RATE],
+            self.config[DROP_RATE_ATTENTION],
         )
 
     def _prepare_entity_recognition_layers(self) -> None:

From d9a5378d59320637a8ac6e96259017ddadde9953 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 12:16:48 +0100
Subject: [PATCH 45/62] update entity prediction

---
 rasa/core/policies/ted_policy.py | 373 ++++++++++++++-----------------
 1 file changed, 168 insertions(+), 205 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 65a6c376825e..e6f0e44d0455 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -140,7 +140,6 @@ class TEDPolicy(Policy):
         # Hidden layer sizes for layers before the dialogue and label embedding layers.
         # The number of hidden layers is equal to the length of the corresponding
         # list.
-
         # Hidden layer sizes for layers before the embedding layers for user message
         # and labels.
         # The number of hidden layers is equal to the length of the corresponding
@@ -685,9 +684,9 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
             model_data_example,
             data_signature=model_data_example.get_signature(),
             config=meta,
-            max_history_tracker_featurizer_used=isinstance(
-                featurizer, MaxHistoryTrackerFeaturizer
-            ),
+            # during prediction we don't care about previous dialogue turns,
+            # so to save computation time, use only the last one
+            use_only_last_dialogue_turn=True,
             label_data=label_data,
             entity_tag_specs=entity_tag_specs,
         )
@@ -721,13 +720,13 @@ def __init__(
         self,
         data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]],
         config: Dict[Text, Any],
-        max_history_tracker_featurizer_used: bool,
+        use_only_last_dialogue_turn: bool,
         label_data: RasaModelData,
         entity_tag_specs: Optional[List[EntityTagSpec]],
     ) -> None:
         super().__init__("TED", config, data_signature, label_data)
 
-        self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used
+        self.use_only_last_dialogue_turn = use_only_last_dialogue_turn
 
         self.predict_data_signature = {
             feature_name: features
@@ -775,6 +774,8 @@ def _check_data(self) -> None:
                 f"Cannot train '{self.__class__.__name__}' model."
             )
 
+    # ---CREATING LAYERS HELPERS---
+
     def _prepare_layers(self) -> None:
         for name in self.data_signature.keys():
             self._prepare_sparse_dense_layer_for(name, self.data_signature)
@@ -860,6 +861,8 @@ def _prepare_encoding_layers(self, name: Text) -> None:
             self.config[DROP_RATE_DIALOGUE],
         )
 
+    # ---GRAPH BUILDING HELPERS---
+
     def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0]
         # labels cannot have all features "fake"
@@ -905,7 +908,7 @@ def _emebed_dialogue(
         )
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
-        if self.max_history_tracker_featurizer_used:
+        if self.use_only_last_dialogue_turn:
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
                 self._last_token(dialogue_transformed, dialogue_lengths), 1
@@ -1038,7 +1041,7 @@ def _encode_real_features_per_attribute(
                 text_transformer_output = attribute_features
                 text_sequence_lengths = sequence_lengths
 
-                if self.max_history_tracker_featurizer_used:
+                if self.use_only_last_dialogue_turn:
                     # get the location of all last dialogue inputs
                     dialogue_lengths = tf.cast(
                         tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
@@ -1182,32 +1185,78 @@ def _convert_to_original_shape(
 
         return tf.scatter_nd(indices, attribute_features, shape)
 
-    def _batch_loss_entities(
-        self,
-        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
-        dialogue_transformer_output: tf.Tensor,
-        text_transformer_output: tf.Tensor,
-        text_sequence_lengths: tf.Tensor,
-    ) -> tf.Tensor:
+    def _process_batch_data(
+        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
+    ) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]:
+        """Encodes batch data.
 
-        return tf.cond(
-            tf.shape(text_transformer_output)[0] > 0,
-            lambda: self._real_batch_loss_entities(
-                tf_batch_data,
-                dialogue_transformer_output,
-                text_transformer_output,
-                text_sequence_lengths,
-            ),
-            lambda: tf.constant(0.0),
-        )
+        Combines intent and text and action name and action text if both are present.
 
-    def _real_batch_loss_entities(
+        Args:
+            tf_batch_data: dictionary mapping every attribute to its features and masks
+
+        Returns:
+             Tensor: encoding of all features in the batch, combined;
+        """
+        # encode each attribute present in tf_batch_data
+        text_transformer_output = None
+        text_sequence_lengths = None
+        batch_encoded = {}
+        for attribute in tf_batch_data.keys():
+            if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES:
+                (
+                    attribute_features,
+                    _text_transformer_output,
+                    _text_sequence_lengths,
+                ) = self._encode_features_per_attribute(tf_batch_data, attribute)
+
+                batch_encoded[attribute] = attribute_features
+                if attribute == TEXT:
+                    text_transformer_output = _text_transformer_output
+                    text_sequence_lengths = _text_sequence_lengths
+
+        # if both action text and action name are present, combine them; otherwise,
+        # return the one which is present
+
+        if (
+            batch_encoded.get(ACTION_TEXT) is not None
+            and batch_encoded.get(ACTION_NAME) is not None
+        ):
+            batch_action = batch_encoded.pop(ACTION_TEXT) + batch_encoded.pop(
+                ACTION_NAME
+            )
+        elif batch_encoded.get(ACTION_TEXT) is not None:
+            batch_action = batch_encoded.pop(ACTION_TEXT)
+        else:
+            batch_action = batch_encoded.pop(ACTION_NAME)
+        # same for user input
+        if (
+            batch_encoded.get(INTENT) is not None
+            and batch_encoded.get(TEXT) is not None
+        ):
+            batch_user = batch_encoded.pop(INTENT) + batch_encoded.pop(TEXT)
+        elif batch_encoded.get(TEXT) is not None:
+            batch_user = batch_encoded.pop(TEXT)
+        else:
+            batch_user = batch_encoded.pop(INTENT)
+
+        batch_features = [batch_user, batch_action]
+        # once we have user input and previous action,
+        # add all other attributes (SLOTS, ACTIVE_LOOP, etc.) to batch_features;
+        for key in batch_encoded.keys():
+            batch_features.append(batch_encoded.get(key))
+
+        batch_features = tf.concat(batch_features, axis=-1)
+
+        return batch_features, text_transformer_output, text_sequence_lengths
+
+    def _reshape_for_entities(
         self,
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
         dialogue_transformer_output: tf.Tensor,
         text_transformer_output: tf.Tensor,
         text_sequence_lengths: tf.Tensor,
-    ) -> tf.Tensor:
+    ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         # To calculate the loss for entities we need the output of the text
         # sequence transformer (shape: real entity dim x
         # sequence length x units), the output of the dialogue transformer
@@ -1226,7 +1275,7 @@ def _real_batch_loss_entities(
         attribute_mask = tf_batch_data[TEXT][MASK][0]
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
 
-        if self.max_history_tracker_featurizer_used:
+        if self.use_only_last_dialogue_turn:
             # pick last vector if max history featurizer is used
             attribute_mask = tf.expand_dims(
                 self._last_token(attribute_mask, dialogue_lengths), axis=1
@@ -1254,19 +1303,56 @@ def _real_batch_loss_entities(
             [text_transformer_output, dialogue_transformer_output], axis=-1
         )
 
-        mask = tf.squeeze(self._compute_mask(text_sequence_lengths), axis=1)
+        text_mask = tf.squeeze(self._compute_mask(text_sequence_lengths), axis=1)
         # remove additional dims and sentence features
         text_sequence_lengths = tf.reshape(text_sequence_lengths, (-1,)) - 1
 
+        return text_transformed, text_mask, text_sequence_lengths
+
+    # ---TRAINING---
+
+    def _batch_loss_entities(
+        self,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        dialogue_transformer_output: tf.Tensor,
+        text_transformer_output: tf.Tensor,
+        text_sequence_lengths: tf.Tensor,
+    ) -> tf.Tensor:
+
+        return tf.cond(
+            tf.shape(text_transformer_output)[0] > 0,
+            lambda: self._real_batch_loss_entities(
+                tf_batch_data,
+                dialogue_transformer_output,
+                text_transformer_output,
+                text_sequence_lengths,
+            ),
+            lambda: tf.constant(0.0),
+        )
+
+    def _real_batch_loss_entities(
+        self,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        dialogue_transformer_output: tf.Tensor,
+        text_transformer_output: tf.Tensor,
+        text_sequence_lengths: tf.Tensor,
+    ) -> tf.Tensor:
+
+        text_transformed, text_mask, text_sequence_lengths = self._reshape_for_entities(
+            tf_batch_data,
+            dialogue_transformer_output,
+            text_transformer_output,
+            text_sequence_lengths,
+        )
+
         tag_ids = tf_batch_data[ENTITY_TAGS][IDS][0]
-        # add a zero (no entity) for the sentence features to match the shape of
-        # inputs
+        # add a zero (no entity) for the sentence features to match the shape of inputs
         tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
 
         loss, f1, _ = self._calculate_entity_loss(
             text_transformed,
             tag_ids,
-            mask,
+            text_mask,
             text_sequence_lengths,
             ENTITY_ATTRIBUTE_TYPE,
         )
@@ -1276,71 +1362,6 @@ def _real_batch_loss_entities(
 
         return loss
 
-    def _process_batch_data(
-        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
-    ) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]:
-        """Encodes batch data.
-
-        Combines intent and text and action name and action text if both are present.
-
-        Args:
-            tf_batch_data: dictionary mapping every attribute to its features and masks
-
-        Returns:
-             Tensor: encoding of all features in the batch, combined;
-        """
-        # encode each attribute present in tf_batch_data
-        text_transformer_output = None
-        text_sequence_lengths = None
-        batch_encoded = {}
-        for attribute in tf_batch_data.keys():
-            if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES:
-                (
-                    attribute_features,
-                    _text_transformer_output,
-                    _text_sequence_lengths,
-                ) = self._encode_features_per_attribute(tf_batch_data, attribute)
-
-                batch_encoded[attribute] = attribute_features
-                if attribute == TEXT:
-                    text_transformer_output = _text_transformer_output
-                    text_sequence_lengths = _text_sequence_lengths
-
-        # if both action text and action name are present, combine them; otherwise,
-        # return the one which is present
-
-        if (
-            batch_encoded.get(ACTION_TEXT) is not None
-            and batch_encoded.get(ACTION_NAME) is not None
-        ):
-            batch_action = batch_encoded.pop(ACTION_TEXT) + batch_encoded.pop(
-                ACTION_NAME
-            )
-        elif batch_encoded.get(ACTION_TEXT) is not None:
-            batch_action = batch_encoded.pop(ACTION_TEXT)
-        else:
-            batch_action = batch_encoded.pop(ACTION_NAME)
-        # same for user input
-        if (
-            batch_encoded.get(INTENT) is not None
-            and batch_encoded.get(TEXT) is not None
-        ):
-            batch_user = batch_encoded.pop(INTENT) + batch_encoded.pop(TEXT)
-        elif batch_encoded.get(TEXT) is not None:
-            batch_user = batch_encoded.pop(TEXT)
-        else:
-            batch_user = batch_encoded.pop(INTENT)
-
-        batch_features = [batch_user, batch_action]
-        # once we have user input and previous action,
-        # add all other attributes (SLOTS, ACTIVE_LOOP, etc.) to batch_features;
-        for key in batch_encoded.keys():
-            batch_features.append(batch_encoded.get(key))
-
-        batch_features = tf.concat(batch_features, axis=-1)
-
-        return batch_features, text_transformer_output, text_sequence_lengths
-
     @staticmethod
     def _get_labels_embed(
         label_ids: tf.Tensor, all_labels_embed: tf.Tensor
@@ -1414,6 +1435,8 @@ def batch_loss(
 
         return tf.math.add_n(losses)
 
+    # ---PREDICTION---
+
     def prepare_for_predict(self) -> None:
         _, self.all_labels_embed = self._create_all_labels_embed()
 
@@ -1438,7 +1461,11 @@ def batch_predict(
             batch_in, self.predict_data_signature
         )
 
-        dialogue_in, text_transformer_output = self._process_batch_data(tf_batch_data)
+        (
+            dialogue_in,
+            text_transformer_output,
+            text_sequence_lengths,
+        ) = self._process_batch_data(tf_batch_data)
         (
             dialogue_embed,
             dialogue_mask,
@@ -1446,15 +1473,6 @@ def batch_predict(
         ) = self._emebed_dialogue(dialogue_in, tf_batch_data)
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
-        predictions = {}
-
-        if self.config[ENTITY_RECOGNITION]:
-            predictions.update(
-                self._batch_predict_entities(
-                    tf_batch_data, dialogue_transformer_output, text_transformer_output
-                )
-            )
-
         sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
             dialogue_embed[:, :, tf.newaxis, :],
             self.all_labels_embed[tf.newaxis, tf.newaxis, :, :],
@@ -1464,8 +1482,22 @@ def batch_predict(
         scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim(
             sim_all, self.config[SIMILARITY_TYPE]
         )
+        predictions = {"action_scores": scores, "similarities": sim_all}
 
-        predictions.update({"action_scores": scores, "similarities": sim_all})
+        if (
+            self.config[ENTITY_RECOGNITION]
+            and text_transformer_output is not None
+            and text_sequence_lengths is not None
+        ):
+            pred_ids, confidences = self._batch_predict_entities(
+                tf_batch_data,
+                dialogue_transformer_output,
+                text_transformer_output,
+                text_sequence_lengths,
+            )
+            name = ENTITY_ATTRIBUTE_TYPE
+            predictions[f"e_{name}_ids"] = pred_ids
+            predictions[f"e_{name}_scores"] = confidences
 
         return predictions
 
@@ -1474,112 +1506,43 @@ def _batch_predict_entities(
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
         dialogue_transformer_output: tf.Tensor,
         text_transformer_output: tf.Tensor,
-    ) -> Dict[Text, tf.Tensor]:
-        predictions: Dict[Text, tf.Tensor] = {}
-
-        # TODO Update according to batch loss entities
-
-        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
-
-        # convert the output of the text sequence transformer to shape
-        # batch-size x dialogue length x sequence length x 1
-        text_seq_transformer_output = self._convert_to_original_shape(
-            text_transformer_output,
-            tf_batch_data[TEXT][MASK][0],
-            dialogue_lengths,
-            True,
-        )
-        # convert the output of the text sequence transformer to shape
-        # combined batch dialogue dimension x sequence length x units
-        text_seq_transformer_output = self._combine_batch_and_dialogue_dimension(
-            text_seq_transformer_output, tf_batch_data
-        )
-
-        # convert the output of the dialogue transformer to shape
-        # combined batch dialogue dimension x sequence length x units
-        dialogue_transformer_output = self._combine_batch_and_dialogue_dimension(
-            dialogue_transformer_output, tf_batch_data
+        text_sequence_lengths: tf.Tensor,
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
+        return tf.cond(
+            tf.shape(text_transformer_output)[0] > 0,
+            lambda: self._real_batch_predict_entities(
+                tf_batch_data,
+                dialogue_transformer_output,
+                text_transformer_output,
+                text_sequence_lengths,
+            ),
+            lambda: (
+                # the output is of shape (batch_size, max_seq_len)
+                tf.zeros(tf.shape(text_transformer_output)[:2], dtype=tf.int32),
+                tf.zeros(tf.shape(text_transformer_output)[:2], dtype=tf.float32),
+            ),
         )
 
-        # repeat the dialogue transformer output sequence-length-times to get the
-        # same shape as the text sequence transformer output
-        sequence_dimension = tf.shape(text_seq_transformer_output)[1]
-        dialogue_transformer_output = tf.repeat(
-            tf.expand_dims(dialogue_transformer_output, axis=1),
-            sequence_dimension,
-            axis=1,
-        )
-        # add the output of the dialogue transformer to the output of the text
-        # sequence transformer (adding context)
-        # resulting shape
-        # (combined batch and dialogue dimension x sequence length x units)
-        text_transformed = tf.add(
-            text_seq_transformer_output, dialogue_transformer_output
-        )
+    def _real_batch_predict_entities(
+        self,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        dialogue_transformer_output: tf.Tensor,
+        text_transformer_output: tf.Tensor,
+        text_sequence_lengths: tf.Tensor,
+    ) -> Tuple[tf.Tensor, tf.Tensor]:
 
-        # we need the sequence length and the mask for the CRF layer
-        _sequence_lengths = tf_batch_data[TEXT][SEQUENCE_LENGTH][0]
-        # extract only nonzero lengths and cast to int
-        _sequence_lengths = tf.cast(
-            tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32
-        )
-        # boolean mask returns flat tensor
-        _sequence_lengths = tf.expand_dims(_sequence_lengths, axis=-1)
-        # + 1 for sentence features
-        sequence_lengths = _sequence_lengths + 1
-
-        # convert mask and sequence length to correct shape
-        sequence_lengths = self._convert_to_original_shape(
-            tf.expand_dims(sequence_lengths, axis=-1),
-            tf_batch_data[TEXT][MASK][0],
-            dialogue_lengths,
-            True,
-        )
-        sequence_lengths = self._combine_batch_and_dialogue_dimension(
-            sequence_lengths, tf_batch_data
+        text_transformed, _, text_sequence_lengths = self._reshape_for_entities(
+            tf_batch_data,
+            dialogue_transformer_output,
+            text_transformer_output,
+            text_sequence_lengths,
         )
 
-        if self.max_history_tracker_featurizer_used:
-            batch_dim = tf.size(dialogue_lengths)
-
-            # the first dimension of text transformed is the combined batch and dialogue
-            # dimension, which corresponds to the sum of all dialogue lengths
-            # if the max history tracker featurizer is used we just want the last
-            # dialogues of every batch example
-
-            # get the indices of all last dialogues
-            last_dialogue_indices = tf.cumsum(dialogue_lengths) - 1
-
-            # build up indices to get the last dialogues from text_transformed
-            dialogue_indices = tf.repeat(
-                tf.expand_dims(last_dialogue_indices, axis=1),
-                sequence_dimension,
-                axis=1,
-            )
-            sequence_indices = tf.repeat(
-                tf.expand_dims(tf.range(sequence_dimension), axis=0), batch_dim, axis=0
-            )
-            indices = tf.stack([dialogue_indices, sequence_indices], axis=2)
-
-            # get all last dialogues from text_transformed using the above indices
-            # resulting shape (batch size x sequence length x units)
-            text_transformed = tf.gather_nd(text_transformed, indices)
-            # do the same for the other tensors
-            sequence_lengths = tf.gather(
-                tf.squeeze(sequence_lengths), last_dialogue_indices
-            )
-
         name = ENTITY_ATTRIBUTE_TYPE
 
         _logits = self._tf_layers[f"embed.{name}.logits"](text_transformed)
-        pred_ids, confidences = self._tf_layers[f"crf.{name}"](
-            _logits, sequence_lengths - 1
-        )
-
-        predictions[f"e_{name}_ids"] = pred_ids
-        predictions[f"e_{name}_scores"] = confidences
 
-        return predictions
+        return self._tf_layers[f"crf.{name}"](_logits, text_sequence_lengths)
 
 
 # pytype: enable=key-error

From c287d8c14728c787ca054fa87ca17992877560c7 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 14:01:24 +0100
Subject: [PATCH 46/62] fix randomness and shapes

---
 examples/e2ebot/config.yml       |  4 ++--
 examples/e2ebot/data/stories.yml |  2 +-
 examples/e2ebot/domain.yml       |  3 +++
 rasa/core/policies/ted_policy.py | 12 +++++++++++-
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/examples/e2ebot/config.yml b/examples/e2ebot/config.yml
index f38558adb0ad..2d4a08aa4ae6 100644
--- a/examples/e2ebot/config.yml
+++ b/examples/e2ebot/config.yml
@@ -9,8 +9,8 @@ pipeline:
     analyzer: char_wb
     min_ngram: 1
     max_ngram: 4
-  - name: DIETClassifier
-    epochs: 200
+#  - name: DIETClassifier
+#    epochs: 200
 policies:
 - name: TEDPolicy
   epochs: 200
diff --git a/examples/e2ebot/data/stories.yml b/examples/e2ebot/data/stories.yml
index bf884abf1856..cab5ea7113ca 100644
--- a/examples/e2ebot/data/stories.yml
+++ b/examples/e2ebot/data/stories.yml
@@ -10,7 +10,7 @@ stories:
 
 - story: sad path (text to text)
   steps:
-  - user: "Hello"
+  - user: "[Hello](bla)"
   - bot: "Welcome to moodbot. How are you feeling today?"
   - user: "Horrible"
   - bot: "Oh no! Here is a kitten photo. Did it help?"
diff --git a/examples/e2ebot/domain.yml b/examples/e2ebot/domain.yml
index 17b3faba2a75..d884f4cc40c3 100644
--- a/examples/e2ebot/domain.yml
+++ b/examples/e2ebot/domain.yml
@@ -9,3 +9,6 @@ actions:
 intents:
  - greet
  - mood_great
+
+entities:
+ - bla
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index e6f0e44d0455..059c5bce6711 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -438,6 +438,9 @@ def _create_model_data(
             FeatureArray(dialogue_lengths, number_of_dimensions=1)
         ]
 
+        # make sure all keys are in the same order during training and prediction
+        model_data.sort()
+
         return model_data
 
     def train(
@@ -1304,6 +1307,12 @@ def _reshape_for_entities(
         )
 
         text_mask = tf.squeeze(self._compute_mask(text_sequence_lengths), axis=1)
+        # add zeros to match the shape of text_transformed, because
+        # max sequence length might differ, since it is calculated dynamically
+        # based on a subset of sequence lengths
+        sequence_diff = tf.shape(text_transformed)[1] - tf.shape(text_mask)[1]
+        text_mask = tf.pad(text_mask, [[0, 0], [0, sequence_diff], [0, 0]])
+
         # remove additional dims and sentence features
         text_sequence_lengths = tf.reshape(text_sequence_lengths, (-1,)) - 1
 
@@ -1347,7 +1356,8 @@ def _real_batch_loss_entities(
 
         tag_ids = tf_batch_data[ENTITY_TAGS][IDS][0]
         # add a zero (no entity) for the sentence features to match the shape of inputs
-        tag_ids = tf.pad(tag_ids, [[0, 0], [0, 1], [0, 0]])
+        sequence_diff = tf.shape(text_transformed)[1] - tf.shape(tag_ids)[1]
+        tag_ids = tf.pad(tag_ids, [[0, 0], [0, sequence_diff], [0, 0]])
 
         loss, f1, _ = self._calculate_entity_loss(
             text_transformed,

From f87c134f7aed137944dbf3be49cb9da30bbeceee Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 16:43:18 +0100
Subject: [PATCH 47/62] fix ffnn encoding layer name

---
 examples/e2ebot/config.yml       | 4 ++--
 rasa/core/policies/ted_policy.py | 3 ++-
 rasa/utils/tensorflow/layers.py  | 2 +-
 rasa/utils/tensorflow/models.py  | 5 ++++-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/examples/e2ebot/config.yml b/examples/e2ebot/config.yml
index 2d4a08aa4ae6..f38558adb0ad 100644
--- a/examples/e2ebot/config.yml
+++ b/examples/e2ebot/config.yml
@@ -9,8 +9,8 @@ pipeline:
     analyzer: char_wb
     min_ngram: 1
     max_ngram: 4
-#  - name: DIETClassifier
-#    epochs: 200
+  - name: DIETClassifier
+    epochs: 200
 policies:
 - name: TEDPolicy
   epochs: 200
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 059c5bce6711..529d5c02d7be 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -862,6 +862,7 @@ def _prepare_encoding_layers(self, name: Text) -> None:
             f"{name}",
             [self.config[ENCODING_DIMENSION]],
             self.config[DROP_RATE_DIALOGUE],
+            prefix="encoding_layer",
         )
 
     # ---GRAPH BUILDING HELPERS---
@@ -1098,7 +1099,7 @@ def _encode_real_features_per_attribute(
             )
 
         if attribute in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE:
-            attribute_features = self._tf_layers[f"ffnn.{attribute}"](
+            attribute_features = self._tf_layers[f"encoding_layer.{attribute}"](
                 attribute_features
             )
 
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 1d64b1b26cb3..0b0d00e4131a 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -136,7 +136,7 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
         if len(inputs.shape) == 3:
             # reshape back
             outputs = tf.reshape(
-                outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1)
+                outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], self.units)
             )
 
         if self.use_bias:
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 6b1242d90a0a..50e4903814c0 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -803,7 +803,10 @@ def _prepare_sparse_dense_layers(
             if not dense:
                 # create dense labels for the input to use in negative sampling
                 self._tf_layers[f"sparse_to_dense_ids.{name}"] = layers.DenseForSparse(
-                    units=2, trainable=False, name=f"sparse_to_dense_ids.{name}"
+                    units=2,
+                    use_bias=False,
+                    trainable=False,
+                    name=f"sparse_to_dense_ids.{name}",
                 )
 
     def _prepare_input_layers(self, name: Text) -> None:

From bfc2571169d786c2c87d2697041bf2e3a2d24e00 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 16:48:08 +0100
Subject: [PATCH 48/62] add todo

---
 rasa/core/policies/ted_policy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 529d5c02d7be..6a695a5bdc2c 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -550,6 +550,7 @@ def predict_action_probabilities(
         if (
             len(tracker_state_features) == 2
             and np.max(confidences[1]) > self.config[E2E_CONFIDENCE_THRESHOLD]
+            # TODO maybe comparing confidences would be better
             and np.max(similarities[1]) > np.max(similarities[0])
         ):
             batch_index = 1

From 53b21599b7f436c42ba300afc7975675b30dba22 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Thu, 12 Nov 2020 16:49:57 +0100
Subject: [PATCH 49/62] Update rasa/core/policies/ted_policy.py

Co-authored-by: Tanja <tabergma@gmail.com>
---
 rasa/core/policies/ted_policy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 6a695a5bdc2c..59c1b044aea9 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -1027,7 +1027,7 @@ def _encode_real_features_per_attribute(
             mask_sequence_text = tf.squeeze(
                 self._compute_mask(sequence_lengths), axis=1
             )
-            sequence_lengths = sequence_lengths + 1
+            sequence_lengths += 1
             mask_text = tf.squeeze(self._compute_mask(sequence_lengths), axis=1)
 
             attribute_features, _, _, _ = self._create_sequence(

From 05639b4a94f27707586d3ad43e5ccf928a2db421 Mon Sep 17 00:00:00 2001
From: Vladimir Vlasov <vladimir@rasa.com>
Date: Thu, 12 Nov 2020 16:50:37 +0100
Subject: [PATCH 50/62] Update rasa/core/featurizers/single_state_featurizer.py

Co-authored-by: Tanja <tabergma@gmail.com>
---
 rasa/core/featurizers/single_state_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index e3ed3712ad76..792c618cfe87 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -43,7 +43,7 @@ class SingleStateFeaturizer:
     def __init__(self) -> None:
         self._default_feature_states = {}
         self.action_texts = []
-        self.tag_id_mapping = {}
+        self.entity_tag_id_mapping = {}
 
     def get_entity_tag_ids(self) -> Dict[Text, int]:
         """Returns the tag to index mapping for entities.

From 4e873f98e51cf9347cb719ca749d616cb5917bb9 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 16:52:32 +0100
Subject: [PATCH 51/62] rename to entity_tag_id_mapping

---
 rasa/core/featurizers/single_state_featurizer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index 792c618cfe87..f77f20291154 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -79,7 +79,7 @@ def convert_to_dict(feature_states: List[Text]) -> Dict[Text, int]:
         self._default_feature_states[SLOTS] = convert_to_dict(domain.slot_states)
         self._default_feature_states[ACTIVE_LOOP] = convert_to_dict(domain.form_names)
         self.action_texts = domain.action_texts
-        self.tag_id_mapping = self.get_entity_tag_ids()
+        self.entity_tag_id_mapping = self.get_entity_tag_ids()
 
     def _state_features_for_attribute(
         self, sub_state: SubState, attribute: Text
@@ -252,7 +252,7 @@ def encode_entity(
         # TODO
         #  Should we support BILOU tagging?
 
-        if TEXT not in entity_data or len(self.tag_id_mapping) < 2:
+        if TEXT not in entity_data or len(self.entity_tag_id_mapping) < 2:
             # we cannot build a classifier if there are less than 2 class
             return {}
 
@@ -265,7 +265,7 @@ def encode_entity(
                 token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE
             )
             # TODO handle if tag is not in mapping
-            _tags.append(self.tag_id_mapping[_tag])
+            _tags.append(self.entity_tag_id_mapping[_tag])
 
         # transpose to have seq_len x 1
         return {

From 563085b546f92d3938206f25acbd614aa5387342 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 17:06:51 +0100
Subject: [PATCH 52/62] add comment to last dial mask

---
 rasa/core/policies/ted_policy.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 59c1b044aea9..29dd1d7e0dde 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -1061,6 +1061,30 @@ def _encode_real_features_per_attribute(
                             ),
                         )
                     ).values
+                    # since use_only_last_dialogue_turn is True,
+                    # we need to find the locations of last dialogue turns in
+                    # (combined batch dimension and dialogue length,) dimension
+                    # so that we can use `_sequence_lengths` as a boolean to pick
+                    # which ones are "real" textual input in these last dialogue turns
+
+                    # in order to do that we can use given `dialogue_lengths`
+                    # for example:
+                    # if we have `dialogue_lengths = [2, 1, 3]`, than
+                    # `dialogue_indices = [0, 1, 0, 0, 1, 2]` here we can spot that `0`
+                    # always indicates the first dialogue turn,
+                    # which means that previous dialogue turn is the last one,
+                    # combining this with the fact that the last element in
+                    # `dialogue_indices` is always the last dialogue turn, we can add
+                    # a `0` to the end, getting
+                    # `_dialogue_indices = [0, 1, 0, 0, 1, 2, 0]`,
+                    # then remove the first element
+                    # `_last_dialogue_turn_inverse_indicator = [1, 0, 0, 1, 2, 0]`
+                    # and we see that `0` points to last dialogue turn,
+                    # the rest is to convert all positive numbers to `True` and take
+                    # the inverse mask to get
+                    # `last_dialogue_mask = [0, 1, 1, 0, 0, 1]
+                    # which precisely corresponds to the fact that first dialogue is of
+                    # length 2, the second 1 and the third 3
                     last_dialogue_mask = tf.math.logical_not(
                         tf.cast(
                             tf.concat(

From 4a97b0b95183058e76500efe7af2b3087d69577a Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 17:17:31 +0100
Subject: [PATCH 53/62] add comments to tf.cond

---
 rasa/core/policies/ted_policy.py | 43 ++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 29dd1d7e0dde..c5f9b2f0c6ff 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -1061,30 +1061,30 @@ def _encode_real_features_per_attribute(
                             ),
                         )
                     ).values
-                    # since use_only_last_dialogue_turn is True,
+                    # Since use_only_last_dialogue_turn is True,
                     # we need to find the locations of last dialogue turns in
-                    # (combined batch dimension and dialogue length,) dimension
-                    # so that we can use `_sequence_lengths` as a boolean to pick
-                    # which ones are "real" textual input in these last dialogue turns
+                    # (combined batch dimension and dialogue length,) dimension,
+                    # so that we can use `_sequence_lengths` as a boolean mask to pick
+                    # which ones are "real" textual input in these last dialogue turns.
 
-                    # in order to do that we can use given `dialogue_lengths`
-                    # for example:
-                    # if we have `dialogue_lengths = [2, 1, 3]`, than
+                    # In order to do that we can use given `dialogue_lengths`.
+                    # For example:
+                    # If we have `dialogue_lengths = [2, 1, 3]`, then
                     # `dialogue_indices = [0, 1, 0, 0, 1, 2]` here we can spot that `0`
                     # always indicates the first dialogue turn,
-                    # which means that previous dialogue turn is the last one,
-                    # combining this with the fact that the last element in
+                    # which means that previous dialogue turn is the last dialogue turn.
+                    # Combining this with the fact that the last element in
                     # `dialogue_indices` is always the last dialogue turn, we can add
                     # a `0` to the end, getting
-                    # `_dialogue_indices = [0, 1, 0, 0, 1, 2, 0]`,
-                    # then remove the first element
+                    # `_dialogue_indices = [0, 1, 0, 0, 1, 2, 0]`.
+                    # Then removing the first element
                     # `_last_dialogue_turn_inverse_indicator = [1, 0, 0, 1, 2, 0]`
-                    # and we see that `0` points to last dialogue turn,
-                    # the rest is to convert all positive numbers to `True` and take
+                    # we see that `0` points to the last dialogue turn.
+                    # We convert all positive numbers to `True` and take
                     # the inverse mask to get
-                    # `last_dialogue_mask = [0, 1, 1, 0, 0, 1]
+                    # `last_dialogue_mask = [0, 1, 1, 0, 0, 1]`,
                     # which precisely corresponds to the fact that first dialogue is of
-                    # length 2, the second 1 and the third 3
+                    # length 2, the second 1 and the third 3.
                     last_dialogue_mask = tf.math.logical_not(
                         tf.cast(
                             tf.concat(
@@ -1353,7 +1353,12 @@ def _batch_loss_entities(
         text_transformer_output: tf.Tensor,
         text_sequence_lengths: tf.Tensor,
     ) -> tf.Tensor:
-
+        # It could happen that some batches don't contain "real" features for `text`,
+        # e.g. when a large number of stories are intent only.
+        # Therefore actual `text_transformer_output` will be empty.
+        # We cannot create a loss with empty tensors.
+        # Since we need actual numbers to create a full loss, we output
+        # zero in this case.
         return tf.cond(
             tf.shape(text_transformer_output)[0] > 0,
             lambda: self._real_batch_loss_entities(
@@ -1544,6 +1549,12 @@ def _batch_predict_entities(
         text_transformer_output: tf.Tensor,
         text_sequence_lengths: tf.Tensor,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
+        # It could happen that the current prediction turn doesn't contain
+        # "real" features for `text`.
+        # Therefore actual `text_transformer_output` will be empty.
+        # We cannot predict entities with empty tensors.
+        # Since we need to output some tensors of the same shape, we output
+        # zero tensors.
         return tf.cond(
             tf.shape(text_transformer_output)[0] > 0,
             lambda: self._real_batch_predict_entities(

From d2db7153594789dc6b9f15a1b1bd50aac6af041f Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 17:22:58 +0100
Subject: [PATCH 54/62] add docstrings

---
 .../featurizers/single_state_featurizer.py    | 11 ++++++++++
 rasa/core/featurizers/tracker_featurizers.py  | 21 ++++++++++++-------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index f77f20291154..aa93df4c35c5 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -243,6 +243,17 @@ def encode_state(
     def encode_entity(
         self, entity_data: Dict[Text, Any], interpreter: NaturalLanguageInterpreter
     ) -> Dict[Text, List["Features"]]:
+        """Encode the given entity data with the help of the given interpreter.
+
+        Produce numeric entity tags for tokens.
+
+        Args:
+            entity_data: The dict containing the text and entity labels and locations
+            interpreter: The interpreter used to encode the state
+
+        Returns:
+            A dictionary of entity type to list of features.
+        """
         from rasa.nlu.test import determine_token_labels
 
         # TODO
diff --git a/rasa/core/featurizers/tracker_featurizers.py b/rasa/core/featurizers/tracker_featurizers.py
index 4973b1da98a7..08355883d716 100644
--- a/rasa/core/featurizers/tracker_featurizers.py
+++ b/rasa/core/featurizers/tracker_featurizers.py
@@ -115,6 +115,15 @@ def _entity_data(event: UserUttered) -> Dict[Text, Any]:
     def training_states_actions_and_entities(
         self, trackers: List[DialogueStateTracker], domain: Domain
     ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
+        """Transforms list of trackers to lists of states, actions and entity data.
+
+        Args:
+            trackers: The trackers to transform
+            domain: The domain
+
+        Returns:
+            A tuple of list of states, list of actions and list of entity data.
+        """
         raise NotImplementedError(
             "Featurizer must have the capacity to encode trackers to feature vectors"
         )
@@ -293,16 +302,14 @@ class FullDialogueTrackerFeaturizer(TrackerFeaturizer):
     def training_states_actions_and_entities(
         self, trackers: List[DialogueStateTracker], domain: Domain
     ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
-        """Transforms list of trackers to lists of states and actions.
-
-        Training data is padded up to the length of the longest dialogue with -1.
+        """Transforms list of trackers to lists of states, actions and entity data.
 
         Args:
             trackers: The trackers to transform
             domain: The domain
 
         Returns:
-            A tuple of list of states and list of actions.
+            A tuple of list of states, list of actions and list of entity data.
         """
 
         trackers_as_states = []
@@ -438,16 +445,14 @@ def _hash_example(
     def training_states_actions_and_entities(
         self, trackers: List[DialogueStateTracker], domain: Domain
     ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
-        """Transforms list of trackers to lists of states and actions.
-
-        Training data is padded up to the length of the longest dialogue with -1.
+        """Transforms list of trackers to lists of states, actions and entity data.
 
         Args:
             trackers: The trackers to transform
             domain: The domain
 
         Returns:
-            A tuple of list of states and list of actions.
+            A tuple of list of states, list of actions and list of entity data.
         """
 
         trackers_as_states = []

From b3b28d74a94b58a5c73631e4567e5470af423b51 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Thu, 12 Nov 2020 19:21:16 +0100
Subject: [PATCH 55/62] refactor number of dims check

---
 rasa/core/policies/memoization.py   |  1 -
 rasa/utils/tensorflow/model_data.py | 15 ++++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py
index 8510ab9c6852..e511f35563ac 100644
--- a/rasa/core/policies/memoization.py
+++ b/rasa/core/policies/memoization.py
@@ -1,4 +1,3 @@
-import copy
 import zlib
 
 import base64
diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py
index 3c8012f024ba..4f5dcab1c010 100644
--- a/rasa/utils/tensorflow/model_data.py
+++ b/rasa/utils/tensorflow/model_data.py
@@ -124,7 +124,7 @@ def __setstate__(self, state, **kwargs):
     def _validate_number_of_dimensions(
         number_of_dimensions: int, input_array: np.ndarray
     ) -> None:
-        """Validates if the given number of dimensions maps the with the dimensions of the input array.
+        """Validates if the input array has the given number of dimensions.
 
         Args:
             number_of_dimensions: number of dimensions
@@ -142,7 +142,8 @@ def _validate_number_of_dimensions(
                 break
             if isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0:
                 # sequence dimension is 0, we are dealing with "fake" features
-                return
+                dim = i
+                break
 
         # If the resulting sub_array is sparse, the remaining number of dimensions
         # should be at least 2
@@ -150,7 +151,15 @@ def _validate_number_of_dimensions(
             if dim > 2:
                 raise ValueError(
                     f"Given number of dimensions '{number_of_dimensions}' does not "
-                    f"match dimensiona of given input array: {input_array}."
+                    f"match dimensions of given input array: {input_array}."
+                )
+        elif isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0:
+            # sequence dimension is 0, we are dealing with "fake" features,
+            # but they should be of dim 2
+            if dim > 2:
+                raise ValueError(
+                    f"Given number of dimensions '{number_of_dimensions}' does not "
+                    f"match dimensions of given input array: {input_array}."
                 )
         # If the resulting sub_array is dense, the sub_array should be a single number
         elif not np.issubdtype(type(_sub_array), np.integer) and not isinstance(

From 779db7f1236913ae24db6c803d9d2ced52b610d3 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 13 Nov 2020 09:59:15 +0100
Subject: [PATCH 56/62] rename zero features to fake features

---
 rasa/core/policies/ted_policy.py              | 24 ++++++------
 rasa/utils/tensorflow/model_data_utils.py     | 38 +++++++++----------
 .../utils/tensorflow/test_model_data_utils.py | 34 ++++++++---------
 3 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index c5f9b2f0c6ff..02c9f06cebda 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -276,7 +276,7 @@ def __init__(
         priority: int = DEFAULT_POLICY_PRIORITY,
         max_history: Optional[int] = None,
         model: Optional[RasaModel] = None,
-        zero_state_features: Optional[Dict[Text, List["Features"]]] = None,
+        fake_features: Optional[Dict[Text, List["Features"]]] = None,
         entity_tag_specs: Optional[List[EntityTagSpec]] = None,
         **kwargs: Any,
     ) -> None:
@@ -297,7 +297,7 @@ def __init__(
 
         self._entity_tag_specs = entity_tag_specs
 
-        self.zero_state_features = zero_state_features or defaultdict(list)
+        self.fake_features = fake_features or defaultdict(list)
 
         self._label_data: Optional[RasaModelData] = None
         self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None
@@ -398,7 +398,7 @@ def _create_model_data(
                 [FeatureArray(label_ids, number_of_dimensions=3)],
             )
 
-            attribute_data, self.zero_state_features = convert_to_data_format(
+            attribute_data, self.fake_features = convert_to_data_format(
                 tracker_state_features, featurizers=self.config[FEATURIZERS]
             )
             if self.config[ENTITY_RECOGNITION]:
@@ -418,7 +418,7 @@ def _create_model_data(
             # method is called during prediction
             attribute_data, _ = convert_to_data_format(
                 tracker_state_features,
-                self.zero_state_features,
+                self.fake_features,
                 featurizers=self.config[FEATURIZERS],
             )
 
@@ -519,7 +519,7 @@ def predict_action_probabilities(
         # create model data from tracker
         tracker_state_features = []
         if (
-            INTENT in self.zero_state_features
+            INTENT in self.fake_features
             or not tracker.latest_action_name == ACTION_LISTEN_NAME
         ):
             # the first example in a batch uses intent
@@ -528,7 +528,7 @@ def predict_action_probabilities(
                 [tracker], domain, interpreter, use_text_for_last_user_input=False
             )
         if (
-            TEXT in self.zero_state_features
+            TEXT in self.fake_features
             and tracker.latest_action_name == ACTION_LISTEN_NAME
         ):
             # the second - text, but only after user utterance
@@ -561,7 +561,7 @@ def predict_action_probabilities(
         else:  # only one tracker present
             batch_index = 0
             if tracker.latest_action_name == ACTION_LISTEN_NAME:
-                if TEXT in self.zero_state_features:
+                if TEXT in self.fake_features:
                     is_e2e_prediction = True
                 else:
                     is_e2e_prediction = False
@@ -609,8 +609,8 @@ def persist(self, path: Union[Text, Path]) -> None:
             model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl", self.data_example
         )
         io_utils.pickle_dump(
-            model_path / f"{SAVE_MODEL_FILE_NAME}.zero_state_features.pkl",
-            self.zero_state_features,
+            model_path / f"{SAVE_MODEL_FILE_NAME}.fake_features.pkl",
+            self.fake_features,
         )
         io_utils.pickle_dump(
             model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl",
@@ -653,8 +653,8 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
         label_data = io_utils.pickle_load(
             model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl"
         )
-        zero_state_features = io_utils.pickle_load(
-            model_path / f"{SAVE_MODEL_FILE_NAME}.zero_state_features.pkl"
+        fake_features = io_utils.pickle_load(
+            model_path / f"{SAVE_MODEL_FILE_NAME}.fake_features.pkl"
         )
         label_data = RasaModelData(data=label_data)
         meta = io_utils.pickle_load(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl")
@@ -713,7 +713,7 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
             featurizer=featurizer,
             priority=priority,
             model=model,
-            zero_state_features=zero_state_features,
+            fake_features=fake_features,
             entity_tag_specs=entity_tag_specs,
             **meta,
         )
diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py
index 8788a366f06b..1f556d22914d 100644
--- a/rasa/utils/tensorflow/model_data_utils.py
+++ b/rasa/utils/tensorflow/model_data_utils.py
@@ -166,7 +166,7 @@ def _filter_features(features: Optional[List["Features"]], featurizers: List[Tex
     return [f for f in features if f.origin in featurizers]
 
 
-def _create_zero_features(
+def _create_fake_features(
     all_features: List[List[List["Features"]]],
 ) -> List["Features"]:
     """Computes default feature values.
@@ -191,8 +191,8 @@ def _create_zero_features(
         )
     )
 
-    # create zero_features for Nones
-    zero_features = []
+    # create fake_features for Nones
+    fake_features = []
     for _features in example_features:
         new_features = copy.deepcopy(_features)
         if _features.is_dense():
@@ -203,16 +203,16 @@ def _create_zero_features(
             new_features.features = scipy.sparse.coo_matrix(
                 (0, _features.features.shape[-1]), _features.features.dtype
             )
-        zero_features.append(new_features)
+        fake_features.append(new_features)
 
-    return zero_features
+    return fake_features
 
 
 def convert_to_data_format(
     features: Union[
         List[List[Dict[Text, List["Features"]]]], List[Dict[Text, List["Features"]]]
     ],
-    zero_features: Optional[Dict[Text, List["Features"]]] = None,
+    fake_features: Optional[Dict[Text, List["Features"]]] = None,
     consider_dialogue_dimension: bool = True,
     featurizers: Optional[List[Text]] = None,
 ) -> Tuple[Data, Optional[Dict[Text, List["Features"]]]]:
@@ -228,7 +228,7 @@ def convert_to_data_format(
     Args:
         features: a dictionary of attributes to a list of features for all
             examples in the training data
-        zero_features: Contains default feature values for attributes
+        fake_features: Contains default feature values for attributes
         consider_dialogue_dimension: If set to false the dialogue dimension will be
             removed from the resulting sequence features.
         featurizers: the featurizers to consider
@@ -237,9 +237,9 @@ def convert_to_data_format(
         Input in "Data" format and zero features
     """
     training = False
-    if not zero_features:
+    if not fake_features:
         training = True
-        zero_features = defaultdict(list)
+        fake_features = defaultdict(list)
 
     # unify format of incoming features
     if isinstance(features[0], Dict):
@@ -254,7 +254,7 @@ def convert_to_data_format(
     if training:
         attributes = list(attribute_to_features.keys())
     else:
-        attributes = list(zero_features.keys())
+        attributes = list(fake_features.keys())
 
     # In case an attribute is not present during prediction, replace it with
     # None values that will then be replaced by zero features
@@ -271,14 +271,14 @@ def convert_to_data_format(
             empty_features,
             attribute_to_features,
             training,
-            zero_features,
+            fake_features,
             consider_dialogue_dimension,
         )
 
     # ensure that all attributes are in the same order
     attribute_data = OrderedDict(sorted(attribute_data.items()))
 
-    return attribute_data, zero_features
+    return attribute_data, fake_features
 
 
 def _features_for_attribute(
@@ -286,7 +286,7 @@ def _features_for_attribute(
     empty_features: List[Any],
     attribute_to_features: Dict[Text, List[List[List["Features"]]]],
     training: bool,
-    zero_features: Dict[Text, List["Features"]],
+    fake_features: Dict[Text, List["Features"]],
     consider_dialogue_dimension: bool,
 ) -> Dict[Text, List[FeatureArray]]:
     """Create the features for the given attribute from the all examples features.
@@ -296,7 +296,7 @@ def _features_for_attribute(
         empty_features: empty features
         attribute_to_features: features for every example
         training: boolean indicating whether we are currently in training or not
-        zero_features: zero features
+        fake_features: fake features
         consider_dialogue_dimension: If set to false the dialogue dimension will be
           removed from the resulting sequence features.
 
@@ -312,10 +312,10 @@ def _features_for_attribute(
     # in case some features for a specific attribute are
     # missing, replace them with a feature vector of zeros
     if training:
-        zero_features[attribute] = _create_zero_features(features)
+        fake_features[attribute] = _create_fake_features(features)
 
     (attribute_masks, _dense_features, _sparse_features) = _extract_features(
-        features, zero_features[attribute], attribute
+        features, fake_features[attribute], attribute
     )
 
     sparse_features = {}
@@ -363,7 +363,7 @@ def _features_for_attribute(
 
 def _extract_features(
     features: List[List[List["Features"]]],
-    zero_features: List["Features"],
+    fake_features: List["Features"],
     attribute: Text,
 ) -> Tuple[
     List[np.ndarray],
@@ -375,7 +375,7 @@ def _extract_features(
 
     Args:
         features: all features
-        zero_features: list of zero features
+        fake_features: list of fake features
 
     Returns:
         - a list of attribute masks
@@ -399,7 +399,7 @@ def _extract_features(
             if list_of_features is None:
                 # use zero features and set mask to zero
                 attribute_mask[i] = 0
-                list_of_features = zero_features
+                list_of_features = fake_features
 
             for features in list_of_features:
                 # in case of ENTITIES, if the attribute type matches either 'entity',
diff --git a/tests/utils/tensorflow/test_model_data_utils.py b/tests/utils/tensorflow/test_model_data_utils.py
index f495222958df..2dab29353f3a 100644
--- a/tests/utils/tensorflow/test_model_data_utils.py
+++ b/tests/utils/tensorflow/test_model_data_utils.py
@@ -30,7 +30,7 @@
 shape = 100
 
 
-def test_create_zero_features():
+def test_create_fake_features():
     # DENSE FEATURES
     dense_feature_sentence_features = Features(
         features=np.random.rand(shape),
@@ -40,10 +40,10 @@ def test_create_zero_features():
     )
     features = [[None, None, [dense_feature_sentence_features]]]
 
-    zero_features = model_data_utils._create_zero_features(features)
-    assert len(zero_features) == 1
-    assert zero_features[0].is_dense()
-    assert zero_features[0].features.shape == (0, shape)
+    fake_features = model_data_utils._create_fake_features(features)
+    assert len(fake_features) == 1
+    assert fake_features[0].is_dense()
+    assert fake_features[0].features.shape == (0, shape)
 
     # SPARSE FEATURES
     sparse_feature_sentence_features = Features(
@@ -53,11 +53,11 @@ def test_create_zero_features():
         origin=[],
     )
     features = [[None, None, [sparse_feature_sentence_features]]]
-    zero_features = model_data_utils._create_zero_features(features)
-    assert len(zero_features) == 1
-    assert zero_features[0].is_sparse()
-    assert zero_features[0].features.shape == (0, shape)
-    assert zero_features[0].features.nnz == 0
+    fake_features = model_data_utils._create_fake_features(features)
+    assert len(fake_features) == 1
+    assert fake_features[0].is_sparse()
+    assert fake_features[0].features.shape == (0, shape)
+    assert fake_features[0].features.nnz == 0
 
 
 def test_surface_attributes():
@@ -142,18 +142,18 @@ def test_surface_attributes():
 
 
 def test_extract_features():
-    zero_features = np.zeros(shape)
-    zero_features_as_features = Features(
-        features=zero_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
+    fake_features = np.zeros(shape)
+    fake_features_as_features = Features(
+        features=fake_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
     )
     # create zero features
-    zero_features_list = [zero_features_as_features]
+    fake_features_list = [fake_features_as_features]
 
     # create tracker state features by setting a random index in the array to 1
     random_inds = np.random.randint(shape, size=6)
     list_of_features = []
     for idx in random_inds:
-        current_features = copy.deepcopy(zero_features_as_features)
+        current_features = copy.deepcopy(fake_features_as_features)
         current_features.features[idx] = 1
         list_of_features.append([current_features])
 
@@ -168,11 +168,11 @@ def test_extract_features():
         attribute_masks,
         dense_features,
         sparse_features,
-    ) = model_data_utils._extract_features(tracker_features, zero_features_list, INTENT)
+    ) = model_data_utils._extract_features(tracker_features, fake_features_list, INTENT)
     expected_mask = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
 
     assert np.all(np.squeeze(np.array(attribute_masks), 2) == expected_mask)
-    assert np.array(dense_features[SENTENCE]).shape[-1] == zero_features.shape[-1]
+    assert np.array(dense_features[SENTENCE]).shape[-1] == fake_features.shape[-1]
     assert sparse_features == {}
 
 

From ee85c17eb0d442dcc0cbb6064f8d30ca36f8ad0c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 13 Nov 2020 10:29:42 +0100
Subject: [PATCH 57/62] pre compute dialogue_indices

---
 examples/e2ebot/config.yml       |  4 +-
 rasa/core/policies/ted_policy.py | 84 +++++++++++++++++---------------
 rasa/utils/tensorflow/models.py  |  2 +-
 3 files changed, 47 insertions(+), 43 deletions(-)

diff --git a/examples/e2ebot/config.yml b/examples/e2ebot/config.yml
index f38558adb0ad..2d4a08aa4ae6 100644
--- a/examples/e2ebot/config.yml
+++ b/examples/e2ebot/config.yml
@@ -9,8 +9,8 @@ pipeline:
     analyzer: char_wb
     min_ngram: 1
     max_ngram: 4
-  - name: DIETClassifier
-    epochs: 200
+#  - name: DIETClassifier
+#    epochs: 200
 policies:
 - name: TEDPolicy
   epochs: 200
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 02c9f06cebda..812a04fc4478 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -868,6 +868,24 @@ def _prepare_encoding_layers(self, name: Text) -> None:
 
     # ---GRAPH BUILDING HELPERS---
 
+    @staticmethod
+    def _compute_dialogue_indices(
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]
+    ) -> None:
+        dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32)
+        # wrap in a list, because that's the structure of tf_batch_data
+        tf_batch_data[DIALOGUE][IDS] = [
+            (
+                tf.map_fn(
+                    tf.range,
+                    dialogue_lengths,
+                    fn_output_signature=tf.RaggedTensorSpec(
+                        shape=[None], dtype=tf.int32
+                    ),
+                )
+            ).values
+        ]
+
     def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
         all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0]
         # labels cannot have all features "fake"
@@ -899,7 +917,7 @@ def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]:
 
         return all_label_ids, all_labels_embed
 
-    def _emebed_dialogue(
+    def _embed_dialogue(
         self,
         dialogue_in: tf.Tensor,
         tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
@@ -1047,20 +1065,7 @@ def _encode_real_features_per_attribute(
                 text_sequence_lengths = sequence_lengths
 
                 if self.use_only_last_dialogue_turn:
-                    # get the location of all last dialogue inputs
-                    dialogue_lengths = tf.cast(
-                        tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
-                    )
-                    # TODO precompute dialogue_indices after creation of tf_batch_data
-                    dialogue_indices = (
-                        tf.map_fn(
-                            tf.range,
-                            dialogue_lengths,
-                            fn_output_signature=tf.RaggedTensorSpec(
-                                shape=[None], dtype=tf.int32
-                            ),
-                        )
-                    ).values
+                    # Get the location of all last dialogue inputs.
                     # Since use_only_last_dialogue_turn is True,
                     # we need to find the locations of last dialogue turns in
                     # (combined batch dimension and dialogue length,) dimension,
@@ -1088,7 +1093,10 @@ def _encode_real_features_per_attribute(
                     last_dialogue_mask = tf.math.logical_not(
                         tf.cast(
                             tf.concat(
-                                [dialogue_indices, tf.zeros((1,), dtype=tf.int32)],
+                                [
+                                    tf_batch_data[DIALOGUE][IDS][0],
+                                    tf.zeros((1,), dtype=tf.int32),
+                                ],
                                 axis=0,
                             )[1:],
                             dtype=tf.bool,
@@ -1128,23 +1136,12 @@ def _encode_real_features_per_attribute(
                 attribute_features
             )
 
-        # attribute_mask has shape batch x dialogue_len x 1
-        attribute_mask = tf_batch_data[attribute][MASK][0]
-
-        if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES:
-            dialogue_lengths = tf.cast(
-                tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
-            )
-        else:
-            # for labels, dialogue length is a fake dim and equal to 1
-            dialogue_lengths = tf.ones((tf.shape(attribute_mask)[0],), dtype=tf.int32)
-
         # attribute features have shape
         # (combined batch dimension and dialogue length x 1 x units)
         # convert them back to their original shape of
         # batch size x dialogue length x units
         attribute_features = self._convert_to_original_shape(
-            attribute_features, attribute_mask, dialogue_lengths
+            attribute_features, tf_batch_data, attribute
         )
 
         return attribute_features, text_transformer_output, text_sequence_lengths
@@ -1152,8 +1149,8 @@ def _encode_real_features_per_attribute(
     @staticmethod
     def _convert_to_original_shape(
         attribute_features: tf.Tensor,
-        attribute_mask: tf.Tensor,
-        dialogue_lengths: tf.Tensor,
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]],
+        attribute: Text,
     ) -> tf.Tensor:
         """Transform attribute features back to original shape.
 
@@ -1178,6 +1175,19 @@ def _convert_to_original_shape(
         # mapping the values of attribute features to the position in the resulting
         # tensor.
 
+        # attribute_mask has shape batch x dialogue_len x 1
+        attribute_mask = tf_batch_data[attribute][MASK][0]
+
+        if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES:
+            dialogue_lengths = tf.cast(
+                tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
+            )
+            dialogue_indices = tf_batch_data[DIALOGUE][IDS][0]
+        else:
+            # for labels, dialogue length is a fake dim and equal to 1
+            dialogue_lengths = tf.ones((tf.shape(attribute_mask)[0],), dtype=tf.int32)
+            dialogue_indices = tf.zeros((tf.shape(attribute_mask)[0],), dtype=tf.int32)
+
         batch_dim = tf.shape(attribute_mask)[0]
         dialogue_dim = tf.shape(attribute_mask)[1]
         units = attribute_features.shape[-1]
@@ -1188,14 +1198,6 @@ def _convert_to_original_shape(
         non_fake_dialogue_lengths = tf.reduce_sum(attribute_mask, axis=-1)
         # create the batch indices
         batch_indices = tf.repeat(tf.range(batch_dim), non_fake_dialogue_lengths)
-        # TODO precompute dialogue_indices after creation of tf_batch_data
-        dialogue_indices = (
-            tf.map_fn(
-                tf.range,
-                dialogue_lengths,
-                fn_output_signature=tf.RaggedTensorSpec(shape=[None], dtype=tf.int32),
-            )
-        ).values
 
         # attribute_mask has shape (batch x dialogue_len x 1), while
         # dialogue_indices has shape (combined_dialogue_len,)
@@ -1427,6 +1429,7 @@ def batch_loss(
             The loss of the given batch.
         """
         tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature)
+        self._compute_dialogue_indices(tf_batch_data)
 
         all_label_ids, all_labels_embed = self._create_all_labels_embed()
 
@@ -1442,7 +1445,7 @@ def batch_loss(
             dialogue_embed,
             dialogue_mask,
             dialogue_transformer_output,
-        ) = self._emebed_dialogue(dialogue_in, tf_batch_data)
+        ) = self._embed_dialogue(dialogue_in, tf_batch_data)
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
         losses = []
@@ -1501,6 +1504,7 @@ def batch_predict(
         tf_batch_data = self.batch_to_model_data_format(
             batch_in, self.predict_data_signature
         )
+        self._compute_dialogue_indices(tf_batch_data)
 
         (
             dialogue_in,
@@ -1511,7 +1515,7 @@ def batch_predict(
             dialogue_embed,
             dialogue_mask,
             dialogue_transformer_output,
-        ) = self._emebed_dialogue(dialogue_in, tf_batch_data)
+        ) = self._embed_dialogue(dialogue_in, tf_batch_data)
         dialogue_mask = tf.squeeze(dialogue_mask, axis=-1)
 
         sim_all = self._tf_layers[f"loss.{LABEL}"].sim(
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 50e4903814c0..2bad5f180e06 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -186,7 +186,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = False,
+        eager: bool = True,
     ) -> None:
         """Fit model data"""
 

From fc48d4af957915c469365c0dcc641a60dc619b39 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 13 Nov 2020 10:37:56 +0100
Subject: [PATCH 58/62] create helper methods

---
 rasa/core/policies/ted_policy.py | 107 ++++++++++++++++---------------
 1 file changed, 56 insertions(+), 51 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 812a04fc4478..5e24475825d6 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -690,7 +690,7 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy":
             config=meta,
             # during prediction we don't care about previous dialogue turns,
             # so to save computation time, use only the last one
-            use_only_last_dialogue_turn=True,
+            use_only_last_dialogue_turns=True,
             label_data=label_data,
             entity_tag_specs=entity_tag_specs,
         )
@@ -724,13 +724,13 @@ def __init__(
         self,
         data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]],
         config: Dict[Text, Any],
-        use_only_last_dialogue_turn: bool,
+        use_only_last_dialogue_turns: bool,
         label_data: RasaModelData,
         entity_tag_specs: Optional[List[EntityTagSpec]],
     ) -> None:
         super().__init__("TED", config, data_signature, label_data)
 
-        self.use_only_last_dialogue_turn = use_only_last_dialogue_turn
+        self.use_only_last_dialogue_turns = use_only_last_dialogue_turns
 
         self.predict_data_signature = {
             feature_name: features
@@ -931,7 +931,7 @@ def _embed_dialogue(
         )
         dialogue_transformed = tfa.activations.gelu(dialogue_transformed)
 
-        if self.use_only_last_dialogue_turn:
+        if self.use_only_last_dialogue_turns:
             # pick last vector if max history featurizer is used
             dialogue_transformed = tf.expand_dims(
                 self._last_token(dialogue_transformed, dialogue_lengths), 1
@@ -1014,6 +1014,49 @@ def _encode_fake_features_per_attribute(
 
         return attribute_features, text_transformer_output, text_sequence_lengths
 
+    @staticmethod
+    def _create_last_dialogue_turns_mask(
+        tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
+    ) -> tf.Tensor:
+        # Since use_only_last_dialogue_turns is True,
+        # we need to find the locations of last dialogue turns in
+        # (combined batch dimension and dialogue length,) dimension,
+        # so that we can use the sequence lengths as a boolean mask to pick
+        # which ones are "real" textual input in these last dialogue turns.
+
+        # In order to do that we can use given `dialogue_lengths`.
+        # For example:
+        # If we have `dialogue_lengths = [2, 1, 3]`, then
+        # `dialogue_indices = [0, 1, 0, 0, 1, 2]` here we can spot that `0`
+        # always indicates the first dialogue turn,
+        # which means that previous dialogue turn is the last dialogue turn.
+        # Combining this with the fact that the last element in
+        # `dialogue_indices` is always the last dialogue turn, we can add
+        # a `0` to the end, getting
+        # `_dialogue_indices = [0, 1, 0, 0, 1, 2, 0]`.
+        # Then removing the first element
+        # `_last_dialogue_turn_inverse_indicator = [1, 0, 0, 1, 2, 0]`
+        # we see that `0` points to the last dialogue turn.
+        # We convert all positive numbers to `True` and take
+        # the inverse mask to get
+        # `last_dialogue_mask = [0, 1, 1, 0, 0, 1]`,
+        # which precisely corresponds to the fact that first dialogue is of
+        # length 2, the second 1 and the third 3.
+        last_dialogue_turn_mask = tf.math.logical_not(
+            tf.cast(
+                tf.concat(
+                    [tf_batch_data[DIALOGUE][IDS][0], tf.zeros((1,), dtype=tf.int32)],
+                    axis=0,
+                )[1:],
+                dtype=tf.bool,
+            )
+        )
+        # get only the indices of real inputs
+        return tf.boolean_mask(
+            last_dialogue_turn_mask,
+            tf.reshape(tf_batch_data[attribute][SEQUENCE_LENGTH][0], (-1,)),
+        )
+
     def _encode_real_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
@@ -1034,10 +1077,10 @@ def _encode_real_features_per_attribute(
         if attribute in SEQUENCE_FEATURES_TO_ENCODE:
             # sequence_lengths contain `0` for "fake" features, while
             # tf_batch_data[attribute] contain only "real" features
-            _sequence_lengths = tf_batch_data[attribute][SEQUENCE_LENGTH][0]
+            sequence_lengths = tf_batch_data[attribute][SEQUENCE_LENGTH][0]
             # extract only nonzero lengths and cast to int
             sequence_lengths = tf.cast(
-                tf.boolean_mask(_sequence_lengths, _sequence_lengths), dtype=tf.int32
+                tf.boolean_mask(sequence_lengths, sequence_lengths), dtype=tf.int32
             )
             # boolean mask returns flat tensor
             sequence_lengths = tf.expand_dims(sequence_lengths, axis=-1)
@@ -1064,55 +1107,17 @@ def _encode_real_features_per_attribute(
                 text_transformer_output = attribute_features
                 text_sequence_lengths = sequence_lengths
 
-                if self.use_only_last_dialogue_turn:
-                    # Get the location of all last dialogue inputs.
-                    # Since use_only_last_dialogue_turn is True,
-                    # we need to find the locations of last dialogue turns in
-                    # (combined batch dimension and dialogue length,) dimension,
-                    # so that we can use `_sequence_lengths` as a boolean  mask to pick
-                    # which ones are "real" textual input in these last dialogue turns.
-
-                    # In order to do that we can use given `dialogue_lengths`.
-                    # For example:
-                    # If we have `dialogue_lengths = [2, 1, 3]`, than
-                    # `dialogue_indices = [0, 1, 0, 0, 1, 2]` here we can spot that `0`
-                    # always indicates the first dialogue turn,
-                    # which means that previous dialogue turn is the last dialogue turn.
-                    # Combining this with the fact that the last element in
-                    # `dialogue_indices` is always the last dialogue turn, we can add
-                    # a `0` to the end, getting
-                    # `_dialogue_indices = [0, 1, 0, 0, 1, 2, 0]`.
-                    # Then removing the first element
-                    # `_last_dialogue_turn_inverse_indicator = [1, 0, 0, 1, 2, 0]`
-                    # we see that `0` points to the last dialogue turn.
-                    # We convert all positive numbers to `True` and take
-                    # the inverse mask to get
-                    # `last_dialogue_mask = [0, 1, 1, 0, 0, 1],
-                    # which precisely corresponds to the fact that first dialogue is of
-                    # length 2, the second 1 and the third 3.
-                    last_dialogue_mask = tf.math.logical_not(
-                        tf.cast(
-                            tf.concat(
-                                [
-                                    tf_batch_data[DIALOGUE][IDS][0],
-                                    tf.zeros((1,), dtype=tf.int32),
-                                ],
-                                axis=0,
-                            )[1:],
-                            dtype=tf.bool,
-                        )
-                    )
-
-                    # get only the indices of real text inputs
-                    last_dialogue_mask = tf.boolean_mask(
-                        last_dialogue_mask, tf.reshape(_sequence_lengths, (-1,))
+                if self.use_only_last_dialogue_turns:
+                    # get the location of all last dialogue inputs
+                    last_dialogue_turns_mask = self._create_last_dialogue_turns_mask(
+                        tf_batch_data, attribute
                     )
                     # pick last vector if max history featurizer is used
                     text_transformer_output = tf.boolean_mask(
-                        text_transformer_output, last_dialogue_mask
+                        text_transformer_output, last_dialogue_turns_mask
                     )
                     text_sequence_lengths = tf.boolean_mask(
-                        text_sequence_lengths, last_dialogue_mask
+                        text_sequence_lengths, last_dialogue_turns_mask
                     )
 
             # resulting attribute features will have shape
@@ -1306,7 +1311,7 @@ def _reshape_for_entities(
         attribute_mask = tf_batch_data[TEXT][MASK][0]
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
 
-        if self.use_only_last_dialogue_turn:
+        if self.use_only_last_dialogue_turns:
             # pick last vector if max history featurizer is used
             attribute_mask = tf.expand_dims(
                 self._last_token(attribute_mask, dialogue_lengths), axis=1

From 419f90c65613fe853e22f0765c8182167c012550 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 13 Nov 2020 11:41:26 +0100
Subject: [PATCH 59/62] calculate number of units for text_transformer_output

---
 rasa/core/policies/ted_policy.py | 70 ++++++++++++++++++++++----------
 rasa/utils/tensorflow/models.py  |  2 +-
 2 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 5e24475825d6..2f213859674d 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -259,8 +259,6 @@ class TEDPolicy(Policy):
         # By default all features in the pipeline are used.
         FEATURIZERS: [],
         # If set to true, entities are predicted in user utterances.
-        # TODO Do not communicate this option to users yet as we have to run some
-        #   experiments first.
         ENTITY_RECOGNITION: True,
     }
 
@@ -968,43 +966,71 @@ def _encode_features_per_attribute(
             lambda: self._encode_fake_features_per_attribute(tf_batch_data, attribute),
         )
 
+    def _get_dense_units(
+        self, attribute_features_list: List[tf.Tensor], attribute: Text
+    ) -> int:
+        units = 0
+        for f in attribute_features_list:
+            if isinstance(f, tf.SparseTensor):
+                units += self.config[DENSE_DIMENSION][attribute]
+            else:
+                units += f.shape[-1]
+        return units
+
+    def _get_concat_units(
+        self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
+    ) -> int:
+        # calculate concat sequence sentence dim
+        sentence_units = self._get_dense_units(
+            tf_batch_data[attribute][SENTENCE], attribute
+        )
+        sequence_units = self._get_dense_units(
+            tf_batch_data[attribute][SEQUENCE], attribute
+        )
+
+        if sequence_units and not sentence_units:
+            return sequence_units
+
+        if sentence_units and not sequence_units:
+            return sentence_units
+
+        if sentence_units != sequence_units:
+            return self.config[CONCAT_DIMENSION][TEXT]
+
+        return sentence_units
+
     def _encode_fake_features_per_attribute(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
     ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
-        attribute_features_list = tf_batch_data[attribute][SENTENCE]
+        # we need to create real zero tensors with appropriate batch and dialogue dim
+        # because they are passed to dialogue transformer
         attribute_mask = tf_batch_data[attribute][MASK][0]
 
         batch_dim = tf.shape(attribute_mask)[0]
         dialogue_dim = tf.shape(attribute_mask)[1]
-
         if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE):
             units = self.config[ENCODING_DIMENSION]
         else:
-            units = 0
-            for f in attribute_features_list:
-                if isinstance(f, tf.SparseTensor):
-                    units += self.config[DENSE_DIMENSION][attribute]
-                else:
-                    units += f.shape[-1]
+            units = self._get_dense_units(tf_batch_data[attribute][SENTENCE], attribute)
 
         attribute_features = tf.zeros(
             (batch_dim, dialogue_dim, units), dtype=tf.float32
         )
         if attribute == TEXT:
-            # TODO handle the case if transformer is not created
-            # if self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"] > 0:
-            #     units = self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"]
-            # elif self.config[HIDDEN_LAYERS_SIZES][TEXT]:
-            #     units = self.config[HIDDEN_LAYERS_SIZES][TEXT]
-            # else:
-            #     for f in attribute_features_list:
-            #         if isinstance(f, tf.SparseTensor):
-            #             units += self.config[DENSE_DIMENSION][attribute]
-            #         else:
-            #             units += f.shape[-1]
+            # if the input features are fake, we don't process them further,
+            # but we need to calculate correct last dim (units) so that tf could infer
+            # the last shape of the tensors
+            if self.config[f"{DIALOGUE}_{NUM_TRANSFORMER_LAYERS}"] > 0:
+                text_transformer_units = self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"]
+            elif self.config[HIDDEN_LAYERS_SIZES][TEXT]:
+                text_transformer_units = self.config[HIDDEN_LAYERS_SIZES][TEXT][-1]
+            else:
+                text_transformer_units = self._get_concat_units(
+                    tf_batch_data, attribute
+                )
 
             text_transformer_output = tf.zeros(
-                (0, 0, self.config[f"{DIALOGUE}_{TRANSFORMER_SIZE}"]), dtype=tf.float32
+                (0, 0, text_transformer_units), dtype=tf.float32
             )
             text_sequence_lengths = tf.zeros((0, 1), dtype=tf.int32)
         else:
diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py
index 2bad5f180e06..50e4903814c0 100644
--- a/rasa/utils/tensorflow/models.py
+++ b/rasa/utils/tensorflow/models.py
@@ -186,7 +186,7 @@ def fit(
         batch_strategy: Text,
         silent: bool = False,
         loading: bool = False,
-        eager: bool = True,
+        eager: bool = False,
     ) -> None:
         """Fit model data"""
 

From 229723af95ca640a3a14fc6eea1632dc475406fa Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 13 Nov 2020 11:42:55 +0100
Subject: [PATCH 60/62] add todo

---
 rasa/core/policies/ted_policy.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index 2f213859674d..d9084f053a43 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -969,6 +969,7 @@ def _encode_features_per_attribute(
     def _get_dense_units(
         self, attribute_features_list: List[tf.Tensor], attribute: Text
     ) -> int:
+        # TODO this should be done in corresponding layers once in init
         units = 0
         for f in attribute_features_list:
             if isinstance(f, tf.SparseTensor):
@@ -980,6 +981,7 @@ def _get_dense_units(
     def _get_concat_units(
         self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
     ) -> int:
+        # TODO this should be done in corresponding layers once in init
         # calculate concat sequence sentence dim
         sentence_units = self._get_dense_units(
             tf_batch_data[attribute][SENTENCE], attribute

From 76ca209d6066bcf49ac35ab390194bb17fc1c18c Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 13 Nov 2020 13:29:40 +0100
Subject: [PATCH 61/62] fix tests

---
 .../featurizers/single_state_featurizer.py    |   2 +-
 rasa/core/featurizers/tracker_featurizers.py  |   2 +-
 rasa/utils/tensorflow/layers.py               |  16 +--
 .../test_single_state_featurizers.py          | 101 ++++++------------
 .../featurizers/test_tracker_featurizer.py    |   8 +-
 tests/shared/core/test_domain.py              |  22 ++--
 6 files changed, 62 insertions(+), 89 deletions(-)

diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
index aa93df4c35c5..e702134a0bf6 100644
--- a/rasa/core/featurizers/single_state_featurizer.py
+++ b/rasa/core/featurizers/single_state_featurizer.py
@@ -240,7 +240,7 @@ def encode_state(
 
         return state_features
 
-    def encode_entity(
+    def encode_entities(
         self, entity_data: Dict[Text, Any], interpreter: NaturalLanguageInterpreter
     ) -> Dict[Text, List["Features"]]:
         """Encode the given entity data with the help of the given interpreter.
diff --git a/rasa/core/featurizers/tracker_featurizers.py b/rasa/core/featurizers/tracker_featurizers.py
index 08355883d716..d12b92a42e79 100644
--- a/rasa/core/featurizers/tracker_featurizers.py
+++ b/rasa/core/featurizers/tracker_featurizers.py
@@ -98,7 +98,7 @@ def _create_entity_tags(
     ) -> List[List[Dict[Text, List["Features"]]]]:
         return [
             [
-                self.state_featurizer.encode_entity(entity_data, interpreter)
+                self.state_featurizer.encode_entities(entity_data, interpreter)
                 for entity_data in trackers_entities
             ]
             for trackers_entities in trackers_as_entities
diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py
index 0b0d00e4131a..a9017094e945 100644
--- a/rasa/utils/tensorflow/layers.py
+++ b/rasa/utils/tensorflow/layers.py
@@ -630,13 +630,15 @@ def body(idx: tf.Tensor, out: tf.Tensor) -> List[tf.Tensor]:
         # create first random array of indices
         out1 = rand_idxs()  # (1, num_neg)
 
-        return tf.while_loop(
-            cond,
-            body,
-            loop_vars=[idx1, out1],
-            shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])],
-            parallel_iterations=self.parallel_iterations,
-            back_prop=False,
+        return tf.nest.map_structure(
+            tf.stop_gradient,
+            tf.while_loop(
+                cond,
+                body,
+                loop_vars=[idx1, out1],
+                shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])],
+                parallel_iterations=self.parallel_iterations,
+            ),
         )[1]
 
     @staticmethod
diff --git a/tests/core/featurizers/test_single_state_featurizers.py b/tests/core/featurizers/test_single_state_featurizers.py
index a43028a64784..2f5819e8e659 100644
--- a/tests/core/featurizers/test_single_state_featurizers.py
+++ b/tests/core/featurizers/test_single_state_featurizers.py
@@ -19,6 +19,7 @@
     ENTITY_ATTRIBUTE_VALUE,
     ENTITY_ATTRIBUTE_START,
     ENTITY_ATTRIBUTE_END,
+    ENTITY_TAGS,
 )
 from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS
 from rasa.shared.nlu.interpreter import RegexInterpreter
@@ -186,61 +187,41 @@ def test_single_state_featurizer_with_entity_roles_and_groups(
     from rasa.core.agent import Agent
 
     interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
-
+    # TODO roles and groups are not supported in e2e yet
+    domain = Domain(
+        intents=[],
+        entities=["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
+        slots=[],
+        templates={},
+        forms={},
+        action_names=[],
+    )
     f = SingleStateFeaturizer()
-    f._default_feature_states[INTENT] = {"inform": 0, "greet": 1}
-    f._default_feature_states[ENTITIES] = {
-        "city": 0,
-        "name": 1,
-        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
-        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
-    }
-    f._default_feature_states[ACTION_NAME] = {
-        "utter_ask_where_to": 0,
-        "utter_greet": 1,
-        "action_listen": 2,
-    }
-    f._default_feature_states[SLOTS] = {"slot_1": 0, "slot_2": 1, "slot_3": 2}
-    f._default_feature_states[ACTIVE_LOOP] = {
-        "active_loop_1": 0,
-        "active_loop_2": 1,
-        "active_loop_3": 2,
-        "active_loop_4": 3,
-    }
-    encoded = f.encode_state(
+    f.prepare_from_domain(domain)
+    encoded = f.encode_entities(
         {
-            "user": {
-                "text": "I am flying from London to Paris",
-                "intent": "inform",
-                "entities": [
-                    {
-                        ENTITY_ATTRIBUTE_TYPE: "city",
-                        ENTITY_ATTRIBUTE_VALUE: "London",
-                        ENTITY_ATTRIBUTE_START: 17,
-                        ENTITY_ATTRIBUTE_END: 23,
-                    },
-                    {
-                        ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
-                        ENTITY_ATTRIBUTE_VALUE: "Paris",
-                        ENTITY_ATTRIBUTE_START: 27,
-                        ENTITY_ATTRIBUTE_END: 32,
-                    },
-                ],
-            },
-            "prev_action": {
-                "action_name": "action_listen",
-                "action_text": "throw a ball",
-            },
-            "active_loop": {"name": "active_loop_4"},
-            "slots": {"slot_1": (1.0,)},
+            TEXT: "I am flying from London to Paris",
+            ENTITIES: [
+                {
+                    ENTITY_ATTRIBUTE_TYPE: "city",
+                    ENTITY_ATTRIBUTE_VALUE: "London",
+                    ENTITY_ATTRIBUTE_START: 17,
+                    ENTITY_ATTRIBUTE_END: 23,
+                },
+                {
+                    ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
+                    ENTITY_ATTRIBUTE_VALUE: "Paris",
+                    ENTITY_ATTRIBUTE_START: 27,
+                    ENTITY_ATTRIBUTE_END: 32,
+                },
+            ],
         },
         interpreter=interpreter,
     )
-    # check all the features are encoded and *_text features are encoded by a densefeaturizer
-    assert sorted(list(encoded.keys())) == sorted(
-        [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT]
+    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
+    assert np.all(
+        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
     )
-    assert np.all(encoded[ENTITIES][0].features.toarray() == [1, 0, 1, 0])
 
 
 def test_single_state_featurizer_uses_dtype_float():
@@ -268,7 +249,7 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen(
     interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
 
     f = SingleStateFeaturizer()
-    f._default_feature_states[INTENT] = {"inform": 0, "greet": 1}
+    f._default_feature_states[INTENT] = {"greet": 0, "inform": 1}
     f._default_feature_states[ENTITIES] = {
         "city": 0,
         "name": 1,
@@ -280,7 +261,8 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen(
         "utter_greet": 1,
         "action_listen": 2,
     }
-    f._default_feature_states[SLOTS] = {"slot_1": 0, "slot_2": 1, "slot_3": 2}
+    # the `_0` suffix in slot names represents the feature dimension
+    f._default_feature_states[SLOTS] = {"slot_1_0": 0, "slot_2_0": 1, "slot_3_0": 2}
     f._default_feature_states[ACTIVE_LOOP] = {
         "active_loop_1": 0,
         "active_loop_2": 1,
@@ -292,20 +274,7 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen(
             "user": {
                 "text": "I am flying from London to Paris",
                 "intent": "inform",
-                "entities": [
-                    {
-                        ENTITY_ATTRIBUTE_TYPE: "city",
-                        ENTITY_ATTRIBUTE_VALUE: "London",
-                        ENTITY_ATTRIBUTE_START: 17,
-                        ENTITY_ATTRIBUTE_END: 23,
-                    },
-                    {
-                        ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
-                        ENTITY_ATTRIBUTE_VALUE: "Paris",
-                        ENTITY_ATTRIBUTE_START: 27,
-                        ENTITY_ATTRIBUTE_END: 32,
-                    },
-                ],
+                "entities": ["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
             },
             "prev_action": {
                 "action_name": "action_listen",
@@ -328,7 +297,7 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen(
     assert (
         encoded[ACTION_NAME][0].features != scipy.sparse.coo_matrix([[0, 0, 1]])
     ).nnz == 0
-    assert encoded[ENTITIES][0].features.shape[-1] == 1
+    assert encoded[ENTITIES][0].features.shape[-1] == 4
     assert (encoded[SLOTS][0].features != scipy.sparse.coo_matrix([[1, 0, 0]])).nnz == 0
     assert (
         encoded[ACTIVE_LOOP][0].features != scipy.sparse.coo_matrix([[0, 0, 0, 1]])
diff --git a/tests/core/featurizers/test_tracker_featurizer.py b/tests/core/featurizers/test_tracker_featurizer.py
index 98f323bd3279..f6b904d8397b 100644
--- a/tests/core/featurizers/test_tracker_featurizer.py
+++ b/tests/core/featurizers/test_tracker_featurizer.py
@@ -67,7 +67,7 @@ def test_featurize_trackers_with_full_dialogue_tracker_featurizer(
     tracker = tracker_from_dialogue_file(
         "data/test_dialogues/moodbot.json", moodbot_domain
     )
-    state_features, labels = tracker_featurizer.featurize_trackers(
+    state_features, labels, entity_tags = tracker_featurizer.featurize_trackers(
         [tracker], moodbot_domain, RegexInterpreter()
     )
 
@@ -75,6 +75,8 @@ def test_featurize_trackers_with_full_dialogue_tracker_featurizer(
     assert len(state_features) > 0
     assert labels is not None
     assert len(labels) > 0
+    # moodbot doesn't contain e2e entities
+    assert not any([any(turn_tags) for turn_tags in entity_tags])
 
 
 def test_featurize_trackers_with_max_history_tracker_featurizer(moodbot_domain: Domain):
@@ -84,7 +86,7 @@ def test_featurize_trackers_with_max_history_tracker_featurizer(moodbot_domain:
     tracker = tracker_from_dialogue_file(
         "data/test_dialogues/moodbot.json", moodbot_domain
     )
-    state_features, labels = tracker_featurizer.featurize_trackers(
+    state_features, labels, entity_tags = tracker_featurizer.featurize_trackers(
         [tracker], moodbot_domain, RegexInterpreter()
     )
 
@@ -92,3 +94,5 @@ def test_featurize_trackers_with_max_history_tracker_featurizer(moodbot_domain:
     assert len(state_features) > 0
     assert labels is not None
     assert len(labels) > 0
+    # moodbot doesn't contain e2e entities
+    assert not any([any(turn_tags) for turn_tags in entity_tags])
diff --git a/tests/shared/core/test_domain.py b/tests/shared/core/test_domain.py
index 800f04ffd5a1..630542fa44e4 100644
--- a/tests/shared/core/test_domain.py
+++ b/tests/shared/core/test_domain.py
@@ -75,7 +75,7 @@ async def test_create_train_data_no_history(default_domain: Domain):
     assert hashed == [
         "[{}]",
         '[{"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}]',
-        '[{"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
+        '[{"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
         '[{"prev_action": {"action_name": "utter_goodbye"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
@@ -83,7 +83,7 @@ async def test_create_train_data_no_history(default_domain: Domain):
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
-        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
+        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
     ]
 
 
@@ -104,13 +104,13 @@ async def test_create_train_data_with_history(default_domain: Domain):
     hashed = sorted(hashed)
 
     assert hashed == [
-        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
+        '[{"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}, {"prev_action": {"action_name": "utter_goodbye"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}]',
         '[{"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "utter_default"}, "user": {"intent": "default"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}]',
-        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
-        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
-        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": [{"end": 22, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
+        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"intent": "default"}}]',
+        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
+        '[{}, {"prev_action": {"action_name": "action_listen"}, "slots": {"name": [1.0]}, "user": {"entities": ["name"], "intent": "greet"}}]',
         '[{}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "default"}}]',
         '[{}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}, {"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}]',
         '[{}, {"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}]',
@@ -156,7 +156,7 @@ async def test_create_train_data_unfeaturized_entities():
     assert hashed == [
         "[{}]",
         '[{"prev_action": {"action_name": "utter_greet"}, "user": {"intent": "greet"}}]',
-        '[{"prev_action": {"action_name": "utter_greet"}, "user": {"entities": [{"end": 81, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
+        '[{"prev_action": {"action_name": "utter_greet"}, "user": {"entities": ["name"], "intent": "greet"}}]',
         '[{"prev_action": {"action_name": "utter_goodbye"}, "user": {"intent": "goodbye"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "user": {"intent": "why"}}]',
         '[{"prev_action": {"action_name": "utter_default"}, "user": {"intent": "thank"}}]',
@@ -166,9 +166,9 @@ async def test_create_train_data_unfeaturized_entities():
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "thank"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "greet"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "goodbye"}}]',
-        '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": [{"end": 81, "entity": "name", "start": 5, "value": "Peter"}], "intent": "greet"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": [], "intent": "default"}}]',
         '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": [], "intent": "ask"}}]',
+        '[{"prev_action": {"action_name": "action_listen"}, "user": {"entities": ["name"], "intent": "greet"}}]',
     ]
 
 
@@ -1057,7 +1057,7 @@ def test_get_featurized_entities():
 
     featurized_entities = domain._get_featurized_entities(user_uttered)
 
-    assert featurized_entities == []
+    assert featurized_entities == set()
 
     user_uttered = UserUttered(
         text="I am going to London",
@@ -1067,6 +1067,4 @@ def test_get_featurized_entities():
 
     featurized_entities = domain._get_featurized_entities(user_uttered)
 
-    assert featurized_entities == [
-        {"entity": "GPE", "role": "destination", "value": "London"}
-    ]
+    assert featurized_entities == {"GPE", f"GPE{ENTITY_LABEL_SEPARATOR}destination"}

From ce4098e55ead338d3b164ed9f292013a43998b95 Mon Sep 17 00:00:00 2001
From: Vova Vv <mr.voov@gmail.com>
Date: Fri, 13 Nov 2020 17:21:44 +0100
Subject: [PATCH 62/62] use indices constant

---
 examples/e2ebot/config.yml       |  4 ++--
 examples/e2ebot/domain.yml       |  2 +-
 rasa/core/policies/ted_policy.py | 10 +++++++---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/examples/e2ebot/config.yml b/examples/e2ebot/config.yml
index 2d4a08aa4ae6..f38558adb0ad 100644
--- a/examples/e2ebot/config.yml
+++ b/examples/e2ebot/config.yml
@@ -9,8 +9,8 @@ pipeline:
     analyzer: char_wb
     min_ngram: 1
     max_ngram: 4
-#  - name: DIETClassifier
-#    epochs: 200
+  - name: DIETClassifier
+    epochs: 200
 policies:
 - name: TEDPolicy
   epochs: 200
diff --git a/examples/e2ebot/domain.yml b/examples/e2ebot/domain.yml
index d884f4cc40c3..5c35c3d83a7b 100644
--- a/examples/e2ebot/domain.yml
+++ b/examples/e2ebot/domain.yml
@@ -1,5 +1,5 @@
 version: "2.0"
-
+# TODO: rework this example into a bot that makes sense
 actions:
  - utter_greet
  - utter_happy
diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py
index d9084f053a43..b228ea4ec2e8 100644
--- a/rasa/core/policies/ted_policy.py
+++ b/rasa/core/policies/ted_policy.py
@@ -108,6 +108,7 @@
 LABEL_KEY = LABEL
 LABEL_SUB_KEY = IDS
 LENGTH = "length"
+INDICES = "indices"
 SENTENCE_FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT]
 SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT, f"{LABEL}_{ACTION_TEXT}"]
 LABEL_FEATURES_TO_ENCODE = [f"{LABEL}_{ACTION_NAME}", f"{LABEL}_{ACTION_TEXT}"]
@@ -872,7 +873,7 @@ def _compute_dialogue_indices(
     ) -> None:
         dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32)
         # wrap in a list, because that's the structure of tf_batch_data
-        tf_batch_data[DIALOGUE][IDS] = [
+        tf_batch_data[DIALOGUE][INDICES] = [
             (
                 tf.map_fn(
                     tf.range,
@@ -1073,7 +1074,10 @@ def _create_last_dialogue_turns_mask(
         last_dialogue_turn_mask = tf.math.logical_not(
             tf.cast(
                 tf.concat(
-                    [tf_batch_data[DIALOGUE][IDS][0], tf.zeros((1,), dtype=tf.int32)],
+                    [
+                        tf_batch_data[DIALOGUE][INDICES][0],
+                        tf.zeros((1,), dtype=tf.int32),
+                    ],
                     axis=0,
                 )[1:],
                 dtype=tf.bool,
@@ -1215,7 +1219,7 @@ def _convert_to_original_shape(
             dialogue_lengths = tf.cast(
                 tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32
             )
-            dialogue_indices = tf_batch_data[DIALOGUE][IDS][0]
+            dialogue_indices = tf_batch_data[DIALOGUE][INDICES][0]
         else:
             # for labels, dialogue length is a fake dim and equal to 1
             dialogue_lengths = tf.ones((tf.shape(attribute_mask)[0],), dtype=tf.int32)