invert dialogue time in ted #7867

Merged
14 commits merged on Feb 5, 2021
2 changes: 2 additions & 0 deletions changelog/7867.bugfix.md
@@ -0,0 +1,2 @@
Fix the role of `unidirectional_encoder` in TED. This parameter is only applied to
transformers for `text`, `action_text`, and `label_action_text`.
3 changes: 3 additions & 0 deletions changelog/7867.improvement.md
@@ -0,0 +1,3 @@
If `MaxHistoryTrackerFeaturizer` is used, invert the dialogue sequence before passing
it to the transformer so that the last dialogue input becomes the first one and
therefore always has the same positional encoding.
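A minimal sketch of this inversion in plain TensorFlow (toy values, not Rasa code): `tf.reverse_sequence` flips only the real, unpadded part of each dialogue, so the most recent turn always lands at position 0 and therefore always receives the same positional encoding.

```python
import tensorflow as tf

# Toy batch: 2 dialogues padded to length 4, feature dimension 1.
# Values encode the turn index so the effect of the reversal is visible.
dialogue_in = tf.constant(
    [[[1.0], [2.0], [3.0], [0.0]],   # dialogue of length 3, one padding step
     [[1.0], [2.0], [0.0], [0.0]]]   # dialogue of length 2, two padding steps
)
dialogue_lengths = tf.constant([3, 2], dtype=tf.int32)

# Reverse only the unpadded prefix of every dialogue.
reversed_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

print(reversed_in[:, 0, 0].numpy())  # [3. 2.]: the last turn of each dialogue
                                     # now sits at position 0
```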
3 changes: 3 additions & 0 deletions docs/docs/policies.mdx
@@ -240,6 +240,9 @@ However, additional parameters exist that can be adapted.
+---------------------------------------+------------------------+--------------------------------------------------------------+
| number_of_attention_heads | 4 | Number of self-attention heads in transformers. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| unidirectional_encoder | True | Use a unidirectional or bidirectional encoder |
| | | for `text`, `action_text`, and `label_action_text`. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| use_key_relative_attention | False | If 'True' use key relative embeddings in attention. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| use_value_relative_attention | False | If 'True' use value relative embeddings in attention. |
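A usage sketch for this parameter, assuming the usual path where constructor kwargs are merged into the defaults via `_load_params(**kwargs)` (visible in the `ted_policy.py` diff below); the variable name `policy` is illustrative:

```python
from rasa.core.policies.ted_policy import TEDPolicy

# Keep a unidirectional encoder for `text`, `action_text`, and
# `label_action_text`; the override travels through `_load_params(**kwargs)`.
policy = TEDPolicy(unidirectional_encoder=True)
```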
58 changes: 35 additions & 23 deletions rasa/core/policies/ted_policy.py
@@ -190,8 +190,9 @@ class TEDPolicy(Policy):
VALUE_RELATIVE_ATTENTION: False,
# Max position for relative embeddings
MAX_RELATIVE_POSITION: None,
# Use a unidirectional or bidirectional encoder.
UNIDIRECTIONAL_ENCODER: True,
# Use a unidirectional or bidirectional encoder
# for `text`, `action_text`, and `label_action_text`.
UNIDIRECTIONAL_ENCODER: False,
# ## Training parameters
# Initial and final batch sizes:
# Batch size will be linearly increased for each epoch.
@@ -316,11 +317,6 @@ def __init__(
super().__init__(
featurizer, priority, should_finetune=should_finetune, **kwargs
)
if isinstance(featurizer, FullDialogueTrackerFeaturizer):
self.is_full_dialogue_featurizer_used = True
else:
self.is_full_dialogue_featurizer_used = False

self._load_params(**kwargs)

self.model = model
@@ -803,9 +799,9 @@ def load(
model_data_example,
data_signature=model_data_example.get_signature(),
config=meta,
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
use_only_last_dialogue_turns=True,
max_history_featurizer_is_used=isinstance(
featurizer, MaxHistoryTrackerFeaturizer
),
label_data=label_data,
entity_tag_specs=entity_tag_specs,
finetune_mode=should_finetune,
@@ -842,7 +838,7 @@ def __init__(
self,
data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]],
config: Dict[Text, Any],
use_only_last_dialogue_turns: bool,
max_history_featurizer_is_used: bool,
label_data: RasaModelData,
entity_tag_specs: Optional[List[EntityTagSpec]],
) -> None:
@@ -851,13 +847,14 @@ def __init__(
Args:
data_signature: the data signature of the input data
config: the model configuration
use_only_last_dialogue_turns: if 'True' only the last dialogue turn will be used
max_history_featurizer_is_used: if 'True'
only the last dialogue turn will be used
label_data: the label data
entity_tag_specs: the entity tag specifications
"""
super().__init__("TED", config, data_signature, label_data)

self.use_only_last_dialogue_turns = use_only_last_dialogue_turns
self.max_history_featurizer_is_used = max_history_featurizer_is_used

self.predict_data_signature = {
feature_name: features
@@ -925,6 +922,11 @@ def _prepare_layers(self) -> None:
self.config[TRANSFORMER_SIZE][DIALOGUE],
self.config[DROP_RATE_DIALOGUE],
self.config[DROP_RATE_ATTENTION],
# use a bidirectional transformer because we will invert the dialogue
# sequence so that the last turn is located at the first position and
# always has exactly the same positional encoding
unidirectional=not self.max_history_featurizer_is_used,
)

self._prepare_embed_layers(DIALOGUE)
@@ -1062,13 +1064,24 @@ def _embed_dialogue(
dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
mask = self._compute_mask(dialogue_lengths)

if self.max_history_featurizer_is_used:
# invert the dialogue sequence so that the last turn always has
# exactly the same positional encoding
dialogue_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

dialogue_transformed, attention_weights = self._tf_layers[
f"transformer.{DIALOGUE}"
](dialogue_in, 1 - mask, self._training)
dialogue_transformed = tfa.activations.gelu(dialogue_transformed)

if self.use_only_last_dialogue_turns:
# pick last vector if max history featurizer is used
if self.max_history_featurizer_is_used:
# pick the last vector if the max history featurizer is used; since we
# inverted the dialogue sequence, the last vector is actually the first one
dialogue_transformed = dialogue_transformed[:, :1, :]
mask = tf.expand_dims(self._last_token(mask, dialogue_lengths), 1)
elif not self._training:
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
dialogue_transformed = tf.expand_dims(
self._last_token(dialogue_transformed, dialogue_lengths), 1
)
@@ -1184,7 +1197,7 @@ def _encode_fake_features_per_attribute(
def _create_last_dialogue_turns_mask(
tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
) -> tf.Tensor:
# Since use_only_last_dialogue_turns is True,
# Since max_history_featurizer_is_used is True,
# we need to find the locations of last dialogue turns in
# (combined batch dimension and dialogue length,) dimension,
# so that we can use `_sequence_lengths` as a boolean mask to pick
@@ -1234,7 +1247,7 @@ def _encode_real_features_per_attribute(
Args:
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)
(e.g., ACTION_NAME, INTENT)

Returns:
A tensor combining all features for `attribute`
@@ -1277,7 +1290,7 @@ def _encode_real_features_per_attribute(
text_transformer_output = attribute_features
text_sequence_lengths = sequence_lengths

if self.use_only_last_dialogue_turns:
if self.max_history_featurizer_is_used:
# get the location of all last dialogue inputs
last_dialogue_turns_mask = self._create_last_dialogue_turns_mask(
tf_batch_data, attribute
@@ -1334,10 +1347,9 @@ def _convert_to_original_shape(

Args:
attribute_features: the "real" features to convert
attribute_mask: the tensor containing the position of "real" features
in the dialogue, shape is (batch-size x dialogue_len x 1)
dialogue_lengths: the tensor containing the actual dialogue length,
shape is (batch-size,)
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)

Returns:
The converted attribute features
@@ -1483,7 +1495,7 @@ def _reshape_for_entities(
attribute_mask = tf_batch_data[TEXT][MASK][0]
dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)

if self.use_only_last_dialogue_turns:
if self.max_history_featurizer_is_used:
# pick outputs that correspond to the last dialogue turns
attribute_mask = tf.expand_dims(
self._last_token(attribute_mask, dialogue_lengths), axis=1
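To see why `dialogue_transformed[:, :1, :]` picks the last real turn after the inversion: position 0 of the reversed sequence equals a gather at index `length - 1` of the original sequence. A self-contained check in plain TensorFlow (illustrative names, not Rasa code):

```python
import tensorflow as tf

batch_size, max_len, units = 2, 4, 3
dialogue_in = tf.random.normal((batch_size, max_len, units))
dialogue_lengths = tf.constant([4, 2], dtype=tf.int32)

# Reverse only the unpadded prefix of each dialogue.
reversed_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

# Position 0 of the reversed sequence is the last real turn of the original.
first_after_reversal = reversed_in[:, 0, :]
last_real_turn = tf.gather(dialogue_in, dialogue_lengths - 1, batch_dims=1)

tf.debugging.assert_near(first_after_reversal, last_real_turn)
```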
4 changes: 3 additions & 1 deletion rasa/utils/tensorflow/models.py
@@ -755,6 +755,7 @@ def _prepare_transformer_layer(
units: int,
drop_rate: float,
drop_rate_attention: float,
unidirectional: bool,
prefix: Text = "transformer",
):
if num_layers > 0:
@@ -767,7 +768,7 @@ def _prepare_transformer_layer(
dropout_rate=drop_rate,
attention_dropout_rate=drop_rate_attention,
sparsity=self.config[WEIGHT_SPARSITY],
unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
unidirectional=unidirectional,
use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
max_relative_position=self.config[MAX_RELATIVE_POSITION],
@@ -872,6 +873,7 @@ def _prepare_sequence_layers(self, name: Text) -> None:
size,
self.config[DROP_RATE],
self.config[DROP_RATE_ATTENTION],
self.config[UNIDIRECTIONAL_ENCODER],
)

def _prepare_entity_recognition_layers(self) -> None:
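For context on the `unidirectional` argument that is now passed explicitly: a unidirectional (causal) encoder restricts attention so each position only sees itself and earlier positions, while a bidirectional encoder attends over the full sequence. A generic illustration of the two attention masks (not Rasa's transformer implementation):

```python
import tensorflow as tf

seq_len = 4

# Bidirectional: every position may attend to every other position.
bidirectional_mask = tf.ones((seq_len, seq_len))

# Unidirectional (causal): position i may only attend to positions <= i.
causal_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)

print(causal_mask.numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
```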