invert dialogue time in ted #7867

Merged
14 commits merged on Feb 5, 2021
2 changes: 2 additions & 0 deletions changelog/7867.bugfix.md
@@ -0,0 +1,2 @@
Fix the role of `unidirectional_encoder` in TED. This parameter is only applied to
transformers for `text`, `action_text`, and `label_action_text`.
3 changes: 3 additions & 0 deletions changelog/7867.improvement.md
@@ -0,0 +1,3 @@
If `MaxHistoryTrackerFeaturizer` is used, invert the dialogue sequence before passing
it to the transformer so that the last dialogue input becomes the first one and
therefore always has the same positional encoding.
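A minimal sketch of this inversion in plain TensorFlow (toy values, not Rasa code): `tf.reverse_sequence` flips only the real, unpadded part of each dialogue, so the most recent turn always lands at position 0 and therefore always receives the same positional encoding.

```python
import tensorflow as tf

# Toy batch: 2 dialogues padded to length 4, feature dimension 1.
# Values encode the turn index so the effect of the reversal is visible.
dialogue_in = tf.constant(
    [[[1.0], [2.0], [3.0], [0.0]],   # dialogue of length 3, one padding step
     [[1.0], [2.0], [0.0], [0.0]]]   # dialogue of length 2, two padding steps
)
dialogue_lengths = tf.constant([3, 2], dtype=tf.int32)

# Reverse only the unpadded prefix of every dialogue.
reversed_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

print(reversed_in[:, 0, 0].numpy())  # [3. 2.]: the last turn of each dialogue
                                     # now sits at position 0
```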
3 changes: 3 additions & 0 deletions docs/docs/policies.mdx
@@ -240,6 +240,9 @@ However, additional parameters exist that can be adapted.
+---------------------------------------+------------------------+--------------------------------------------------------------+
| number_of_attention_heads | 4 | Number of self-attention heads in transformers. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| unidirectional_encoder | True | Use a unidirectional or bidirectional encoder |
| | | for `text`, `action_text`, and `label_action_text`. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| use_key_relative_attention | False | If 'True' use key relative embeddings in attention. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| use_value_relative_attention | False | If 'True' use value relative embeddings in attention. |
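A usage sketch for this parameter, assuming the usual path where constructor kwargs are merged into the defaults via `_load_params(**kwargs)` (visible in the `ted_policy.py` diff below); the variable name `policy` is illustrative:

```python
from rasa.core.policies.ted_policy import TEDPolicy

# Keep a unidirectional encoder for `text`, `action_text`, and
# `label_action_text`; the override travels through `_load_params(**kwargs)`.
policy = TEDPolicy(unidirectional_encoder=True)
```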
58 changes: 35 additions & 23 deletions rasa/core/policies/ted_policy.py
@@ -190,8 +190,9 @@ class TEDPolicy(Policy):
VALUE_RELATIVE_ATTENTION: False,
# Max position for relative embeddings
MAX_RELATIVE_POSITION: None,
# Use a unidirectional or bidirectional encoder.
UNIDIRECTIONAL_ENCODER: True,
# Use a unidirectional or bidirectional encoder
# for `text`, `action_text`, and `label_action_text`.
UNIDIRECTIONAL_ENCODER: False,
# ## Training parameters
# Initial and final batch sizes:
# Batch size will be linearly increased for each epoch.
@@ -316,11 +317,6 @@ def __init__(
super().__init__(
featurizer, priority, should_finetune=should_finetune, **kwargs
)
if isinstance(featurizer, FullDialogueTrackerFeaturizer):
self.is_full_dialogue_featurizer_used = True
else:
self.is_full_dialogue_featurizer_used = False

self._load_params(**kwargs)

self.model = model
@@ -803,9 +799,9 @@ def load(
model_data_example,
data_signature=model_data_example.get_signature(),
config=meta,
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
use_only_last_dialogue_turns=True,
max_history_featurizer_is_used=isinstance(
featurizer, MaxHistoryTrackerFeaturizer
),
label_data=label_data,
entity_tag_specs=entity_tag_specs,
finetune_mode=should_finetune,
@@ -842,7 +838,7 @@ def __init__(
self,
data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]],
config: Dict[Text, Any],
use_only_last_dialogue_turns: bool,
max_history_featurizer_is_used: bool,
label_data: RasaModelData,
entity_tag_specs: Optional[List[EntityTagSpec]],
) -> None:
@@ -851,13 +847,14 @@ def __init__(
Args:
data_signature: the data signature of the input data
config: the model configuration
use_only_last_dialogue_turns: if 'True' only the last dialogue turn will be used
max_history_featurizer_is_used: if 'True'
only the last dialogue turn will be used
label_data: the label data
entity_tag_specs: the entity tag specifications
"""
super().__init__("TED", config, data_signature, label_data)

self.use_only_last_dialogue_turns = use_only_last_dialogue_turns
self.max_history_featurizer_is_used = max_history_featurizer_is_used

self.predict_data_signature = {
feature_name: features
@@ -925,6 +922,11 @@ def _prepare_layers(self) -> None:
self.config[TRANSFORMER_SIZE][DIALOGUE],
self.config[DROP_RATE_DIALOGUE],
self.config[DROP_RATE_ATTENTION],
# use a bidirectional transformer because we will invert the dialogue
# sequence so that the last turn is located at the first position and
# always has exactly the same positional encoding
unidirectional=not self.max_history_featurizer_is_used,
)

self._prepare_embed_layers(DIALOGUE)
@@ -1062,13 +1064,24 @@ def _embed_dialogue(
dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
mask = self._compute_mask(dialogue_lengths)

if self.max_history_featurizer_is_used:
# invert the dialogue sequence so that the last turn always has
# exactly the same positional encoding
dialogue_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

dialogue_transformed, attention_weights = self._tf_layers[
f"transformer.{DIALOGUE}"
](dialogue_in, 1 - mask, self._training)
dialogue_transformed = tfa.activations.gelu(dialogue_transformed)

if self.use_only_last_dialogue_turns:
# pick last vector if max history featurizer is used
if self.max_history_featurizer_is_used:
# pick the last vector if the max history featurizer is used; since we
# inverted the dialogue sequence, the last vector is actually the first one
dialogue_transformed = dialogue_transformed[:, :1, :]
mask = tf.expand_dims(self._last_token(mask, dialogue_lengths), 1)
elif not self._training:
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
dialogue_transformed = tf.expand_dims(
self._last_token(dialogue_transformed, dialogue_lengths), 1
)
@@ -1184,7 +1197,7 @@ def _encode_fake_features_per_attribute(
def _create_last_dialogue_turns_mask(
tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
) -> tf.Tensor:
# Since use_only_last_dialogue_turns is True,
# Since max_history_featurizer_is_used is True,
# we need to find the locations of last dialogue turns in
# (combined batch dimension and dialogue length,) dimension,
# so that we can use `_sequence_lengths` as a boolean mask to pick
@@ -1234,7 +1247,7 @@ def _encode_real_features_per_attribute(
Args:
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)
(e.g., ACTION_NAME, INTENT)

Returns:
A tensor combining all features for `attribute`
@@ -1277,7 +1290,7 @@ def _encode_real_features_per_attribute(
text_transformer_output = attribute_features
text_sequence_lengths = sequence_lengths

if self.use_only_last_dialogue_turns:
if self.max_history_featurizer_is_used:
# get the location of all last dialogue inputs
last_dialogue_turns_mask = self._create_last_dialogue_turns_mask(
tf_batch_data, attribute
@@ -1334,10 +1347,9 @@ def _convert_to_original_shape(

Args:
attribute_features: the "real" features to convert
attribute_mask: the tensor containing the position of "real" features
in the dialogue, shape is (batch-size x dialogue_len x 1)
dialogue_lengths: the tensor containing the actual dialogue length,
shape is (batch-size,)
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)

Returns:
The converted attribute features
@@ -1483,7 +1495,7 @@ def _reshape_for_entities(
attribute_mask = tf_batch_data[TEXT][MASK][0]
dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)

if self.use_only_last_dialogue_turns:
if self.max_history_featurizer_is_used:
# pick outputs that correspond to the last dialogue turns
attribute_mask = tf.expand_dims(
self._last_token(attribute_mask, dialogue_lengths), axis=1
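To see why `dialogue_transformed[:, :1, :]` picks the last real turn after the inversion: position 0 of the reversed sequence equals a gather at index `length - 1` of the original sequence. A self-contained check in plain TensorFlow (illustrative names, not Rasa code):

```python
import tensorflow as tf

batch_size, max_len, units = 2, 4, 3
dialogue_in = tf.random.normal((batch_size, max_len, units))
dialogue_lengths = tf.constant([4, 2], dtype=tf.int32)

# Reverse only the unpadded prefix of each dialogue.
reversed_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

# Position 0 of the reversed sequence is the last real turn of the original.
first_after_reversal = reversed_in[:, 0, :]
last_real_turn = tf.gather(dialogue_in, dialogue_lengths - 1, batch_dims=1)

tf.debugging.assert_near(first_after_reversal, last_real_turn)
```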
4 changes: 3 additions & 1 deletion rasa/utils/tensorflow/models.py
@@ -755,6 +755,7 @@ def _prepare_transformer_layer(
units: int,
drop_rate: float,
drop_rate_attention: float,
unidirectional: bool,
prefix: Text = "transformer",
):
if num_layers > 0:
@@ -767,7 +768,7 @@ def _prepare_transformer_layer(
dropout_rate=drop_rate,
attention_dropout_rate=drop_rate_attention,
sparsity=self.config[WEIGHT_SPARSITY],
unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
unidirectional=unidirectional,
use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
max_relative_position=self.config[MAX_RELATIVE_POSITION],
@@ -872,6 +873,7 @@ def _prepare_sequence_layers(self, name: Text) -> None:
size,
self.config[DROP_RATE],
self.config[DROP_RATE_ATTENTION],
self.config[UNIDIRECTIONAL_ENCODER],
)

def _prepare_entity_recognition_layers(self) -> None:
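For context on the `unidirectional` argument that is now passed explicitly: a unidirectional (causal) encoder restricts attention so each position only sees itself and earlier positions, while a bidirectional encoder attends over the full sequence. A generic illustration of the two attention masks (not Rasa's transformer implementation):

```python
import tensorflow as tf

seq_len = 4

# Bidirectional: every position may attend to every other position.
bidirectional_mask = tf.ones((seq_len, seq_len))

# Unidirectional (causal): position i may only attend to positions <= i.
causal_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)

print(causal_mask.numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
```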