invert dialogue time in ted #7867

Merged · 14 commits · Feb 5, 2021
3 changes: 3 additions & 0 deletions changelog/7867.improvement.md
@@ -0,0 +1,3 @@
If `MaxHistoryTrackerFeaturizer` is used, invert the dialogue sequence before passing
it to the transformer so that the last dialogue input becomes the first one and
therefore always has the same positional encoding.
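A minimal sketch of what this inversion means, using toy tensors rather than TED's real feature pipeline (shapes and values here are illustrative assumptions):

```python
import tensorflow as tf

# Toy batch: 2 dialogues padded to length 4, one feature per turn.
# Dialogue 0 has 3 real turns, dialogue 1 has 4.
dialogue_in = tf.constant(
    [[[1.0], [2.0], [3.0], [0.0]],  # turns t1, t2, t3, <pad>
     [[1.0], [2.0], [3.0], [4.0]]]  # turns t1, t2, t3, t4
)
dialogue_lengths = tf.constant([3, 4])

# Reverse only the real turns of each dialogue; padding stays in place.
inverted = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)
print(inverted[..., 0].numpy())
# [[3. 2. 1. 0.]
#  [4. 3. 2. 1.]]
# The most recent turn now sits at index 0 for every dialogue, so it always
# receives the same positional encoding regardless of dialogue length.
```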
35 changes: 25 additions & 10 deletions rasa/core/policies/ted_policy.py
@@ -191,7 +191,7 @@ class TEDPolicy(Policy):
# Max position for relative embeddings
MAX_RELATIVE_POSITION: None,
# Use a unidirectional or bidirectional encoder.
UNIDIRECTIONAL_ENCODER: True,
UNIDIRECTIONAL_ENCODER: False,
# ## Training parameters
# Initial and final batch sizes:
# Batch size will be linearly increased for each epoch.
@@ -803,9 +803,9 @@ def load(
model_data_example,
data_signature=model_data_example.get_signature(),
config=meta,
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
use_only_last_dialogue_turns=True,
use_only_last_dialogue_turns=isinstance(
featurizer, MaxHistoryTrackerFeaturizer
),
label_data=label_data,
entity_tag_specs=entity_tag_specs,
finetune_mode=should_finetune,
@@ -925,6 +925,11 @@ def _prepare_layers(self) -> None:
self.config[TRANSFORMER_SIZE][DIALOGUE],
self.config[DROP_RATE_DIALOGUE],
self.config[DROP_RATE_ATTENTION],
# use a bidirectional transformer, because
# we will invert the dialogue sequence so that the last turn is located
# at the first position and always has
# exactly the same positional encoding
unidirectional=not self.use_only_last_dialogue_turns,
)

self._prepare_embed_layers(DIALOGUE)
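To see why the dialogue transformer has to be bidirectional once the sequence is inverted, consider a standard causal mask (a small illustrative snippet, not code from this PR):

```python
import tensorflow as tf

# With the dialogue inverted, the most recent turn sits at index 0.
# A causal (unidirectional) mask only lets position i attend to positions <= i,
# so the latest turn at position 0 would see nothing but itself.
seq_len = 4
causal_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
print(causal_mask.numpy()[0])  # [1. 0. 0. 0.] -> no access to earlier turns
# A bidirectional encoder drops this restriction, so the (now first) latest
# turn can still attend to the whole dialogue history; hence
# `unidirectional=not self.use_only_last_dialogue_turns` above.
```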
@@ -1062,13 +1067,24 @@ def _embed_dialogue(
dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
mask = self._compute_mask(dialogue_lengths)

if self.use_only_last_dialogue_turns:
# invert the dialogue sequence so that the last turn always has
# exactly the same positional encoding
dialogue_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

dialogue_transformed, attention_weights = self._tf_layers[
f"transformer.{DIALOGUE}"
](dialogue_in, 1 - mask, self._training)
dialogue_transformed = tfa.activations.gelu(dialogue_transformed)

if self.use_only_last_dialogue_turns:
# pick last vector if max history featurizer is used
# pick the last vector if the max history featurizer is used; since we
# inverted the dialogue sequence, the last vector is actually the first one
dialogue_transformed = dialogue_transformed[:, :1, :]
mask = tf.expand_dims(self._last_token(mask, dialogue_lengths), 1)
elif not self._training:
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
dialogue_transformed = tf.expand_dims(
self._last_token(dialogue_transformed, dialogue_lengths), 1
)
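Putting the pieces of `_embed_dialogue` together, here is a simplified, hypothetical sketch of the flow (the `transformer` callable and the mask handling stand in for TED's real layers, so this is an approximation rather than the PR's literal implementation):

```python
import tensorflow as tf
import tensorflow_addons as tfa

def embed_dialogue_sketch(dialogue_in, dialogue_lengths, transformer,
                          use_only_last_dialogue_turns, training):
    # 1 for real turns, 0 for padding; shape (batch_size, max_dialogue_len)
    mask = tf.sequence_mask(
        dialogue_lengths, maxlen=tf.shape(dialogue_in)[1], dtype=tf.float32
    )

    if use_only_last_dialogue_turns:
        # invert so that the most recent turn is at position 0 and therefore
        # always gets the same positional encoding
        dialogue_in = tf.reverse_sequence(
            dialogue_in, dialogue_lengths, seq_axis=1
        )

    transformed, attention_weights = transformer(dialogue_in, 1 - mask, training)
    transformed = tfa.activations.gelu(transformed)

    if use_only_last_dialogue_turns:
        # after inversion the "last" turn is the first output vector
        transformed = transformed[:, :1, :]
    elif not training:
        # keep only the final real turn at prediction time to save computation
        last = tf.gather(transformed, dialogue_lengths - 1, batch_dims=1)
        transformed = tf.expand_dims(last, 1)

    return transformed, mask, attention_weights
```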
@@ -1234,7 +1250,7 @@ def _encode_real_features_per_attribute(
Args:
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)
(e.g., ACTION_NAME, INTENT)

Returns:
A tensor combining all features for `attribute`
@@ -1334,10 +1350,9 @@ def _convert_to_original_shape(

Args:
attribute_features: the "real" features to convert
attribute_mask: the tensor containing the position of "real" features
in the dialogue, shape is (batch-size x dialogue_len x 1)
dialogue_lengths: the tensor containing the actual dialogue length,
shape is (batch-size,)
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)

Returns:
The converted attribute features
4 changes: 3 additions & 1 deletion rasa/utils/tensorflow/models.py
@@ -755,6 +755,7 @@ def _prepare_transformer_layer(
units: int,
drop_rate: float,
drop_rate_attention: float,
unidirectional: bool,
prefix: Text = "transformer",
):
if num_layers > 0:
@@ -767,7 +768,7 @@
dropout_rate=drop_rate,
attention_dropout_rate=drop_rate_attention,
sparsity=self.config[WEIGHT_SPARSITY],
unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
unidirectional=unidirectional,
use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
max_relative_position=self.config[MAX_RELATIVE_POSITION],
@@ -872,6 +873,7 @@ def _prepare_sequence_layers(self, name: Text) -> None:
size,
self.config[DROP_RATE],
self.config[DROP_RATE_ATTENTION],
self.config[UNIDIRECTIONAL_ENCODER],
)

def _prepare_entity_recognition_layers(self) -> None:
Expand Down