Merge changes to TED (#7867).
samsucik committed Feb 8, 2021
2 parents: 7f747c7 + 3338ab3; commit: 4cdcbda
Showing 5 changed files with 45 additions and 24 deletions.
2 changes: 2 additions & 0 deletions changelog/7867.bugfix.md
@@ -0,0 +1,2 @@
Fix the role of `unidirectional_encoder` in TED. This parameter is only applied to
transformers for `text`, `action_text` and `label_action_text`.
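
To make the scope of the fix concrete, here is a small illustrative sketch (plain Python, not rasa's implementation) of which TED transformers are meant to honour `unidirectional_encoder` after this change; the dialogue-level behaviour mirrors the `not self.max_history_featurizer_is_used` logic visible in the `ted_policy.py` diff below.

```python
# Illustrative only: directionality per TED transformer after this fix.
def encoder_is_unidirectional(
    attribute: str,
    unidirectional_encoder: bool,
    max_history_featurizer_is_used: bool,
) -> bool:
    if attribute in ("text", "action_text", "label_action_text"):
        # sequence-level transformers: controlled by the config parameter
        return unidirectional_encoder
    if attribute == "dialogue":
        # dialogue-level transformer: bidirectional when the dialogue sequence
        # is inverted for MaxHistoryTrackerFeaturizer, unidirectional otherwise
        return not max_history_featurizer_is_used
    raise ValueError(f"Unknown attribute: {attribute}")


assert encoder_is_unidirectional("text", True, True) is True
assert encoder_is_unidirectional("dialogue", True, True) is False
```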
3 changes: 3 additions & 0 deletions changelog/7867.improvement.md
@@ -0,0 +1,3 @@
If `MaxHistoryTrackerFeaturizer` is used, invert the dialogue sequence before passing
it to the transformer so that the last dialogue input becomes the first one and
therefore always has the same positional encoding.
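
A minimal sketch of the motivation (NumPy only, toy data): without inversion the last real turn lands at a length-dependent position, while after inversion it always sits at position 0 and therefore receives the same positional encoding.

```python
import numpy as np

# Two dialogues of different lengths, padded to the same max length.
# Values are just turn indices; -1 marks padding.
dialogues = np.array([[0, 1, 2, 3, 4],
                      [0, 1, 2, -1, -1]])
lengths = np.array([5, 3])

# Without inversion, the last real turn sits at position 4 in the first
# dialogue and at position 2 in the second, so its positional encoding
# depends on the dialogue length.
last_positions = lengths - 1          # -> [4, 2]

# After inversion (what tf.reverse_sequence does along the time axis),
# the last real turn always sits at position 0.
inverted = np.stack([
    np.concatenate([d[:n][::-1], d[n:]]) for d, n in zip(dialogues, lengths)
])
print(inverted)
# [[ 4  3  2  1  0]
#  [ 2  1  0 -1 -1]]
```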
3 changes: 3 additions & 0 deletions docs/docs/policies.mdx
@@ -240,6 +240,9 @@ However, additional parameters exist that can be adapted.
+---------------------------------------+------------------------+--------------------------------------------------------------+
| number_of_attention_heads | 4 | Number of self-attention heads in transformers. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| unidirectional_encoder | True | Use a unidirectional or bidirectional encoder |
| | | for `text`, `action_text`, and `label_action_text`. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| use_key_relative_attention | False | If 'True' use key relative embeddings in attention. |
+---------------------------------------+------------------------+--------------------------------------------------------------+
| use_value_relative_attention | False | If 'True' use value relative embeddings in attention. |
58 changes: 35 additions & 23 deletions rasa/core/policies/ted_policy.py
@@ -191,8 +191,9 @@ class TEDPolicy(Policy):
VALUE_RELATIVE_ATTENTION: False,
# Max position for relative embeddings
MAX_RELATIVE_POSITION: None,
# Use a unidirectional or bidirectional encoder.
UNIDIRECTIONAL_ENCODER: True,
# Use a unidirectional or bidirectional encoder
# for `text`, `action_text`, and `label_action_text`.
UNIDIRECTIONAL_ENCODER: False,
# ## Training parameters
# Initial and final batch sizes:
# Batch size will be linearly increased for each epoch.
@@ -317,11 +318,6 @@ def __init__(
super().__init__(
featurizer, priority, should_finetune=should_finetune, **kwargs
)
if isinstance(featurizer, FullDialogueTrackerFeaturizer):
self.is_full_dialogue_featurizer_used = True
else:
self.is_full_dialogue_featurizer_used = False

self._load_params(**kwargs)

self.model = model
@@ -801,9 +797,9 @@ def load(
model_data_example,
data_signature=model_data_example.get_signature(),
config=meta,
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
use_only_last_dialogue_turns=True,
max_history_featurizer_is_used=isinstance(
featurizer, MaxHistoryTrackerFeaturizer
),
label_data=label_data,
entity_tag_specs=entity_tag_specs,
finetune_mode=should_finetune,
@@ -840,7 +836,7 @@ def __init__(
self,
data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]],
config: Dict[Text, Any],
use_only_last_dialogue_turns: bool,
max_history_featurizer_is_used: bool,
label_data: RasaModelData,
entity_tag_specs: Optional[List[EntityTagSpec]],
) -> None:
@@ -849,13 +845,14 @@ def __init__(
Args:
data_signature: the data signature of the input data
config: the model configuration
use_only_last_dialogue_turns: if 'True' only the last dialogue turn will be used
max_history_featurizer_is_used: if 'True'
only the last dialogue turn will be used
label_data: the label data
entity_tag_specs: the entity tag specifications
"""
super().__init__("TED", config, data_signature, label_data)

self.use_only_last_dialogue_turns = use_only_last_dialogue_turns
self.max_history_featurizer_is_used = max_history_featurizer_is_used

self.predict_data_signature = {
feature_name: features
@@ -919,6 +916,11 @@ def _prepare_layers(self) -> None:
self.config[TRANSFORMER_SIZE][DIALOGUE],
self.config[DROP_RATE_DIALOGUE],
self.config[DROP_RATE_ATTENTION],
# use a bidirectional transformer, because we will invert the dialogue
# sequence so that the last turn is located at the first position and
# always has exactly the same positional encoding
unidirectional=not self.max_history_featurizer_is_used,
)

self._prepare_embed_layers(DIALOGUE)
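
The comment above is the key design point: under a unidirectional (causal) mask a position can attend only to itself and earlier positions, so the inverted last turn at position 0 would see no history at all. A toy sketch of the two attention patterns (NumPy, purely illustrative):

```python
import numpy as np

seq_len = 4  # dialogue turns, with the (inverted) last turn at position 0

# Causal ("unidirectional") pattern: row i may attend only to columns <= i,
# so position 0 attends to itself alone and would ignore the dialogue history.
causal = np.tril(np.ones((seq_len, seq_len), dtype=int))
print(causal[0])         # [1 0 0 0]

# Bidirectional pattern: every position attends to every turn, so the turn at
# position 0 still aggregates the whole (inverted) dialogue.
bidirectional = np.ones((seq_len, seq_len), dtype=int)
print(bidirectional[0])  # [1 1 1 1]
```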
@@ -1048,13 +1050,24 @@ def _embed_dialogue(
dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)
mask = self._compute_mask(dialogue_lengths)

if self.max_history_featurizer_is_used:
# invert dialogue sequence so that the last turn would always have
# exactly the same positional encoding
dialogue_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)

dialogue_transformed, attention_weights = self._tf_layers[
f"transformer.{DIALOGUE}"
](dialogue_in, 1 - mask, self._training)
dialogue_transformed = tfa.activations.gelu(dialogue_transformed)

if self.use_only_last_dialogue_turns:
# pick last vector if max history featurizer is used
if self.max_history_featurizer_is_used:
# pick the last vector if the max history featurizer is used; since we
# inverted the dialogue sequence, the last vector is actually the first one
dialogue_transformed = dialogue_transformed[:, :1, :]
mask = tf.expand_dims(self._last_token(mask, dialogue_lengths), 1)
elif not self._training:
# during prediction we don't care about previous dialogue turns,
# so to save computation time, use only the last one
dialogue_transformed = tf.expand_dims(
self._last_token(dialogue_transformed, dialogue_lengths), 1
)
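
A runnable toy example (assumed shapes, not rasa's batch format) of the two new steps in `_embed_dialogue`: `tf.reverse_sequence` flips only the real turns of each dialogue, and the `[:, :1, :]` slice then keeps position 0, i.e. the last real turn, for every dialogue regardless of its length.

```python
import tensorflow as tf

# Batch of 2 dialogues, max length 4, feature size 1 (values = turn index).
dialogue_in = tf.constant([[[1.0], [2.0], [3.0], [4.0]],
                           [[1.0], [2.0], [0.0], [0.0]]])
dialogue_lengths = tf.constant([4, 2])

# Same call as in `_embed_dialogue`: reverse only the first `length` turns.
reversed_in = tf.reverse_sequence(dialogue_in, dialogue_lengths, seq_axis=1)
print(tf.squeeze(reversed_in, -1).numpy())
# [[4. 3. 2. 1.]
#  [2. 1. 0. 0.]]

# In the real code the slice is applied to the transformer output; applying it
# to the toy input shows which turn survives: always the last real one.
last_turn = reversed_in[:, :1, :]
print(tf.squeeze(last_turn, -1).numpy())
# [[4.]
#  [2.]]
```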
@@ -1135,7 +1148,7 @@ def _encode_fake_features_per_attribute(
def _create_last_dialogue_turns_mask(
tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text
) -> tf.Tensor:
# Since use_only_last_dialogue_turns is True,
# Since max_history_featurizer_is_used is True,
# we need to find the locations of last dialogue turns in
# (combined batch dimension and dialogue length,) dimension,
# so that we can use `_sequence_lengths` as a boolean mask to pick
@@ -1185,7 +1198,7 @@ def _encode_real_features_per_attribute(
Args:
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)
(e.g., ACTION_NAME, INTENT)
Returns:
A tensor combining all features for `attribute`
@@ -1225,7 +1238,7 @@ def _encode_real_features_per_attribute(
text_output = attribute_features
text_sequence_lengths = combined_sentence_sequence_feature_lengths

if self.use_only_last_dialogue_turns:
if self.max_history_featurizer_is_used:
# get the location of all last dialogue inputs
last_dialogue_turns_mask = self._create_last_dialogue_turns_mask(
tf_batch_data, attribute
@@ -1283,10 +1296,9 @@ def _convert_to_original_shape(
Args:
attribute_features: the "real" features to convert
attribute_mask: the tensor containing the position of "real" features
in the dialogue, shape is (batch-size x dialogue_len x 1)
dialogue_lengths: the tensor containing the actual dialogue length,
shape is (batch-size,)
tf_batch_data: dictionary mapping every attribute to its features and masks
attribute: the attribute we will encode features for
(e.g., ACTION_NAME, INTENT)
Returns:
The converted attribute features
@@ -1432,7 +1444,7 @@ def _reshape_for_entities(
attribute_mask = tf_batch_data[TEXT][MASK][0]
dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32)

if self.use_only_last_dialogue_turns:
if self.max_history_featurizer_is_used:
# pick outputs that correspond to the last dialogue turns
attribute_mask = tf.expand_dims(
self._last_token(attribute_mask, dialogue_lengths), axis=1
3 changes: 2 additions & 1 deletion rasa/utils/tensorflow/models.py
@@ -755,6 +755,7 @@ def _prepare_transformer_layer(
units: int,
drop_rate: float,
drop_rate_attention: float,
unidirectional: bool,
prefix: Text = "transformer",
):
if num_layers > 0:
@@ -767,7 +768,7 @@
dropout_rate=drop_rate,
attention_dropout_rate=drop_rate_attention,
sparsity=self.config[WEIGHT_SPARSITY],
unidirectional=self.config[UNIDIRECTIONAL_ENCODER],
unidirectional=unidirectional,
use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION],
use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION],
max_relative_position=self.config[MAX_RELATIVE_POSITION],
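
Since `unidirectional` is now an explicit argument rather than a value read from the shared config, different transformer layers in the same model can use different directionalities. A hedged sketch of what the flag amounts to at the attention level, using stock Keras layers rather than rasa's own `TransformerEncoder`:

```python
import tensorflow as tf

mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=8)
x = tf.random.normal((1, 5, 8))  # (batch, sequence length, features)

# Unidirectional: a lower-triangular mask restricts attention to past positions.
causal_mask = tf.cast(tf.linalg.band_part(tf.ones((5, 5)), -1, 0), tf.bool)[tf.newaxis]
unidirectional_out = mha(x, x, attention_mask=causal_mask)

# Bidirectional: no mask, every position attends to the whole sequence.
bidirectional_out = mha(x, x)

print(unidirectional_out.shape, bidirectional_out.shape)  # (1, 5, 8) (1, 5, 8)
```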
