Monster ted (#7262)

* add diet to ted * reshape 4d tensors into 3d and back * fix shapes in non eager mode * make shape indices more general * fix add_length * add todo * sentence features are now also 4D * sequence length is 4D * convert 4d to 3 during padding * mask is 4d now * bring mask in correct shape before transformer * keep also the orginial dialogue length * update doc strings * use tf.scatter_nd to tranform 3d back to 4d * move tensor transformation to _encode_features_per_attribute * fix issues in _encode_features_per_attribute * use correct dialogue length * add comments * clean up * update constants * review comment * keep entity dict * create tag_ids for TED * clean up after merge * add batch_loss_entities (not working) * concatenate text and dialogue transformer output * get last dialogue before CRF * add predicting entities * clean up * differentiate between max history tracker featurizer used or not * add todo * add comments * use correct tag id mapping * check if text exists * fix frozenset issues * ignore actual entity value in MemoizationPolicy * fix import * fix some tests * update after merge * use python if instead of tf.cond * we need to return a tensor in tf.cond instead of None * create entity tags for all texts * update batch loss entities (not yet working) * input to entity loss * update entity prediction * fix randomness and shapes * fix ffnn encoding layer name * add todo * Update rasa/core/policies/ted_policy.py Co-authored-by: Tanja <tabergma@gmail.com> * Update rasa/core/featurizers/single_state_featurizer.py Co-authored-by: Tanja <tabergma@gmail.com> * rename to entity_tag_id_mapping * add comment to last dial mask * add comments to tf.cond * add docstrings * refactor number of dims check * rename zero features to fake features * pre compute dialogue_indices * create helper methods * calculate number of units for text_transformer_output * add todo * fix tests * use indices constant Co-authored-by: Tanja Bergmann <tabergma@gmail.com>
RasaHQ · Nov 13, 2020 · e0bec49 · e0bec49
1 parent fe9b0e7
commit e0bec49
Show file tree

Hide file tree

Showing 21 changed files with 953 additions and 292 deletions.
diff --git a/examples/e2ebot/data/stories.yml b/examples/e2ebot/data/stories.yml
@@ -10,7 +10,7 @@ stories:
 
 - story: sad path (text to text)
   steps:
-  - user: "Hello"
+  - user: "[Hello](bla)"
   - bot: "Welcome to moodbot. How are you feeling today?"
   - user: "Horrible"
   - bot: "Oh no! Here is a kitten photo. Did it help?"

diff --git a/examples/e2ebot/domain.yml b/examples/e2ebot/domain.yml
@@ -1,5 +1,5 @@
 version: "2.0"
-
+# TODO create a bot that makes sense
 actions:
  - utter_greet
  - utter_happy
@@ -9,3 +9,6 @@ actions:
 intents:
  - greet
  - mood_great
+
+entities:
+ - bla
diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py
@@ -1,10 +1,11 @@
 import logging
 import numpy as np
 import scipy.sparse
-from typing import List, Optional, Dict, Text, Set
+from typing import List, Optional, Dict, Text, Set, Any
 from collections import defaultdict
 
 import rasa.shared.utils.io
+from rasa.nlu.constants import TOKENS_NAMES
 from rasa.shared.core.domain import SubState, State, Domain
 from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter
 from rasa.shared.core.constants import PREVIOUS_ACTION, ACTIVE_LOOP, USER, SLOTS
@@ -16,9 +17,15 @@
     ACTION_TEXT,
     ACTION_NAME,
     INTENT,
+    TEXT,
+    NO_ENTITY_TAG,
+    ENTITY_ATTRIBUTE_TYPE,
+    ENTITY_TAGS,
 )
 from rasa.shared.nlu.training_data.features import Features
 from rasa.shared.nlu.training_data.message import Message
+from rasa.utils.tensorflow.model_data_utils import TAG_ID_ORIGIN
+from rasa.utils.tensorflow.constants import IDS
 
 logger = logging.getLogger(__name__)
 
@@ -36,6 +43,23 @@ class SingleStateFeaturizer:
     def __init__(self) -> None:
         self._default_feature_states = {}
         self.action_texts = []
+        self.entity_tag_id_mapping = {}
+
+    def get_entity_tag_ids(self) -> Dict[Text, int]:
+        """Returns the tag to index mapping for entities.
+
+        Returns:
+            Tag to index mapping.
+        """
+        if ENTITIES not in self._default_feature_states:
+            return {}
+
+        tag_ids = {
+            tag: idx + 1  # +1 to keep 0 for the NO_ENTITY_TAG
+            for tag, idx in self._default_feature_states[ENTITIES].items()
+        }
+        tag_ids[NO_ENTITY_TAG] = 0
+        return tag_ids
 
     def prepare_from_domain(self, domain: Domain) -> None:
         """Gets necessary information for featurization from domain.
@@ -55,6 +79,7 @@ def convert_to_dict(feature_states: List[Text]) -> Dict[Text, int]:
         self._default_feature_states[SLOTS] = convert_to_dict(domain.slot_states)
         self._default_feature_states[ACTIVE_LOOP] = convert_to_dict(domain.form_names)
         self.action_texts = domain.action_texts
+        self.entity_tag_id_mapping = self.get_entity_tag_ids()
 
     def _state_features_for_attribute(
         self, sub_state: SubState, attribute: Text
@@ -84,7 +109,7 @@ def _create_features(
 
         features = np.zeros(len(self._default_feature_states[attribute]), np.float32)
         for state_feature, value in state_features.items():
-            # check that the value is in default_feature_states to be able to assigh
+            # check that the value is in default_feature_states to be able to assign
             # its value
             if state_feature in self._default_feature_states[attribute]:
                 features[self._default_feature_states[attribute][state_feature]] = value
@@ -215,6 +240,51 @@ def encode_state(
 
         return state_features
 
+    def encode_entities(
+        self, entity_data: Dict[Text, Any], interpreter: NaturalLanguageInterpreter
+    ) -> Dict[Text, List["Features"]]:
+        """Encode the given entity data with the help of the given interpreter.
+
+        Produce numeric entity tags for tokens.
+
+        Args:
+            entity_data: The dict containing the text and entity labels and locations
+            interpreter: The interpreter used to encode the state
+
+        Returns:
+            A dictionary of entity type to list of features.
+        """
+        from rasa.nlu.test import determine_token_labels
+
+        # TODO
+        #  The entity states used to create the tag-idx-mapping contain the
+        #  entities and the concatenated entity and roles/groups. We do not
+        #  distinguish between entities and roles/groups right now.
+        # TODO
+        #  Should we support BILOU tagging?
+
+        if TEXT not in entity_data or len(self.entity_tag_id_mapping) < 2:
+            # we cannot build a classifier if there are fewer than 2 classes
+            return {}
+
+        parsed_text = interpreter.featurize_message(Message({TEXT: entity_data[TEXT]}))
+        entities = entity_data.get(ENTITIES, [])
+
+        _tags = []
+        for token in parsed_text.get(TOKENS_NAMES[TEXT]):
+            _tag = determine_token_labels(
+                token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE
+            )
+            # TODO handle if tag is not in mapping
+            _tags.append(self.entity_tag_id_mapping[_tag])
+
+        # transpose to have seq_len x 1
+        return {
+            ENTITY_TAGS: [
+                Features(np.array([_tags]).T, IDS, ENTITY_TAGS, TAG_ID_ORIGIN,)
+            ]
+        }
+
     def _encode_action(
         self, action: Text, interpreter: NaturalLanguageInterpreter
     ) -> Dict[Text, List["Features"]]:

diff --git a/rasa/core/featurizers/tracker_featurizers.py b/rasa/core/featurizers/tracker_featurizers.py
@@ -3,15 +3,15 @@
 import jsonpickle
 import logging
 
-from rasa.shared.nlu.constants import TEXT, INTENT
+from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES
 from rasa.shared.exceptions import RasaException
 from tqdm import tqdm
-from typing import Tuple, List, Optional, Dict, Text, Union
+from typing import Tuple, List, Optional, Dict, Text, Union, Any
 import numpy as np
 
 from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
 from rasa.shared.core.domain import State, Domain
-from rasa.shared.core.events import ActionExecuted
+from rasa.shared.core.events import ActionExecuted, UserUttered
 from rasa.shared.core.trackers import (
     DialogueStateTracker,
     is_prev_action_listen_in_state,
@@ -91,6 +91,43 @@ def _convert_labels_to_ids(
             ]
         )
 
+    def _create_entity_tags(
+        self,
+        trackers_as_entities: List[List[Dict[Text, Any]]],
+        interpreter: NaturalLanguageInterpreter,
+    ) -> List[List[Dict[Text, List["Features"]]]]:
+        return [
+            [
+                self.state_featurizer.encode_entities(entity_data, interpreter)
+                for entity_data in trackers_entities
+            ]
+            for trackers_entities in trackers_as_entities
+        ]
+
+    @staticmethod
+    def _entity_data(event: UserUttered) -> Dict[Text, Any]:
+        if event.text:
+            return {TEXT: event.text, ENTITIES: event.entities}
+
+        # input is not textual, so add empty dict
+        return {}
+
+    def training_states_actions_and_entities(
+        self, trackers: List[DialogueStateTracker], domain: Domain
+    ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
+        """Transforms list of trackers to lists of states, actions and entity data.
+
+        Args:
+            trackers: The trackers to transform
+            domain: The domain
+
+        Returns:
+            A tuple of list of states, list of actions and list of entity data.
+        """
+        raise NotImplementedError(
+            "Featurizer must have the capacity to encode trackers to feature vectors"
+        )
+
     def training_states_and_actions(
         self, trackers: List[DialogueStateTracker], domain: Domain
     ) -> Tuple[List[List[State]], List[List[Text]]]:
@@ -103,16 +140,23 @@ def training_states_and_actions(
         Returns:
             A tuple of list of states and list of actions.
         """
-        raise NotImplementedError(
-            "Featurizer must have the capacity to encode trackers to feature vectors"
-        )
+        (
+            trackers_as_states,
+            trackers_as_actions,
+            _,
+        ) = self.training_states_actions_and_entities(trackers, domain)
+        return trackers_as_states, trackers_as_actions
 
     def featurize_trackers(
         self,
         trackers: List[DialogueStateTracker],
         domain: Domain,
         interpreter: NaturalLanguageInterpreter,
-    ) -> Tuple[List[List[Dict[Text, List["Features"]]]], np.ndarray]:
+    ) -> Tuple[
+        List[List[Dict[Text, List["Features"]]]],
+        np.ndarray,
+        List[List[Dict[Text, List["Features"]]]],
+    ]:
         """Featurize the training trackers.
 
         Args:
@@ -137,14 +181,17 @@ def featurize_trackers(
 
         self.state_featurizer.prepare_from_domain(domain)
 
-        trackers_as_states, trackers_as_actions = self.training_states_and_actions(
-            trackers, domain
-        )
+        (
+            trackers_as_states,
+            trackers_as_actions,
+            trackers_as_entities,
+        ) = self.training_states_actions_and_entities(trackers, domain)
 
         tracker_state_features = self._featurize_states(trackers_as_states, interpreter)
         label_ids = self._convert_labels_to_ids(trackers_as_actions, domain)
+        entity_tags = self._create_entity_tags(trackers_as_entities, interpreter)
 
-        return tracker_state_features, label_ids
+        return tracker_state_features, label_ids, entity_tags
 
     @staticmethod
     def _choose_last_user_input(
@@ -252,23 +299,22 @@ class FullDialogueTrackerFeaturizer(TrackerFeaturizer):
     Training data is padded up to the length of the longest dialogue with -1.
     """
 
-    def training_states_and_actions(
+    def training_states_actions_and_entities(
         self, trackers: List[DialogueStateTracker], domain: Domain
-    ) -> Tuple[List[List[State]], List[List[Text]]]:
-        """Transforms list of trackers to lists of states and actions.
-
-        Training data is padded up to the length of the longest dialogue with -1.
+    ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
+        """Transforms list of trackers to lists of states, actions and entity data.
 
         Args:
             trackers: The trackers to transform
             domain: The domain
 
         Returns:
-            A tuple of list of states and list of actions.
+            A tuple of list of states, list of actions and list of entity data.
         """
 
         trackers_as_states = []
         trackers_as_actions = []
+        trackers_as_entities = []
 
         logger.debug(
             "Creating states and action examples from "
@@ -285,14 +331,20 @@ def training_states_and_actions(
 
             delete_first_state = False
             actions = []
+            entities = []
+            entity_data = {}
             for event in tracker.applied_events():
+                if isinstance(event, UserUttered):
+                    entity_data = self._entity_data(event)
+
                 if not isinstance(event, ActionExecuted):
                     continue
 
                 if not event.unpredictable:
                     # only actions which can be
                     # predicted at a stories start
                     actions.append(event.action_name or event.action_text)
+                    entities.append(entity_data)
                 else:
                     # unpredictable actions can be
                     # only the first in the story
@@ -303,13 +355,17 @@ def training_states_and_actions(
                         )
                     delete_first_state = True
 
+                # reset entity_data for the next turn
+                entity_data = {}
+
             if delete_first_state:
                 states = states[1:]
 
             trackers_as_states.append(states[:-1])
             trackers_as_actions.append(actions)
+            trackers_as_entities.append(entities)
 
-        return trackers_as_states, trackers_as_actions
+        return trackers_as_states, trackers_as_actions, trackers_as_entities
 
     def prediction_states(
         self,
@@ -386,23 +442,22 @@ def _hash_example(
         frozen_actions = (action,)
         return hash((frozen_states, frozen_actions))
 
-    def training_states_and_actions(
+    def training_states_actions_and_entities(
         self, trackers: List[DialogueStateTracker], domain: Domain
-    ) -> Tuple[List[List[State]], List[List[Text]]]:
-        """Transforms list of trackers to lists of states and actions.
-
-        Training data is padded up to the length of the longest dialogue with -1.
+    ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]:
+        """Transforms list of trackers to lists of states, actions and entity data.
 
         Args:
             trackers: The trackers to transform
             domain: The domain
 
         Returns:
-            A tuple of list of states and list of actions.
+            A tuple of list of states, list of actions and list of entity data.
         """
 
         trackers_as_states = []
         trackers_as_actions = []
+        trackers_as_entities = []
 
         # from multiple states that create equal featurizations
         # we only need to keep one.
@@ -422,7 +477,11 @@ def training_states_and_actions(
             states = self._create_states(tracker, domain)
 
             states_length_for_action = 0
+            entity_data = {}
             for event in tracker.applied_events():
+                if isinstance(event, UserUttered):
+                    entity_data = self._entity_data(event)
+
                 if not isinstance(event, ActionExecuted):
                     continue
 
@@ -448,15 +507,19 @@ def training_states_and_actions(
                         trackers_as_actions.append(
                             [event.action_name or event.action_text]
                         )
+                        trackers_as_entities.append([entity_data])
                 else:
                     trackers_as_states.append(sliced_states)
                     trackers_as_actions.append([event.action_name or event.action_text])
+                    trackers_as_entities.append([entity_data])
 
+                # reset entity_data for the next turn
+                entity_data = {}
                 pbar.set_postfix({"# actions": "{:d}".format(len(trackers_as_actions))})
 
         logger.debug("Created {} action examples.".format(len(trackers_as_actions)))
 
-        return trackers_as_states, trackers_as_actions
+        return trackers_as_states, trackers_as_actions, trackers_as_entities
 
     def prediction_states(
         self,