RasaHQ · dakshvar22 · Dec 15, 2020 · Dec 1, 2020 · Dec 1, 2020 · Dec 1, 2020
diff --git a/changelog/7423.improvement.md b/changelog/7423.improvement.md
@@ -0,0 +1 @@
+Use response selector keys (sub-intents) instead of full response texts as labels for the response selection confusion matrix and report during NLU evaluation, to improve readability.
diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
@@ -31,6 +31,7 @@
     ENTITY_ATTRIBUTE_TYPE,
     ENTITY_ATTRIBUTE_GROUP,
     ENTITY_ATTRIBUTE_ROLE,
+    RESPONSE_KEY_ATTRIBUTE,
 )
 from rasa.model import get_model
 from rasa.nlu import config, training_data, utils
@@ -62,7 +63,7 @@
 
 ResponseSelectionEvaluationResult = namedtuple(
     "ResponseSelectionEvaluationResult",
-    "intent_target " "response_target " "response_prediction " "message " "confidence",
+    "intent_target response_key response_target response_prediction_full_intent response_prediction message confidence",
 )
 
 EntityEvaluationResult = namedtuple(
@@ -373,7 +374,7 @@ def evaluate_response_selections(
     )
 
     target_responses, predicted_responses = _targets_predictions_from(
-        response_selection_results, "response_target", "response_prediction"
+        response_selection_results, "response_key", "response_prediction_full_intent"
     )
 
     if report_folder:
@@ -1050,12 +1051,24 @@ def get_eval_data(
                 response_prediction_key, {}
             ).get(OPEN_UTTERANCE_PREDICTION_KEY, {})
 
+            response_prediction_full_intent = selector_properties.get(
+                response_prediction_key, {}
+            ).get("full_retrieval_intent", {})
+
+            if isinstance(response_prediction_full_intent, str):
+                response_prediction_full_intent = response_prediction_full_intent.split(
+                    "/"
+                )[1]
+
             response_target = example.get("response", "")
+            response_key = example.get(RESPONSE_KEY_ATTRIBUTE, "")
 
             response_selection_results.append(
                 ResponseSelectionEvaluationResult(
                     intent_target,
+                    response_key,
                     response_target,
+                    response_prediction_full_intent,
                     response_prediction.get("name"),
                     result.get("text", {}),
                     response_prediction.get("confidence"),

diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py
@@ -471,14 +471,18 @@ def test_response_evaluation_report(tmpdir_factory):
     response_results = [
         ResponseSelectionEvaluationResult(
             "chitchat",
+            "weather",
             "It's sunny in Berlin",
+            "weather",
             "It's sunny in Berlin",
             "What's the weather",
             0.65432,
         ),
         ResponseSelectionEvaluationResult(
             "chitchat",
+            "bot_name",
             "My name is Mr.bot",
+            "bot_name",
             "My name is Mr.bot",
             "What's your name?",
             0.98765,
@@ -506,7 +510,7 @@ def test_response_evaluation_report(tmpdir_factory):
     }
 
     assert len(report.keys()) == 5
-    assert report["My name is Mr.bot"] == name_query_results
+    assert report["bot_name"] == name_query_results
     assert result["predictions"][1] == prediction
 
 
@@ -591,11 +595,19 @@ def test_empty_intent_removal():
 def test_empty_response_removal():
     response_results = [
         ResponseSelectionEvaluationResult(
-            "chitchat", None, "It's sunny in Berlin", "What's the weather", 0.65432
+            "chitchat",
+            None,
+            None,
+            "It's sunny in Berlin",
+            None,
+            "What's the weather",
+            0.65432,
         ),
         ResponseSelectionEvaluationResult(
             "chitchat",
+            None,
             "My name is Mr.bot",
+            None,
             "My name is Mr.bot",
             "What's your name?",
             0.98765,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Use response selector keys (sub-intents) instead of full response texts as labels for the response selection confusion matrix and report during NLU evaluation, to improve readability.