Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in sparse featurizers #5172

Merged
merged 4 commits into from
Feb 3, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog/5171.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix bug ``ValueError: Cannot concatenate sparse features as sequence dimension does not match``.

Training a Rasa model that contains responses for only some of the intents was previously failing.
Fixed the featurizers to return a consistent feature vector in case no response was given for a specific message.
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def _get_processed_message_tokens_by_attribute(
if message.get(attribute) is None:
# return empty string since sklearn countvectorizer does not like None
tabergma marked this conversation as resolved.
Show resolved Hide resolved
# object while training and predicting
return [""]
return []

tokens = self._get_message_tokens_by_attribute(message, attribute)
tokens = self._process_tokens(tokens, attribute)
Expand Down Expand Up @@ -420,7 +420,9 @@ def _create_sequence(
seq_vec.sort_indices()

if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
tokens_text = [" ".join(tokens_without_cls)]
tokens_text = (
[" ".join(tokens_without_cls)] if tokens_without_cls else []
)
cls_vec = self.vectorizers[attribute].transform(tokens_text)
cls_vec.sort_indices()

Expand Down Expand Up @@ -489,7 +491,6 @@ def train(

# transform for all attributes
for attribute in self._attributes:

attribute_features = self._get_featurized_attribute(
attribute, processed_attribute_tokens[attribute]
)
Expand Down
44 changes: 44 additions & 0 deletions tests/nlu/featurizers/test_count_vectors_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,50 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
assert np.all(actual[-1] == expected_cls)


@pytest.mark.parametrize(
    "sentence, intent, response, intent_features, response_features",
    [("hello", "greet", None, [[1]], None), ("hello", "greet", "hi", [[1]], [[1]])],
)
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Check sparse featurization of the intent and response attributes.

    In particular, a training message WITHOUT a response must still receive a
    consistent (empty, shape ``(0, 1)``) sparse response feature matrix rather
    than ``None`` — the regression this PR fixes.
    """
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # a second message that always carries a response, so the response
    # vectorizer has at least one example to train on
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        # np.all avoids relying on single-element array truthiness, matching
        # the assertion style used elsewhere in this module
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None

    if response_features:
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
            == response_features
        )
    else:
        # no response given: feature must be an EMPTY sparse matrix, not None
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).shape == (
            0,
            1,
        )


@pytest.mark.parametrize(
"sentence, intent, response, intent_features, response_features",
[
Expand Down