Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in sparse featurizers #5172

Merged
merged 4 commits into from
Feb 3, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog/5171.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix bug ``ValueError: Cannot concatenate sparse features as sequence dimension does not match``.

Training a Rasa model that contains responses for only some of the intents was previously failing.
Fixed the featurizers to return a consistent feature vector in case no response was given for a specific message.
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def _get_processed_message_tokens_by_attribute(
if message.get(attribute) is None:
# return empty string since sklearn countvectorizer does not like None
tabergma marked this conversation as resolved.
Show resolved Hide resolved
# object while training and predicting
return [""]
return []

tokens = self._get_message_tokens_by_attribute(message, attribute)
tokens = self._process_tokens(tokens, attribute)
Expand Down Expand Up @@ -420,7 +420,9 @@ def _create_sequence(
seq_vec.sort_indices()

if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]:
tokens_text = [" ".join(tokens_without_cls)]
tokens_text = (
[" ".join(tokens_without_cls)] if tokens_without_cls else []
)
cls_vec = self.vectorizers[attribute].transform(tokens_text)
cls_vec.sort_indices()

Expand Down Expand Up @@ -489,7 +491,6 @@ def train(

# transform for all attributes
for attribute in self._attributes:

attribute_features = self._get_featurized_attribute(
attribute, processed_attribute_tokens[attribute]
)
Expand Down
44 changes: 44 additions & 0 deletions tests/nlu/featurizers/test_count_vectors_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,50 @@ def test_count_vector_featurizer(sentence, expected, expected_cls):
assert np.all(actual[-1] == expected_cls)


@pytest.mark.parametrize(
    "sentence, intent, response, intent_features, response_features",
    [("hello", "greet", None, [[1]], None), ("hello", "greet", "hi", [[1]], [[1]])],
)
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Check sparse featurization of the intent and response attributes.

    In particular, a training message WITHOUT a response must still receive a
    consistent (empty, shape ``(0, 1)``) sparse response feature matrix rather
    than ``None`` — the regression this PR fixes.
    """
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # a second message that always carries a response, so the response
    # vectorizer has at least one example to train on
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        # np.all avoids relying on single-element array truthiness, matching
        # the assertion style used elsewhere in this module
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None

    if response_features:
        assert np.all(
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0]
            == response_features
        )
    else:
        # no response given: feature must be an EMPTY sparse matrix, not None
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).shape == (
            0,
            1,
        )


@pytest.mark.parametrize(
"sentence, intent, response, intent_features, response_features",
[
Expand Down