-
Notifications
You must be signed in to change notification settings - Fork 4.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5989 from RasaHQ/5983_new_nlu_parser
Initial implementation of NLU YAML parser
- Loading branch information
Showing
19 changed files
with
967 additions
and
267 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
nlu: | ||
- intent: estimate_emissions | ||
# Arbitrary metadata | ||
metadata: | ||
author: Some example metadata! | ||
key: value | ||
# Multiline examples, each line is a separate training example. | ||
examples: | | ||
how much CO2 will that use? | ||
how much carbon will a one way flight from [new york]{"entity": "city", "role": "from"} to california produce? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
import rasa.nlu.training_data.entities_parser | ||
import rasa.nlu.training_data.synonyms_parser | ||
import rasa.nlu.training_data.lookup_tables_parser | ||
from rasa.nlu.training_data.loading import load_data | ||
from rasa.nlu.training_data.message import Message | ||
from rasa.nlu.training_data.training_data import TrainingData |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
import re | ||
from json import JSONDecodeError | ||
from typing import Text, List, Dict, Match, Optional, NamedTuple, Any | ||
|
||
from rasa.constants import DOCS_URL_TRAINING_DATA_NLU | ||
from rasa.nlu.constants import ( | ||
ENTITY_ATTRIBUTE_GROUP, | ||
ENTITY_ATTRIBUTE_TYPE, | ||
ENTITY_ATTRIBUTE_ROLE, | ||
ENTITY_ATTRIBUTE_VALUE, | ||
) | ||
from rasa.utils.common import raise_warning | ||
|
||
GROUP_ENTITY_VALUE = "value" | ||
GROUP_ENTITY_TYPE = "entity" | ||
GROUP_ENTITY_DICT = "entity_dict" | ||
GROUP_ENTITY_TEXT = "entity_text" | ||
GROUP_COMPLETE_MATCH = 0 | ||
|
||
# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})` | ||
ENTITY_REGEX = re.compile( | ||
r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})" | ||
) | ||
|
||
|
||
class EntityAttributes(NamedTuple): | ||
"""Attributes of an entity defined in markdown data.""" | ||
|
||
type: Text | ||
value: Text | ||
text: Text | ||
group: Optional[Text] | ||
role: Optional[Text] | ||
|
||
|
||
def find_entities_in_training_example(example: Text) -> List[Dict[Text, Any]]: | ||
"""Extracts entities from an intent example. | ||
Args: | ||
example: Intent example. | ||
Returns: | ||
Extracted entities. | ||
""" | ||
import rasa.nlu.utils as rasa_nlu_utils | ||
|
||
entities = [] | ||
offset = 0 | ||
|
||
for match in re.finditer(ENTITY_REGEX, example): | ||
entity_attributes = extract_entity_attributes(match) | ||
|
||
start_index = match.start() - offset | ||
end_index = start_index + len(entity_attributes.text) | ||
offset += len(match.group(0)) - len(entity_attributes.text) | ||
|
||
entity = rasa_nlu_utils.build_entity( | ||
start_index, | ||
end_index, | ||
entity_attributes.value, | ||
entity_attributes.type, | ||
entity_attributes.role, | ||
entity_attributes.group, | ||
) | ||
entities.append(entity) | ||
|
||
return entities | ||
|
||
|
||
def extract_entity_attributes(match: Match) -> EntityAttributes: | ||
"""Extract the entity attributes, i.e. type, value, etc., from the | ||
regex match. | ||
Args: | ||
match: Regex match to extract the entity attributes from. | ||
Returns: | ||
EntityAttributes object. | ||
""" | ||
entity_text = match.groupdict()[GROUP_ENTITY_TEXT] | ||
|
||
if match.groupdict()[GROUP_ENTITY_DICT]: | ||
return extract_entity_attributes_from_dict(entity_text, match) | ||
|
||
entity_type = match.groupdict()[GROUP_ENTITY_TYPE] | ||
|
||
if match.groupdict()[GROUP_ENTITY_VALUE]: | ||
entity_value = match.groupdict()[GROUP_ENTITY_VALUE] | ||
else: | ||
entity_value = entity_text | ||
|
||
return EntityAttributes(entity_type, entity_value, entity_text, None, None) | ||
|
||
|
||
def extract_entity_attributes_from_dict( | ||
entity_text: Text, match: Match | ||
) -> EntityAttributes: | ||
"""Extract entity attributes from dict format. | ||
Args: | ||
entity_text: Original entity text. | ||
match: Regex match. | ||
Returns: | ||
Extracted entity attributes. | ||
""" | ||
entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT] | ||
entity_dict = get_validated_dict(entity_dict_str) | ||
return EntityAttributes( | ||
entity_dict.get(ENTITY_ATTRIBUTE_TYPE), | ||
entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text), | ||
entity_text, | ||
entity_dict.get(ENTITY_ATTRIBUTE_GROUP), | ||
entity_dict.get(ENTITY_ATTRIBUTE_ROLE), | ||
) | ||
|
||
|
||
def get_validated_dict(json_str: Text) -> Dict[Text, Text]: | ||
"""Converts the provided `json_str` to a valid dict containing the entity | ||
attributes. | ||
Users can specify entity roles, synonyms, groups for an entity in a dict, e.g. | ||
[LA]{"entity": "city", "role": "to", "value": "Los Angeles"}. | ||
Args: | ||
json_str: The entity dict as string without "{}". | ||
Raises: | ||
ValidationError if validation of entity dict fails. | ||
JSONDecodeError if provided entity dict is not valid json. | ||
Returns: | ||
Deserialized and validated `json_str`. | ||
""" | ||
import json | ||
import rasa.utils.validation as validation_utils | ||
import rasa.nlu.schemas.data_schema as schema | ||
|
||
# add {} as they are not part of the regex | ||
try: | ||
data = json.loads(f"{{{json_str}}}") | ||
except JSONDecodeError as e: | ||
raise_warning( | ||
f"Incorrect training data format ('{{{json_str}}}'). Make sure your " | ||
f"data is valid.", | ||
docs=DOCS_URL_TRAINING_DATA_NLU, | ||
) | ||
raise e | ||
|
||
validation_utils.validate_training_data(data, schema.entity_dict_schema()) | ||
|
||
return data | ||
|
||
|
||
def replace_entities(training_example: Text) -> Text: | ||
"""Replace special symbols related to the entities in the provided | ||
training example. | ||
Args: | ||
training_example: Original training example with special symbols. | ||
Returns: | ||
String with removed special symbols. | ||
""" | ||
return re.sub( | ||
ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.