Skip to content

Commit

Permalink
Adding Document.annotate_text() in language.
Browse files Browse the repository at this point in the history
  • Loading branch information
dhermes committed Aug 26, 2016
1 parent 0e6850f commit fadc226
Show file tree
Hide file tree
Showing 2 changed files with 274 additions and 21 deletions.
74 changes: 74 additions & 0 deletions gcloud/language/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

from gcloud.language.entity import Entity
from gcloud.language.sentiment import Sentiment
from gcloud.language.token import Sentence
from gcloud.language.token import Token


DEFAULT_LANGUAGE = 'en-US'
Expand Down Expand Up @@ -185,3 +187,75 @@ def analyze_sentiment(self):
api_response = self.client.connection.api_request(
method='POST', path='analyzeSentiment', data=data)
return Sentiment.from_api_repr(api_response['documentSentiment'])

def annotate_text(self, include_syntax=True, include_entities=True,
                  include_sentiment=True):
    """Advanced natural language API: document syntax and other features.

    Includes the full functionality of :meth:`analyze_entities` and
    :meth:`analyze_sentiment`, enabled by the flags
    ``include_entities`` and ``include_sentiment`` respectively.

    In addition ``include_syntax`` adds a new feature that analyzes
    the document for semantic and syntactic information.

    .. note::

        This API is intended for users who are familiar with machine
        learning and need in-depth text features to build upon.

    .. _annotateText: https://cloud.google.com/natural-language/\
                      reference/rest/v1beta1/documents/annotateText

    See `annotateText`_.

    :type include_syntax: bool
    :param include_syntax: (Optional) Flag to enable syntax analysis
                           of the current document.

    :type include_entities: bool
    :param include_entities: (Optional) Flag to enable entity extraction
                             from the current document.

    :type include_sentiment: bool
    :param include_sentiment: (Optional) Flag to enable sentiment
                              analysis of the current document.

    :rtype: :class:`Annotations`
    :returns: A tuple of each of the four values returned from the API:
              sentences, tokens, sentiment and entities.
    """
    # Map each keyword flag onto the feature name the backend expects;
    # only requested features are sent.
    features = {}
    if include_syntax:
        features['extractSyntax'] = True
    if include_entities:
        features['extractEntities'] = True
    if include_sentiment:
        features['extractDocumentSentiment'] = True

    data = {
        'document': self._to_dict(),
        'features': features,
        'encodingType': self.encoding,
    }
    api_response = self.client.connection.api_request(
        method='POST', path='annotateText', data=data)

    sentences = [Sentence.from_api_repr(sentence)
                 for sentence in api_response['sentences']]
    tokens = [Token.from_api_repr(token)
              for token in api_response['tokens']]
    # 'documentSentiment' is only present when sentiment analysis was
    # requested, so fall back to None rather than indexing.
    sentiment_info = api_response.get('documentSentiment')
    if sentiment_info is None:
        sentiment = None
    else:
        sentiment = Sentiment.from_api_repr(sentiment_info)
    entities = [Entity.from_api_repr(entity)
                for entity in api_response['entities']]
    return Annotations(
        sentences=sentences,
        tokens=tokens,
        sentiment=sentiment,
        entities=entities,
    )
221 changes: 200 additions & 21 deletions gcloud/language/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,86 @@
import unittest


# Shared fixtures for the annotate_text tests: a single-sentence document
# mentioning one LOCATION entity ("Moon"), plus the canned sentiment and
# salience values the fake connection returns.
ANNOTATE_NAME = 'Moon'
ANNOTATE_CONTENT = 'A cow jumped over the %s.' % (ANNOTATE_NAME,)
ANNOTATE_POLARITY = 1
ANNOTATE_MAGNITUDE = 0.2
ANNOTATE_SALIENCE = 0.11793101
ANNOTATE_WIKI_URL = 'http://en.wikipedia.org/wiki/Natural_satellite'


def _make_token_json(name, part_of_speech, head, edge_label):
token_dict = {
'text': {
'content': name,
'beginOffset': -1,
},
'partOfSpeech': {'tag': part_of_speech},
'dependencyEdge': {
'headTokenIndex': head,
'label': edge_label,
},
'lemma': name,
}
return token_dict


def _get_token_and_sentences(include_syntax):
    """Return the (token_info, sentences) fixtures for the fake response."""
    from gcloud.language.token import PartOfSpeech

    if not include_syntax:
        return [], []

    token_info = [
        ('A', PartOfSpeech.DETERMINER, 1, 'DET'),
        ('cow', PartOfSpeech.NOUN, 2, 'NSUBJ'),
        ('jumped', PartOfSpeech.VERB, 2, 'ROOT'),
        ('over', PartOfSpeech.ADPOSITION, 2, 'PREP'),
        ('the', PartOfSpeech.DETERMINER, 5, 'DET'),
        (ANNOTATE_NAME, PartOfSpeech.NOUN, 3, 'POBJ'),
        ('.', PartOfSpeech.PUNCTUATION, 2, 'P'),
    ]
    sentences = [
        {
            'text': {
                'content': ANNOTATE_CONTENT,
                'beginOffset': -1,
            },
        },
    ]
    return token_info, sentences


def _get_entities(include_entities):
    """Return the entity-payload fixtures for the fake API response."""
    from gcloud.language.entity import EntityType

    if not include_entities:
        return []

    mention = {
        'text': {
            'content': ANNOTATE_NAME,
            'beginOffset': -1
        }
    }
    return [
        {
            'name': ANNOTATE_NAME,
            'type': EntityType.LOCATION,
            'metadata': {
                'wikipedia_url': ANNOTATE_WIKI_URL,
            },
            'salience': ANNOTATE_SALIENCE,
            'mentions': [mention],
        },
    ]


class TestDocument(unittest.TestCase):

def _getTargetClass(self):
Expand Down Expand Up @@ -95,8 +175,18 @@ def test__to_dict_with_no_content(self):
'type': klass.PLAIN_TEXT,
})

def test_analyze_entities(self):
def _verify_entity(self, entity, name, entity_type, wiki_url, salience):
    """Assert that ``entity`` carries the expected attribute values."""
    from gcloud.language.entity import Entity

    self.assertIsInstance(entity, Entity)
    checks = (
        (entity.name, name),
        (entity.entity_type, entity_type),
        (entity.wikipedia_url, wiki_url),
        (entity.metadata, {}),
        (entity.salience, salience),
        (entity.mentions, [name]),
    )
    for actual, expected in checks:
        self.assertEqual(actual, expected)

def test_analyze_entities(self):
from gcloud.language.entity import EntityType

name1 = 'R-O-C-K'
Expand Down Expand Up @@ -136,7 +226,7 @@ def test_analyze_entities(self):
],
},
],
'language': 'en',
'language': 'en-US',
}
connection = _Connection(response)
client = _Client(connection=connection)
Expand All @@ -145,31 +235,26 @@ def test_analyze_entities(self):
entities = document.analyze_entities()
self.assertEqual(len(entities), 2)
entity1 = entities[0]
self.assertIsInstance(entity1, Entity)
self.assertEqual(entity1.name, name1)
self.assertEqual(entity1.entity_type, EntityType.OTHER)
self.assertEqual(entity1.wikipedia_url, None)
self.assertEqual(entity1.metadata, {})
self.assertEqual(entity1.salience, salience1)
self.assertEqual(entity1.mentions, [name1])
self._verify_entity(entity1, name1, EntityType.OTHER,
None, salience1)
entity2 = entities[1]
self.assertIsInstance(entity2, Entity)
self.assertEqual(entity2.name, name2)
self.assertEqual(entity2.entity_type, EntityType.LOCATION)
self.assertEqual(entity2.wikipedia_url, wiki2)
self.assertEqual(entity2.metadata, {})
self.assertEqual(entity2.salience, salience2)
self.assertEqual(entity2.mentions, [name2])
self._verify_entity(entity2, name2, EntityType.LOCATION,
wiki2, salience2)

# Verify the request.
self.assertEqual(len(connection._requested), 1)
req = connection._requested[0]
self.assertEqual(req['path'], 'analyzeEntities')
self.assertEqual(req['method'], 'POST')

def test_analyze_sentiment(self):
def _verify_sentiment(self, sentiment, polarity, magnitude):
    """Assert that ``sentiment`` has the expected polarity/magnitude."""
    from gcloud.language.sentiment import Sentiment

    self.assertIsInstance(sentiment, Sentiment)
    self.assertEqual((sentiment.polarity, sentiment.magnitude),
                     (polarity, magnitude))

def test_analyze_sentiment(self):
content = 'All the pretty horses.'
polarity = 1
magnitude = 0.6
Expand All @@ -178,23 +263,117 @@ def test_analyze_sentiment(self):
'polarity': polarity,
'magnitude': magnitude,
},
'language': 'en',
'language': 'en-US',
}
connection = _Connection(response)
client = _Client(connection=connection)
document = self._makeOne(client, content)

sentiment = document.analyze_sentiment()
self.assertIsInstance(sentiment, Sentiment)
self.assertEqual(sentiment.polarity, polarity)
self.assertEqual(sentiment.magnitude, magnitude)
self._verify_sentiment(sentiment, polarity, magnitude)

# Verify the request.
self.assertEqual(len(connection._requested), 1)
req = connection._requested[0]
self.assertEqual(req['path'], 'analyzeSentiment')
self.assertEqual(req['method'], 'POST')

def _verify_sentences(self, include_syntax, annotations):
    """Assert the sentences on ``annotations`` match the syntax flag."""
    from gcloud.language.token import Sentence

    if not include_syntax:
        self.assertEqual(annotations.sentences, [])
        return

    self.assertEqual(len(annotations.sentences), 1)
    sentence = annotations.sentences[0]
    self.assertIsInstance(sentence, Sentence)
    self.assertEqual(sentence.content, ANNOTATE_CONTENT)
    self.assertEqual(sentence.begin, -1)

def _verify_tokens(self, annotations, token_info):
    """Assert each token on ``annotations`` matches its fixture tuple."""
    from gcloud.language.token import Token

    self.assertEqual(len(annotations.tokens), len(token_info))
    for token, (name, speech_tag, head, label) in zip(
            annotations.tokens, token_info):
        self.assertIsInstance(token, Token)
        self.assertEqual(token.text_content, name)
        self.assertEqual(token.text_begin, -1)
        self.assertEqual(token.part_of_speech, speech_tag)
        self.assertEqual(token.edge_index, head)
        self.assertEqual(token.edge_label, label)
        self.assertEqual(token.lemma, name)

def _annotate_text_helper(self, include_sentiment,
                          include_entities, include_syntax):
    """Drive annotate_text() through a fake connection and verify both
    the parsed Annotations and the outgoing request for the given flags.
    """
    from gcloud.language.document import Annotations
    from gcloud.language.entity import EntityType

    token_info, sentences = _get_token_and_sentences(include_syntax)
    entities = _get_entities(include_entities)
    response = {
        'sentences': sentences,
        'tokens': [_make_token_json(*info) for info in token_info],
        'entities': entities,
        'language': 'en-US',
    }
    if include_sentiment:
        response['documentSentiment'] = {
            'polarity': ANNOTATE_POLARITY,
            'magnitude': ANNOTATE_MAGNITUDE,
        }

    connection = _Connection(response)
    client = _Client(connection=connection)
    document = self._makeOne(client, ANNOTATE_CONTENT)

    annotations = document.annotate_text(
        include_syntax=include_syntax, include_entities=include_entities,
        include_sentiment=include_sentiment)
    self.assertIsInstance(annotations, Annotations)
    # Sentences and tokens are verified by the shared helpers.
    self._verify_sentences(include_syntax, annotations)
    self._verify_tokens(annotations, token_info)
    # Sentiment is only populated when it was requested.
    if include_sentiment:
        self._verify_sentiment(annotations.sentiment,
                               ANNOTATE_POLARITY, ANNOTATE_MAGNITUDE)
    else:
        self.assertIsNone(annotations.sentiment)
    # Likewise for the single fixture entity.
    if include_entities:
        self.assertEqual(len(annotations.entities), 1)
        self._verify_entity(annotations.entities[0], ANNOTATE_NAME,
                            EntityType.LOCATION, ANNOTATE_WIKI_URL,
                            ANNOTATE_SALIENCE)
    else:
        self.assertEqual(annotations.entities, [])

    # Verify the request: one POST to annotateText whose features map
    # mirrors exactly the flags passed in.
    self.assertEqual(len(connection._requested), 1)
    request = connection._requested[0]
    self.assertEqual(request['path'], 'annotateText')
    self.assertEqual(request['method'], 'POST')
    features = request['data']['features']
    flag_to_feature = (
        (include_sentiment, 'extractDocumentSentiment'),
        (include_entities, 'extractEntities'),
        (include_syntax, 'extractSyntax'),
    )
    for flag, feature_name in flag_to_feature:
        self.assertEqual(features.get(feature_name, False), flag)

def test_annotate_text(self):
    # All three features enabled at once.
    self._annotate_text_helper(include_sentiment=True,
                               include_entities=True,
                               include_syntax=True)

def test_annotate_text_sentiment_only(self):
    # Only sentiment analysis requested.
    self._annotate_text_helper(include_sentiment=True,
                               include_entities=False,
                               include_syntax=False)

def test_annotate_text_entities_only(self):
    # Only entity extraction requested.
    self._annotate_text_helper(include_sentiment=False,
                               include_entities=True,
                               include_syntax=False)

def test_annotate_text_syntax_only(self):
    # Only syntax analysis requested.
    self._annotate_text_helper(include_sentiment=False,
                               include_entities=False,
                               include_syntax=True)


class _Connection(object):

Expand Down

0 comments on commit fadc226

Please sign in to comment.