Skip to content

Commit

Permalink
Adding Document.annotate_text() in language.
Browse files Browse the repository at this point in the history
  • Loading branch information
dhermes committed Aug 26, 2016
1 parent 0e6850f commit fadc226
Show file tree
Hide file tree
Showing 2 changed files with 274 additions and 21 deletions.
74 changes: 74 additions & 0 deletions gcloud/language/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

from gcloud.language.entity import Entity
from gcloud.language.sentiment import Sentiment
from gcloud.language.token import Sentence
from gcloud.language.token import Token


DEFAULT_LANGUAGE = 'en-US'
Expand Down Expand Up @@ -185,3 +187,75 @@ def analyze_sentiment(self):
api_response = self.client.connection.api_request(
method='POST', path='analyzeSentiment', data=data)
return Sentiment.from_api_repr(api_response['documentSentiment'])

def annotate_text(self, include_syntax=True, include_entities=True,
                  include_sentiment=True):
    """Advanced natural language API: document syntax and other features.

    Includes the full functionality of :meth:`analyze_entities` and
    :meth:`analyze_sentiment`, enabled by the flags
    ``include_entities`` and ``include_sentiment`` respectively.

    In addition ``include_syntax`` adds a new feature that analyzes
    the document for semantic and syntactic information.

    .. note::

        This API is intended for users who are familiar with machine
        learning and need in-depth text features to build upon.

    .. _annotateText: https://cloud.google.com/natural-language/\
                      reference/rest/v1beta1/documents/annotateText

    See `annotateText`_.

    :type include_syntax: bool
    :param include_syntax: (Optional) Flag to enable syntax analysis
                           of the current document.

    :type include_entities: bool
    :param include_entities: (Optional) Flag to enable entity extraction
                             from the current document.

    :type include_sentiment: bool
    :param include_sentiment: (Optional) Flag to enable sentiment
                              analysis of the current document.

    :rtype: :class:`Annotations`
    :returns: A tuple of each of the four values returned from the API:
              sentences, tokens, sentiment and entities.
    """
    # Map each keyword flag onto the feature name the backend expects;
    # only requested features are sent.
    features = {}
    if include_syntax:
        features['extractSyntax'] = True
    if include_entities:
        features['extractEntities'] = True
    if include_sentiment:
        features['extractDocumentSentiment'] = True

    data = {
        'document': self._to_dict(),
        'features': features,
        'encodingType': self.encoding,
    }
    api_response = self.client.connection.api_request(
        method='POST', path='annotateText', data=data)

    sentences = [Sentence.from_api_repr(sentence)
                 for sentence in api_response['sentences']]
    tokens = [Token.from_api_repr(token)
              for token in api_response['tokens']]
    # 'documentSentiment' is only present when sentiment analysis was
    # requested, so fall back to None rather than indexing.
    sentiment_info = api_response.get('documentSentiment')
    if sentiment_info is None:
        sentiment = None
    else:
        sentiment = Sentiment.from_api_repr(sentiment_info)
    entities = [Entity.from_api_repr(entity)
                for entity in api_response['entities']]
    return Annotations(
        sentences=sentences,
        tokens=tokens,
        sentiment=sentiment,
        entities=entities,
    )
221 changes: 200 additions & 21 deletions gcloud/language/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,86 @@
import unittest


# Shared fixtures for the annotate_text tests: a single-sentence document
# mentioning one LOCATION entity ("Moon"), plus the canned sentiment and
# salience values the fake connection returns.
ANNOTATE_NAME = 'Moon'
ANNOTATE_CONTENT = 'A cow jumped over the %s.' % (ANNOTATE_NAME,)
ANNOTATE_POLARITY = 1
ANNOTATE_MAGNITUDE = 0.2
ANNOTATE_SALIENCE = 0.11793101
ANNOTATE_WIKI_URL = 'http://en.wikipedia.org/wiki/Natural_satellite'


def _make_token_json(name, part_of_speech, head, edge_label):
token_dict = {
'text': {
'content': name,
'beginOffset': -1,
},
'partOfSpeech': {'tag': part_of_speech},
'dependencyEdge': {
'headTokenIndex': head,
'label': edge_label,
},
'lemma': name,
}
return token_dict


def _get_token_and_sentences(include_syntax):
    """Return the (token_info, sentences) fixtures for the fake response."""
    from gcloud.language.token import PartOfSpeech

    if not include_syntax:
        return [], []

    token_info = [
        ('A', PartOfSpeech.DETERMINER, 1, 'DET'),
        ('cow', PartOfSpeech.NOUN, 2, 'NSUBJ'),
        ('jumped', PartOfSpeech.VERB, 2, 'ROOT'),
        ('over', PartOfSpeech.ADPOSITION, 2, 'PREP'),
        ('the', PartOfSpeech.DETERMINER, 5, 'DET'),
        (ANNOTATE_NAME, PartOfSpeech.NOUN, 3, 'POBJ'),
        ('.', PartOfSpeech.PUNCTUATION, 2, 'P'),
    ]
    sentences = [
        {
            'text': {
                'content': ANNOTATE_CONTENT,
                'beginOffset': -1,
            },
        },
    ]
    return token_info, sentences


def _get_entities(include_entities):
    """Return the entity-payload fixtures for the fake API response."""
    from gcloud.language.entity import EntityType

    if not include_entities:
        return []

    mention = {
        'text': {
            'content': ANNOTATE_NAME,
            'beginOffset': -1
        }
    }
    return [
        {
            'name': ANNOTATE_NAME,
            'type': EntityType.LOCATION,
            'metadata': {
                'wikipedia_url': ANNOTATE_WIKI_URL,
            },
            'salience': ANNOTATE_SALIENCE,
            'mentions': [mention],
        },
    ]


class TestDocument(unittest.TestCase):

def _getTargetClass(self):
Expand Down Expand Up @@ -95,8 +175,18 @@ def test__to_dict_with_no_content(self):
'type': klass.PLAIN_TEXT,
})

def test_analyze_entities(self):
def _verify_entity(self, entity, name, entity_type, wiki_url, salience):
    """Assert that ``entity`` carries the expected attribute values."""
    from gcloud.language.entity import Entity

    self.assertIsInstance(entity, Entity)
    checks = (
        (entity.name, name),
        (entity.entity_type, entity_type),
        (entity.wikipedia_url, wiki_url),
        (entity.metadata, {}),
        (entity.salience, salience),
        (entity.mentions, [name]),
    )
    for actual, expected in checks:
        self.assertEqual(actual, expected)

def test_analyze_entities(self):
from gcloud.language.entity import EntityType

name1 = 'R-O-C-K'
Expand Down Expand Up @@ -136,7 +226,7 @@ def test_analyze_entities(self):
],
},
],
'language': 'en',
'language': 'en-US',
}
connection = _Connection(response)
client = _Client(connection=connection)
Expand All @@ -145,31 +235,26 @@ def test_analyze_entities(self):
entities = document.analyze_entities()
self.assertEqual(len(entities), 2)
entity1 = entities[0]
self.assertIsInstance(entity1, Entity)
self.assertEqual(entity1.name, name1)
self.assertEqual(entity1.entity_type, EntityType.OTHER)
self.assertEqual(entity1.wikipedia_url, None)
self.assertEqual(entity1.metadata, {})
self.assertEqual(entity1.salience, salience1)
self.assertEqual(entity1.mentions, [name1])
self._verify_entity(entity1, name1, EntityType.OTHER,
None, salience1)
entity2 = entities[1]
self.assertIsInstance(entity2, Entity)
self.assertEqual(entity2.name, name2)
self.assertEqual(entity2.entity_type, EntityType.LOCATION)
self.assertEqual(entity2.wikipedia_url, wiki2)
self.assertEqual(entity2.metadata, {})
self.assertEqual(entity2.salience, salience2)
self.assertEqual(entity2.mentions, [name2])
self._verify_entity(entity2, name2, EntityType.LOCATION,
wiki2, salience2)

# Verify the request.
self.assertEqual(len(connection._requested), 1)
req = connection._requested[0]
self.assertEqual(req['path'], 'analyzeEntities')
self.assertEqual(req['method'], 'POST')

def test_analyze_sentiment(self):
def _verify_sentiment(self, sentiment, polarity, magnitude):
    """Assert that ``sentiment`` has the expected polarity/magnitude."""
    from gcloud.language.sentiment import Sentiment

    self.assertIsInstance(sentiment, Sentiment)
    self.assertEqual((sentiment.polarity, sentiment.magnitude),
                     (polarity, magnitude))

def test_analyze_sentiment(self):
content = 'All the pretty horses.'
polarity = 1
magnitude = 0.6
Expand All @@ -178,23 +263,117 @@ def test_analyze_sentiment(self):
'polarity': polarity,
'magnitude': magnitude,
},
'language': 'en',
'language': 'en-US',
}
connection = _Connection(response)
client = _Client(connection=connection)
document = self._makeOne(client, content)

sentiment = document.analyze_sentiment()
self.assertIsInstance(sentiment, Sentiment)
self.assertEqual(sentiment.polarity, polarity)
self.assertEqual(sentiment.magnitude, magnitude)
self._verify_sentiment(sentiment, polarity, magnitude)

# Verify the request.
self.assertEqual(len(connection._requested), 1)
req = connection._requested[0]
self.assertEqual(req['path'], 'analyzeSentiment')
self.assertEqual(req['method'], 'POST')

def _verify_sentences(self, include_syntax, annotations):
    """Assert the sentences on ``annotations`` match the syntax flag."""
    from gcloud.language.token import Sentence

    if not include_syntax:
        self.assertEqual(annotations.sentences, [])
        return

    self.assertEqual(len(annotations.sentences), 1)
    sentence = annotations.sentences[0]
    self.assertIsInstance(sentence, Sentence)
    self.assertEqual(sentence.content, ANNOTATE_CONTENT)
    self.assertEqual(sentence.begin, -1)

def _verify_tokens(self, annotations, token_info):
    """Assert each token on ``annotations`` matches its fixture tuple."""
    from gcloud.language.token import Token

    self.assertEqual(len(annotations.tokens), len(token_info))
    for token, (name, speech_tag, head, label) in zip(
            annotations.tokens, token_info):
        self.assertIsInstance(token, Token)
        self.assertEqual(token.text_content, name)
        self.assertEqual(token.text_begin, -1)
        self.assertEqual(token.part_of_speech, speech_tag)
        self.assertEqual(token.edge_index, head)
        self.assertEqual(token.edge_label, label)
        self.assertEqual(token.lemma, name)

def _annotate_text_helper(self, include_sentiment,
                          include_entities, include_syntax):
    """Drive annotate_text() through a fake connection and verify both
    the parsed Annotations and the outgoing request for the given flags.
    """
    from gcloud.language.document import Annotations
    from gcloud.language.entity import EntityType

    token_info, sentences = _get_token_and_sentences(include_syntax)
    entities = _get_entities(include_entities)
    response = {
        'sentences': sentences,
        'tokens': [_make_token_json(*info) for info in token_info],
        'entities': entities,
        'language': 'en-US',
    }
    if include_sentiment:
        response['documentSentiment'] = {
            'polarity': ANNOTATE_POLARITY,
            'magnitude': ANNOTATE_MAGNITUDE,
        }

    connection = _Connection(response)
    client = _Client(connection=connection)
    document = self._makeOne(client, ANNOTATE_CONTENT)

    annotations = document.annotate_text(
        include_syntax=include_syntax, include_entities=include_entities,
        include_sentiment=include_sentiment)
    self.assertIsInstance(annotations, Annotations)
    # Sentences and tokens are verified by the shared helpers.
    self._verify_sentences(include_syntax, annotations)
    self._verify_tokens(annotations, token_info)
    # Sentiment is only populated when it was requested.
    if include_sentiment:
        self._verify_sentiment(annotations.sentiment,
                               ANNOTATE_POLARITY, ANNOTATE_MAGNITUDE)
    else:
        self.assertIsNone(annotations.sentiment)
    # Likewise for the single fixture entity.
    if include_entities:
        self.assertEqual(len(annotations.entities), 1)
        self._verify_entity(annotations.entities[0], ANNOTATE_NAME,
                            EntityType.LOCATION, ANNOTATE_WIKI_URL,
                            ANNOTATE_SALIENCE)
    else:
        self.assertEqual(annotations.entities, [])

    # Verify the request: one POST to annotateText whose features map
    # mirrors exactly the flags passed in.
    self.assertEqual(len(connection._requested), 1)
    request = connection._requested[0]
    self.assertEqual(request['path'], 'annotateText')
    self.assertEqual(request['method'], 'POST')
    features = request['data']['features']
    flag_to_feature = (
        (include_sentiment, 'extractDocumentSentiment'),
        (include_entities, 'extractEntities'),
        (include_syntax, 'extractSyntax'),
    )
    for flag, feature_name in flag_to_feature:
        self.assertEqual(features.get(feature_name, False), flag)

def test_annotate_text(self):
    # All three features enabled at once.
    self._annotate_text_helper(include_sentiment=True,
                               include_entities=True,
                               include_syntax=True)

def test_annotate_text_sentiment_only(self):
    # Only sentiment analysis requested.
    self._annotate_text_helper(include_sentiment=True,
                               include_entities=False,
                               include_syntax=False)

def test_annotate_text_entities_only(self):
    # Only entity extraction requested.
    self._annotate_text_helper(include_sentiment=False,
                               include_entities=True,
                               include_syntax=False)

def test_annotate_text_syntax_only(self):
    # Only syntax analysis requested.
    self._annotate_text_helper(include_sentiment=False,
                               include_entities=False,
                               include_syntax=True)


class _Connection(object):

Expand Down

0 comments on commit fadc226

Please sign in to comment.