From 5fb9defe08e429bf3ac996f03a9d4f41aa7e7324 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Wed, 6 Apr 2016 11:11:12 -0700 Subject: [PATCH 01/36] Add samples for natural language api. Change-Id: I089b244f1a0a2210fffab6f5747eed9f6150147a --- .coveragerc | 1 + conftest.py | 24 +- language/README.md | 17 + language/api/README.md | 87 +++++ language/api/analyze.py | 115 ++++++ language/api/analyze_test.py | 258 +++++++++++++ language/api/requirements.txt | 1 + language/movie_nl/README.md | 154 ++++++++ language/movie_nl/main.py | 346 +++++++++++++++++ language/movie_nl/main_test.py | 82 ++++ language/movie_nl/requirements.txt | 3 + language/ocr_nl/README.md | 227 +++++++++++ language/ocr_nl/main.py | 362 ++++++++++++++++++ language/ocr_nl/main_test.py | 97 +++++ language/ocr_nl/requirements.txt | 1 + language/syntax_triples/README.md | 91 +++++ language/syntax_triples/main.py | 180 +++++++++ language/syntax_triples/main_test.py | 50 +++ language/syntax_triples/requirements.txt | 1 + .../resources/obama_wikipedia.txt | 1 + 20 files changed, 2097 insertions(+), 1 deletion(-) create mode 100644 language/README.md create mode 100644 language/api/README.md create mode 100644 language/api/analyze.py create mode 100644 language/api/analyze_test.py create mode 100644 language/api/requirements.txt create mode 100644 language/movie_nl/README.md create mode 100644 language/movie_nl/main.py create mode 100644 language/movie_nl/main_test.py create mode 100644 language/movie_nl/requirements.txt create mode 100644 language/ocr_nl/README.md create mode 100755 language/ocr_nl/main.py create mode 100755 language/ocr_nl/main_test.py create mode 100644 language/ocr_nl/requirements.txt create mode 100644 language/syntax_triples/README.md create mode 100644 language/syntax_triples/main.py create mode 100755 language/syntax_triples/main_test.py create mode 100644 language/syntax_triples/requirements.txt create mode 100644 language/syntax_triples/resources/obama_wikipedia.txt diff --git a/.coveragerc b/.coveragerc index a0523ed4a49e..cb53bfdaf96d 100644 --- a/.coveragerc +++ b/.coveragerc @@ -9,6 +9,7 @@ include = dns/* datastore/* error_reporting/* + language/* managed_vms/* monitoring/* speech/* diff --git a/conftest.py b/conftest.py index 14876c24eac7..3fa68de6953b 100644 --- a/conftest.py +++ b/conftest.py @@ -15,9 +15,10 @@ import os import pytest +import requests -class Namespace: +class Namespace(object): def __init__(self, **kwargs): self.__dict__.update(kwargs) @@ -48,3 +49,24 @@ def resource(request): testing resource""" local_path = os.path.dirname(request.module.__file__) return lambda *args: get_resource_path(args, local_path) + + +def fetch_gcs_resource(resource, tmpdir, _chunk_size=1024): + resp = requests.get(resource, stream=True) + dest_file = str(tmpdir.join(os.path.basename(resource))) + with open(dest_file, 'wb') as f: + for chunk in resp.iter_content(_chunk_size): + f.write(chunk) + + return dest_file + + +@pytest.fixture(scope='module') +def remote_resource(cloud_config): + """Provides a function that downloads the given resource from Cloud + Storage, returning the path to the downloaded resource.""" + remote_uri = 'http://storage.googleapis.com/{}/'.format( + cloud_config.storage_bucket) + + return lambda path, tmpdir: fetch_gcs_resource( + remote_uri + path.strip('/'), tmpdir) diff --git a/language/README.md b/language/README.md new file mode 100644 index 000000000000..e63d45eb9a6a --- /dev/null +++ b/language/README.md @@ -0,0 +1,17 @@ +# Google Cloud Natural Language API examples + 
+This directory contains Python examples that use the +[Google Cloud Natural Language API](https://cloud.google.com/natural-language/). + +- [api](api) has a simple command line tool that shows off the API's features. + +- [movie_nl](movie_nl) combines sentiment and entity analysis to come up with +actors/directors who are the most and least popular in the imdb movie reviews. + +- [ocr_nl](ocr_nl) uses the [Cloud Vision API](https://cloud.google.com/vision/) +to extract text from images, then uses the NL API to extract entity information +from those texts, and stores the extracted information in a database in support +of further analysis and correlation. + +- [syntax_triples](syntax_triples) uses syntax analysis to find +subject-verb-object triples in a given piece of text. diff --git a/language/api/README.md b/language/api/README.md new file mode 100644 index 000000000000..9625df30c89f --- /dev/null +++ b/language/api/README.md @@ -0,0 +1,87 @@ + +# Google Cloud Natural Language API Sample + +This Python sample demonstrates the use of the [Google Cloud Natural Language API][NL-Docs] +for sentiment, entity, and syntax analysis. + +[NL-Docs]: https://cloud.google.com/natural-language/docs/ + +## Setup + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +## Run the sample + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +```sh +$ pip install -r requirements.txt +``` + +Then, run the script: + +```sh +$ python analyze.py +``` + +where `` is one of: `entities`, `sentiment`, or `syntax`. + +The script will write to STDOUT the json returned from the API for the requested feature. + +For example, if you run: + +```sh +$ python analyze.py entities "Tom Sawyer is a book written by a guy known as Mark Twain." +``` + +You will see something like the following returned: + +``` +{ + "entities": [ + { + "salience": 0.49785897, + "mentions": [ + { + "text": { + "content": "Tom Sawyer", + "beginOffset": 0 + } + } + ], + "type": "PERSON", + "name": "Tom Sawyer", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/The_Adventures_of_Tom_Sawyer" + } + }, + { + "salience": 0.12209519, + "mentions": [ + { + "text": { + "content": "Mark Twain", + "beginOffset": 47 + } + } + ], + "type": "PERSON", + "name": "Mark Twain", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/Mark_Twain" + } + } + ], + "language": "en" +} +``` diff --git a/language/api/analyze.py b/language/api/analyze.py new file mode 100644 index 000000000000..73e892c354a1 --- /dev/null +++ b/language/api/analyze.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Analyzes text using the Google Cloud Natural Language API.""" + +import argparse +import json +import sys + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials + + +def get_service(): + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + return discovery.build('language', 'v1beta1', http=http) + + +def get_native_encoding_type(): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + +def analyze_entities(text, encoding='UTF32'): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'encodingType': encoding, + } + + service = get_service() + + request = service.documents().analyzeEntities(body=body) + response = request.execute() + + return response + + +def analyze_sentiment(text): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + } + } + + service = get_service() + + request = service.documents().analyzeSentiment(body=body) + response = request.execute() + + return response + + +def analyze_syntax(text, encoding='UTF32'): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': True, + }, + 'encodingType': encoding, + } + + service = get_service() + + request = service.documents().annotateText(body=body) + response = request.execute() + + return response + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('command', choices=[ + 'entities', 'sentiment', 'syntax']) + parser.add_argument('text') + + args = parser.parse_args() + + if args.command == 'entities': + result = analyze_entities(args.text, get_native_encoding_type()) + elif args.command == 'sentiment': + result = analyze_sentiment(args.text) + elif args.command == 'syntax': + result = analyze_syntax(args.text, get_native_encoding_type()) + + print(json.dumps(result, indent=2)) diff --git a/language/api/analyze_test.py b/language/api/analyze_test.py new file mode 100644 index 000000000000..11b0d65d6299 --- /dev/null +++ b/language/api/analyze_test.py @@ -0,0 +1,258 @@ +# Copyright 2016, Google, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
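"""Tests for analyze.py.

These tests call the live Cloud Natural Language API, so they expect
GOOGLE_APPLICATION_CREDENTIALS to point at valid service account
credentials, as described in this directory's README.
"""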
+ +import textwrap + +import analyze + + +def test_analyze_entities(): + result = analyze.analyze_entities( + 'Tom Sawyer is a book written by a guy known as Mark Twain.') + + assert result['language'] == 'en' + entities = result['entities'] + assert len(entities) + subject = entities[0] + assert subject['type'] == 'PERSON' + assert subject['name'].startswith('Tom') + + +def test_analyze_sentiment(capsys): + result = analyze.analyze_sentiment( + 'your face is really ugly and i hate it.') + + sentiment = result['documentSentiment'] + assert sentiment['polarity'] < 0 + assert sentiment['magnitude'] < 1 + + result = analyze.analyze_sentiment( + 'cheerio, mate - I greatly admire the pallor of your visage, and your ' + 'angle of repose leaves little room for improvement.') + + sentiment = result['documentSentiment'] + assert sentiment['polarity'] > 0 + assert sentiment['magnitude'] < 1 + + +def test_analyze_syntax(capsys): + result = analyze.analyze_syntax(textwrap.dedent(u'''\ + Keep away from people who try to belittle your ambitions. Small people + always do that, but the really great make you feel that you, too, can + become great. + - Mark Twain''')) + + assert len(result['tokens']) + first_token = result['tokens'][0] + assert first_token['text']['content'] == 'Keep' + assert first_token['partOfSpeech']['tag'] == 'VERB' + assert len(result['sentences']) > 1 + assert result['language'] == 'en' + + +def test_analyze_syntax_utf8(): + """Demonstrate the interpretation of the offsets when encoding=utf8. + + UTF8 is a variable-length encoding, where each character is at least 8 + bits. The offsets we get should be the index of the first byte of the + character. + """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf8') + result = analyze.analyze_syntax(test_string, encoding='UTF8') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + offset = tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+1].decode('utf8') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = tokens[1]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+2].decode('utf8') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = tokens[2]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+2].decode('utf8') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = tokens[3]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+4].decode('utf8') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. + assert tokens[4]['text']['content'] == u'b' + offset = tokens[4]['text'].get('beginOffset', 0) + # 'b' is only one byte long + assert (byte_array[offset:offset+1].decode('utf8') == + tokens[4]['text']['content']) + + +def test_analyze_syntax_utf16(): + """Demonstrate the interpretation of the offsets when encoding=utf16. + + UTF16 is a variable-length encoding, where each character is at least 16 + bits. The returned offsets will be the index of the first 2-byte character + of the token. 
+ """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf16') + # Remove the byte order marker, which the offsets don't account for + byte_array = byte_array[2:] + result = analyze.analyze_syntax(test_string, encoding='UTF16') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + # The offset is an offset into an array where each entry is 16 bits. Since + # we have an 8-bit array, the offsets should be doubled to index into our + # array. + offset = 2 * tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = 2 * tokens[1]['text'].get('beginOffset', 0) + # A UTF16 character with a low codepoint is 16 bits (2 bytes) long, so + # slice out 2 bytes starting from the offset. Then interpret the bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = 2 * tokens[2]['text'].get('beginOffset', 0) + # A UTF16 character with a low codepoint is 16 bits (2 bytes) long, so + # slice out 2 bytes starting from the offset. Then interpret the bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = 2 * tokens[3]['text'].get('beginOffset', 0) + # A UTF16 character with a high codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret those bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 4].decode('utf16') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. + assert tokens[4]['text']['content'] == u'b' + offset = 2 * tokens[4]['text'].get('beginOffset', 0) + # Even though 'b' is only one byte long, utf16 still encodes it using 16 + # bits + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[4]['text']['content']) + + +def test_annotate_text_utf32(): + """Demonstrate the interpretation of the offsets when encoding=utf32. + + UTF32 is a fixed-length encoding, where each character is exactly 32 bits. + The returned offsets will be the index of the first 4-byte character + of the token. + + Python unicode objects index by the interpreted unicode character. This + means a given unicode character only ever takes up one slot in a unicode + string. This is equivalent to indexing into a UTF32 string, where all + characters are a fixed length and thus will only ever take up one slot. + + Thus, if you're indexing into a python unicode object, you can set + encoding to UTF32 to index directly into the unicode object (as opposed to + the byte arrays, as these examples do). + + Nonetheless, this test still demonstrates indexing into the byte array, for + consistency. Note that you could just index into the origin test_string + unicode object with the raw offset returned by the api (ie without + multiplying it by 4, as it is below). 
+ """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf32') + # Remove the byte order marker, which the offsets don't account for + byte_array = byte_array[4:] + result = analyze.analyze_syntax(test_string, encoding='UTF32') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + # The offset is an offset into an array where each entry is 32 bits. Since + # we have an 8-bit array, the offsets should be quadrupled to index into + # our array. + offset = 4 * tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = 4 * tokens[1]['text'].get('beginOffset', 0) + # A UTF32 character with a low codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret the bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = 4 * tokens[2]['text'].get('beginOffset', 0) + # A UTF32 character with a low codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret the bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = 4 * tokens[3]['text'].get('beginOffset', 0) + # A UTF32 character with a high codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret those bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. + assert tokens[4]['text']['content'] == u'b' + offset = 4 * tokens[4]['text'].get('beginOffset', 0) + # Even though 'b' is only one byte long, utf32 still encodes it using 32 + # bits + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[4]['text']['content']) + + +def test_annotate_text_utf32_directly_index_into_unicode(): + """Demonstrate using offsets directly, using encoding=utf32. + + See the explanation for test_annotate_text_utf32. Essentially, indexing + into a utf32 array is equivalent to indexing into a python unicode object. 
+ """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + result = analyze.analyze_syntax(test_string, encoding='UTF32') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + offset = tokens[0]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[0]['text']['content'] + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = tokens[1]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[1]['text']['content'] + + assert tokens[2]['text']['content'] == u'\u0201' + offset = tokens[2]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[2]['text']['content'] + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = tokens[3]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[3]['text']['content'] + + assert tokens[4]['text']['content'] == u'b' + offset = tokens[4]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[4]['text']['content'] diff --git a/language/api/requirements.txt b/language/api/requirements.txt new file mode 100644 index 000000000000..0b96c82ee4c2 --- /dev/null +++ b/language/api/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md new file mode 100644 index 000000000000..b651dee8bb73 --- /dev/null +++ b/language/movie_nl/README.md @@ -0,0 +1,154 @@ +# Introduction +This sample is an application of the Google Cloud Platform Natural Language API. +It uses the [imdb movie reviews data set](https://www.cs.cornell.edu/people/pabo/movie-review-data/) +from [Cornell University](http://www.cs.cornell.edu/) and performs sentiment & entity +analysis on it. It combines the capabilities of sentiment analysis and entity recognition +to come up with actors/directors who are the most and least popular. + +### Set Up to Authenticate With Your Project's Credentials + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +**Note:** If you get an error saying your API hasn't been enabled, make sure +that you have correctly set this environment variable, and that the project that +you got the service account from has the Natural Language API enabled. + +## How it works +This sample uses the Natural Language API to annotate the input text. The +movie review document is broken into sentences using the `extract_syntax` feature. +Each sentence is sent to the API for sentiment analysis. The positive and negative +sentiment values are combined to come up with a single overall sentiment of the +movie document. + +In addition to the sentiment, the program also extracts the entities of type +`PERSON`, who are the actors in the movie (including the director and anyone +important). These entities are assigned the sentiment value of the document to +come up with the most and least popular actors/directors. + +### Movie document +We define a movie document as a set of reviews. These reviews are individual +sentences and we use the NL API to extract the sentences from the document. See +an example movie document below. + +``` + Sample review sentence 1. 
Sample review sentence 2. Sample review sentence 3. +``` + +### Sentences and Sentiment +Each sentence from the above document is assigned a sentiment as below. + +``` + Sample review sentence 1 => Sentiment 1 + Sample review sentence 2 => Sentiment 2 + Sample review sentence 3 => Sentiment 3 +``` + +### Sentiment computation +The final sentiment is computed by simply adding the sentence sentiments. + +``` + Total Sentiment = Sentiment 1 + Sentiment 2 + Sentiment 3 +``` + + +### Entity extraction and Sentiment assignment +Entities with type `PERSON` are extracted from the movie document using the NL +API. Since these entities are mentioned in their respective movie document, +they are associated with the document sentiment. + +``` + Document 1 => Sentiment 1 + + Person 1 + Person 2 + Person 3 + + Document 2 => Sentiment 2 + + Person 2 + Person 4 + Person 5 +``` + +Based on the above data we can calculate the sentiment associated with Person 2: + +``` + Person 2 => (Sentiment 1 + Sentiment 2) +``` + +## Movie Data Set +We have used the Cornell Movie Review data as our input. Please follow the instructions below to download and extract the data. + +### Download Instructions + +``` + $ curl -O http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip + $ unzip mix20_rand700_tokens.zip +``` + +## Command Line Usage +In order to use the movie analyzer, follow the instructions below. (Note that the `--sample` parameter below runs the script on +fewer documents, and can be omitted to run it on the entire corpus) + +### Install Dependencies + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +Then, install dependencies by running the following pip command: + +``` +$ pip install -r requirements.txt +``` +### How to Run + +``` +$ python main.py --inp "tokens/*/*" \ + --sout sentiment.json \ + --eout entity.json \ + --sample 5 +``` + +You should see the log file `movie.log` created. + +## Output Data +The program produces sentiment and entity output in json format. For example: + +### Sentiment Output +``` + { + "doc_id": "cv310_tok-16557.txt", + "sentiment": 3.099, + "label": -1 + } +``` + +### Entity Output + +``` + { + "name": "Sean Patrick Flanery", + "wiki_url": "http://en.wikipedia.org/wiki/Sean_Patrick_Flanery", + "sentiment": 3.099 + } +``` + +### Entity Output Sorting +In order to sort and rank the entities generated, use the same `main.py` script. For example, +this will print the top 5 actors with negative sentiment: + +``` +$ python main.py --inp entity.json \ + --sout sentiment.json \ + --eout entity.json \ + --sentiment neg \ + --reverse True \ + --sample 5 +``` diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py new file mode 100644 index 000000000000..380f495f9673 --- /dev/null +++ b/language/movie_nl/main.py @@ -0,0 +1,346 @@ +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
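"""Analyzes IMDb movie reviews using the Google Cloud Natural Language API.

Each review document is scored for sentiment, the score is attributed to the
PERSON entities mentioned in the review, and the aggregated entity output can
then be ranked. See the README in this directory for setup and usage details.
argparse surfaces this docstring as the --help description, since main()
passes description=__doc__.
"""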
+ +import argparse +import codecs +import glob +import json +import logging +import os + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials +import requests + + +def analyze_document(service, document): + """Analyze the document and get the distribution of sentiments and + the movie name.""" + logging.info('Analyzing {}'.format(document.doc_id)) + + sentences, entities = document.extract_all_sentences(service) + sentiments = [get_sentiment(service, sentence) for sentence in sentences] + + return sentiments, entities + + +def get_request_body(text, syntax=True, entities=True, sentiment=True): + """Creates the body of the request to the language api in + order to get an appropriate api response.""" + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': syntax, + 'extract_entities': entities, + 'extract_document_sentiment': sentiment, + }, + 'encoding_type': 'UTF32' + } + + return body + + +def get_sentiment(service, sentence): + """Get the sentence-level sentiment.""" + body = get_request_body( + sentence, syntax=False, entities=True, sentiment=True) + + docs = service.documents() + request = docs.annotateText(body=body) + response = request.execute() + sentiment = response.get("documentSentiment") + + if sentiment is None: + return (None, None) + else: + pol = sentiment.get("polarity") + mag = sentiment.get("magnitude") + + if pol is None and mag is not None: + pol = 0 + return (pol, mag) + + +class Document(object): + """Document class captures a single document of movie reviews.""" + + def __init__(self, text, doc_id, doc_path): + self.text = text + self.doc_id = doc_id + self.doc_path = doc_path + self.sentent_pair = None + self.label = None + + def extract_all_sentences(self, service): + """Extract the sentences in a document.""" + + if self.sentent_pair is None: + docs = service.documents() + request_body = get_request_body( + self.text, + syntax=True, + entities=True, + sentiment=False) + request = docs.annotateText(body=request_body) + + ent_list = [] + + response = request.execute() + entities = response.get('entities', []) + sentences = response.get('sentences', []) + + sent_list = [ + sentence.get('text').get('content')for sentence in sentences + ] + + for entity in entities: + ent_type = entity.get('type') + wiki_url = entity.get('metadata', {}).get('wikipedia_url') + + if ent_type == 'PERSON' and wiki_url is not None: + ent_list.append(wiki_url) + + self.sentent_pair = (sent_list, ent_list) + + return self.sentent_pair + + +def to_sentiment_json(doc_id, sent, label): + """Convert the sentiment info to json.""" + json_doc = {} + + json_doc['doc_id'] = doc_id + json_doc['sentiment'] = float('%.3f' % sent) + json_doc['label'] = label + + return json.dumps(json_doc) + + +def get_wiki_title(wiki_url): + """Get the wikipedia page title for a given wikipedia URL.""" + try: + content = requests.get(wiki_url).text + return content.split('title')[1].split('-')[0].split('>')[1].strip() + except: + return os.path.basename(wiki_url).replace('_', ' ') + + +def to_entity_json(entity, e_tuple): + """Convert the entity info to json.""" + json_doc = {} + + avg_sentiment = float(e_tuple[0]) / float(e_tuple[1]) + + json_doc['wiki_url'] = entity + json_doc['name'] = get_wiki_title(entity) + json_doc['sentiment'] = float('%.3f' % e_tuple[0]) + json_doc['avg_sentiment'] = float('%.3f' % avg_sentiment) + + return json.dumps(json_doc) + + +def get_sentiment_entities(service, document): + 
"""Compute the overall sentiment volume in the document""" + sentiments, entities = analyze_document(service, document) + + sentiments = [sent for sent in sentiments if sent[0] is not None] + negative_sentiments = [ + polarity for polarity, magnitude in sentiments if polarity < 0.0] + positive_sentiments = [ + polarity for polarity, magnitude in sentiments if polarity > 0.0] + + negative = sum(negative_sentiment) + positive = sum(positive_sentiment) + total = positive + negative + + return (total, entities) + + +def get_sentiment_label(sentiment): + """Return the sentiment label based on the sentiment quantity.""" + if sentiment < 0: + return -1 + elif sentiment > 0: + return 1 + else: + return 0 + + +def process_movie_reviews(service, reader, sentiment_writer, entity_writer): + """Perform some sentiment math and come up with movie review.""" + collected_entities = {} + + for document in reader: + try: + sentiment_total, entities = get_sentiment_entities( + service, document) + document.label = get_sentiment_label(sentiment_total) + + sentiment_writer.write( + to_sentiment_json( + document.doc_id, + sentiment_total, + document.label + ) + ) + + sentiment_writer.write('\n') + + for ent in entities: + ent_sent, frequency = collected_entities.get(ent, (0, 0)) + ent_sent += sentiment_total + frequency += 1 + + collected_entities[ent] = (ent_sent, frequency) + + except Exception: + logging.info('Skipping {}'.format(document.doc_id)) + + for entity, e_tuple in collected_entities.items(): + entity_writer.write(to_entity_json(entity, e_tuple)) + entity_writer.write('\n') + + sentiment_writer.flush() + entity_writer.flush() + + +def document_generator(dir_path_pattern, count=None): + """Generator for the input movie documents.""" + for running_count, item in enumerate(glob.iglob(dir_path_pattern)): + if count and running_count >= count: + raise StopIteration() + + doc_id = os.path.basename(item) + + with codecs.open(item, encoding='utf-8') as f: + try: + text = f.read() + except UnicodeDecodeError: + text = None + + yield Document(text, doc_id, item) + + +def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False): + """Rank the entities (actors) based on their sentiment + assigned from the movie.""" + + items = [] + for item in reader: + json_item = json.loads(item) + sent = json_item.get('sentiment') + entity_item = (sent, json_item) + + if sentiment: + if sentiment == 'pos' and sent > 0: + items.append(entity_item) + elif sentiment == 'neg' and sent < 0: + items.append(entity_item) + else: + items.append(entity_item) + + items.sort(reverse=True) + items = [json.dumps(item[1]) for item in items] + + if reverse_bool: + items.reverse() + + if topn: + print('\n'.join(items[:topn])) + else: + print('\n'.join(items)) + + +def get_service(): + """Build a client to the Google Cloud Natural Language API.""" + + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + return discovery.build('language', 'v1beta1', http=http) + + +def main(input_dir, sent_out, ent_out, sample, log_file, + operation, sentiment, ent_in, reverse_bool): + """Movie demo main program""" + + sample = int(sample) if sample else None + + if operation == 'rank': + with open(ent_in) as reader: + rank_entities(reader, sentiment, sample, reverse_bool) + else: + # Create logger settings + logging.basicConfig(filename=log_file, level=logging.DEBUG) + + # 
Create a Google Service object + service = get_service() + + # Create a sentiment output writer + sentiment_writer = open(sent_out, 'w') + + # Create an entity output writer + entity_writer = open(ent_out, 'w') + + reader = document_generator(input_dir, sample) + + # Process the movie documents + process_movie_reviews(service, reader, sentiment_writer, entity_writer) + + # close reader and writers + sentiment_writer.close() + entity_writer.close() + reader.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('--inp', help='location of the input', required=True) + parser.add_argument( + '--sout', help='location of the sentiment output', required=True) + parser.add_argument( + '--eout', help='location of the entity output', required=True) + parser.add_argument('--sample', help='number of top items to process') + parser.add_argument( + '--op', + help='operation to perform "rank" or "analyze"', + default='analyze') + parser.add_argument( + '--sentiment', help='filter sentiment as "neg" or "pos"') + parser.add_argument( + '--ein', help='location of entity input') + parser.add_argument( + '--reverse', help='reverse the order of the items') + + args = parser.parse_args() + + log_file = 'movie.log' + + main(args.inp, + args.sout, + args.eout, + args.sample, + log_file, + args.op, + args.sentiment, + args.ein, + args.reverse) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py new file mode 100644 index 000000000000..96907908018d --- /dev/null +++ b/language/movie_nl/main_test.py @@ -0,0 +1,82 @@ +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
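"""Tests for the movie_nl sample in main.py.

test_process_movie_reviews calls the live Natural Language API and therefore
needs valid application default credentials to be configured.
"""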
+ +import io +import json + +import main + + +def test_get_request_body(): + text = 'hello world' + body = main.get_request_body(text, syntax=True, entities=True, + sentiment=False) + assert body.get('document').get('content') == text + + assert body.get('features').get('extract_syntax') is True + assert body.get('features').get('extract_entities') is True + assert body.get('features').get('extract_document_sentiment') is False + + +def test_get_sentiment_label(): + assert main.get_sentiment_label(20.50) == 1 + assert main.get_sentiment_label(-42.34) == -1 + + +def test_to_sentiment_json(): + doc_id = '12345' + sentiment = 23.344564 + label = 1 + + sentiment_json = json.loads( + main.to_sentiment_json(doc_id, sentiment, label) + ) + + assert sentiment_json.get('doc_id') == doc_id + assert sentiment_json.get('sentiment') == 23.345 + assert sentiment_json.get('label') == label + + +def test_process_movie_reviews(): + service = main.get_service() + + doc1 = main.Document('Top Gun was awesome and Tom Cruise rocked!', 'doc1', + 'doc1') + doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') + + reader = [doc1, doc2] + swriter = io.StringIO() + ewriter = io.StringIO() + + main.process_movie_reviews(service, reader, swriter, ewriter) + + sentiments = swriter.getvalue().strip().split('\n') + entities = ewriter.getvalue().strip().split('\n') + + sentiments = [json.loads(sentiment) for sentiment in sentiments] + entities = [json.loads(entity) for entity in entities] + + # assert sentiments + assert sentiments[0].get('sentiment') == 1.0 + assert sentiments[0].get('label') == 1 + + assert sentiments[1].get('sentiment') == 1.0 + assert sentiments[1].get('label') == 1 + + # assert entities + assert len(entities) == 1 + assert entities[0].get('name') == 'Tom Cruise' + assert (entities[0].get('wiki_url') == + 'http://en.wikipedia.org/wiki/Tom_Cruise') + assert entities[0].get('sentiment') == 2.0 diff --git a/language/movie_nl/requirements.txt b/language/movie_nl/requirements.txt new file mode 100644 index 000000000000..391be2e98434 --- /dev/null +++ b/language/movie_nl/requirements.txt @@ -0,0 +1,3 @@ +urlparse2==1.1.1 +google-api-python-client==1.5.1 +requests==2.10.0 diff --git a/language/ocr_nl/README.md b/language/ocr_nl/README.md new file mode 100644 index 000000000000..189e93979010 --- /dev/null +++ b/language/ocr_nl/README.md @@ -0,0 +1,227 @@ + +# Using the Cloud Natural Language API to analyze image text found with Cloud Vision + +This example uses the [Cloud Vision API](https://cloud.google.com/vision/) to +detect text in images, then analyzes that text using the [Cloud NL (Natural +Language) API](https://cloud.google.com/natural-language/) to detect +[entities](https://cloud.google.com/natural-language/docs/basics#entity_analysis) +in the text. It stores the detected entity +information in an [sqlite3](https://www.sqlite.org) database, which may then be +queried. + +(This kind of analysis can be useful with scans of brochures and fliers, +invoices, and other types of company documents... or maybe just organizing your +memes). + +After the example script has analyzed a directory of images, it outputs some +information on the images' entities to STDOUT. You can also further query +the generated sqlite3 database. + +## Setup + +### Install sqlite3 as necessary + +The example requires that sqlite3 be installed. Most likely, sqlite3 is already +installed for you on your machine, but if not, you can find it +[here](https://www.sqlite.org/download.html). 
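
The Python part of this sample uses the `sqlite3` module that ships with
CPython's standard library; the standalone `sqlite3` command-line shell is
only needed for the interactive queries described later in this README. A
quick way to confirm the module is available:

```
import sqlite3
print(sqlite3.sqlite_version)
```
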
+ +### Set Up to Authenticate With Your Project's Credentials + +* Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. +* Following those steps, make sure that you [Set Up a Service + Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), + and export the following environment variable: + + ``` + export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json + ``` +* This sample also requires that you [enable the Cloud Vision + API](https://console.cloud.google.com/apis/api/vision.googleapis.com/overview?project=_) + +## Running the example + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +```sh +$ pip install -r requirements.txt +``` + +You must also be set up to authenticate with the Cloud APIs using your +project's service account credentials, as described above. + +Then, run the script on a directory of images to do the analysis, E.g.: + +```sh +$ python main.py --input_directory= +``` + +You can try this on a sample directory of images: + +```sh +$ curl -O http://storage.googleapis.com/python-docs-samples-tests/language/ocr_nl-images.zip +$ unzip ocr_nl-images.zip +$ python main.py --input_directory=images/ +``` + +## A walkthrough of the example and its results + +Let's take a look at what the example generates when run on the `images/` +sample directory, and how it does it. + +The script looks at each image file in the given directory, and uses the Vision +API's text detection capabilities (OCR) to find any text in each image. It +passes that info to the NL API, and asks it to detect [entities](xxx) in the +discovered text, then stores this information in a queryable database. + +To keep things simple, we're just passing to the NL API all the text found in a +given image, in one string. Note that sometimes this string can include +misinterpreted characters (if the image text was not very clear), or list words +"out of order" from how a human would interpret them. So, the text that is +actually passed to the NL API might not be quite what you would have predicted +with your human eyeballs. + +The Entity information returned by the NL API includes *type*, *name*, *salience*, +information about where in the text the given entity was found, and detected +language. It may also include *metadata*, including a link to a Wikipedia URL +that the NL API believes this entity maps to. See the +[documentation](https://cloud.google.com/natural-language/docs/) and the [API +reference pages](https://cloud.google.com/natural-language/reference/rest/v1beta1/Entity) +for more information about `Entity` fields. + +For example, if the NL API was given the sentence: + +``` +"Holmes and Watson walked over to the cafe." 
+``` + +it would return a response something like the following: + +``` +{ + "entities": [{ + "salience": 0.51629782, + "mentions": [{ + "text": { + "content": "Holmes", + "beginOffset": 0 + }}], + "type": "PERSON", + "name": "Holmes", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/Sherlock_Holmes" + }}, + { + "salience": 0.22334209, + "mentions": [{ + "text": { + "content": "Watson", + "beginOffset": 11 + }}], + "type": "PERSON", + "name": "Watson", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/Dr._Watson" + }}], + "language": "en" +} +``` + +Note that the NL API determined from context that "Holmes" was referring to +'Sherlock Holmes', even though the name "Sherlock" was not included. + +Note also that not all nouns in a given sentence are detected as Entities. An +Entity represents a phrase in the text that is a known entity, such as a person, +an organization, or location. The generic mention of a 'cafe' is not treated as +an entity in this sense. + +For each image file, we store its detected entity information (if any) in an +sqlite3 database. + +### Querying for information about the detected entities + +Once the detected entity information from all the images is stored in the +sqlite3 database, we can run some queries to do some interesting analysis. The +script runs a couple of such example query sets and outputs the result to STDOUT. + +The first set of queries outputs information about the top 15 most frequent +entity names found in the images, and the second outputs information about the +top 15 most frequent Wikipedia URLs found. + +For example, with the sample image set, note that the name 'Sherlock Holmes' is +found three times, but entities associated with the URL +http://en.wikipedia.org/wiki/Sherlock_Holmes are found four times; one of the +entity names was only "Holmes", but the NL API detected from context that it +referred to Sherlock Holmes. Similarly, you can see that mentions of 'Hive' and +'Spark' mapped correctly – given their context – to the URLs of those Apache +products. + +``` +----entity: http://en.wikipedia.org/wiki/Apache_Hive was found with count 1 +Found in file images/IMG_20160621_133020.jpg, detected as type OTHER, with + locale en. +names(s): set([u'hive']) +salience measure(s): set([0.0023808887]) +``` + +Similarly, 'Elizabeth' (in screencaps of text from "Pride and Prejudice") is +correctly mapped to http://en.wikipedia.org/wiki/Elizabeth_Bennet because of the +context of the surrounding text. + +``` +----entity: http://en.wikipedia.org/wiki/Elizabeth_Bennet was found with count 2 +Found in file images/Screenshot 2016-06-19 11.51.50.png, detected as type PERSON, with + locale en. +Found in file images/Screenshot 2016-06-19 12.08.30.png, detected as type PERSON, with + locale en. +names(s): set([u'elizabeth']) +salience measure(s): set([0.34601286, 0.0016268975]) +``` + +## Further queries to the sqlite3 database + +When the script runs, it makes a couple of example queries to the database +containing the entity information returned from the NL API. You can make further +queries on that database by starting up sqlite3 from the command line, and +passing it the name of the database file generated by running the example. This +file will be in the same directory, and have `entities` as a prefix, with the +timestamp appended. (If you have run the example more than once, a new database +file will be created each time). 
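
If you'd rather explore the results from Python than from the sqlite3 shell,
a short sketch along these lines works against the same database (the
filename below is just an example; use whichever `entities<timestamp>.db`
file the script created for you):

```
import sqlite3

# Example filename; substitute the database file generated by main.py.
conn = sqlite3.connect('entities1466518508.db')
for name, count in conn.execute(
        'SELECT name, COUNT(name) AS wc FROM entities '
        'GROUP BY name ORDER BY wc DESC LIMIT 15;'):
    print('{}: {}'.format(name, count))
conn.close()
```
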
+ +Run sqlite3 as follows (using the name of your own database file): + +```sh +$ sqlite3 entities1466518508.db +``` + +You'll see something like this: + +``` +SQLite version 3.8.10.2 2015-05-20 18:17:19 +Enter ".help" for usage hints. +sqlite> +``` + +From this prompt, you can make any queries on the data that you want. E.g., +start with something like: + +``` +sqlite> select * from entities limit 20; +``` + +Or, try this to see in which images the most entities were detected: + +``` +sqlite> select filename, count(filename) from entities group by filename; +``` + +You can do more complex queries to get further information about the entities +that have been discovered in your images. E.g., you might want to investigate +which of the entities are most commonly found together in the same image. See +the [SQLite documentation](https://www.sqlite.org/docs.html) for more +information. + + diff --git a/language/ocr_nl/main.py b/language/ocr_nl/main.py new file mode 100755 index 000000000000..6e329f53386e --- /dev/null +++ b/language/ocr_nl/main.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example uses the Google Cloud Vision API to detect text in images, then +analyzes that text using the Google Cloud Natural Language API to detect +entities in the text. It stores the detected entity information in an sqlite3 +database, which may then be queried. + +After this script has analyzed a directory of images, it outputs some +information on the images' entities to STDOUT. You can also further query +the generated sqlite3 database; see the README for more information. + +Run the script on a directory of images to do the analysis, E.g.: + $ python main.py --input_directory= + +You can try this on a sample directory of images: + $ curl -O http://storage.googleapis.com/python-docs-samples-tests/language/ocr_nl-images.zip + $ unzip ocr_nl-images.zip + $ python main.py --input_directory=images/ + +""" # noqa + +import argparse +import base64 +import contextlib +import logging +import os +import sqlite3 +import sys +import time + +from googleapiclient import discovery +from googleapiclient import errors +import httplib2 +from oauth2client.client import GoogleCredentials + +BATCH_SIZE = 10 + + +class VisionApi(object): + """Construct and use the Cloud Vision API service.""" + + def __init__(self): + credentials = GoogleCredentials.get_application_default() + self.service = discovery.build('vision', 'v1', credentials=credentials) + + def detect_text(self, input_filenames, num_retries=3, max_results=6): + """Uses the Vision API to detect text in the given file.""" + batch_request = [] + for filename in input_filenames: + request = { + 'image': {}, + 'features': [{ + 'type': 'TEXT_DETECTION', + 'maxResults': max_results, + }] + } + + # Accept both files in cloud storage, as well as local files. 
+ if filename.startswith('gs://'): + request['image']['source'] = { + 'gcsImageUri': filename + } + else: + with open(filename, 'rb') as image_file: + request['image']['content'] = base64.b64encode( + image_file.read()).decode('UTF-8') + + batch_request.append(request) + + request = self.service.images().annotate( + body={'requests': batch_request}) + + try: + responses = request.execute(num_retries=num_retries) + if 'responses' not in responses: + return {} + + text_response = {} + for filename, response in zip( + input_filenames, responses['responses']): + + if 'error' in response: + logging.error('API Error for {}: {}'.format( + filename, + response['error'].get('message', ''))) + continue + + text_response[filename] = response.get('textAnnotations', []) + + return text_response + + except errors.HttpError as e: + logging.error('Http Error for {}: {}'.format(filename, e)) + except KeyError as e2: + logging.error('Key error: {}'.format(e2)) + + +class TextAnalyzer(object): + """Construct and use the Google Natural Language API service.""" + + def __init__(self, db_filename=None): + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + self.service = discovery.build('language', 'v1beta1', http=http) + + # This list will store the entity information gleaned from the + # image files. + self.entity_info = [] + + # This is the filename of the sqlite3 database to save to + self.db_filename = db_filename or 'entities{}.db'.format( + int(time.time())) + + def _get_native_encoding_type(self): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + def nl_detect(self, text): + """Use the Natural Language API to analyze the given text string.""" + # We're only requesting 'entity' information from the Natural Language + # API at this time. + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'encodingType': self._get_native_encoding_type(), + } + entities = [] + try: + request = self.service.documents().analyzeEntities(body=body) + response = request.execute() + entities = response['entities'] + except errors.HttpError as e: + logging.error('Http Error: %s' % e) + except KeyError as e2: + logging.error('Key error: %s' % e2) + return entities + + def add_entities(self, filename, locale, document): + """Apply the Natural Language API to the document, and collect the + detected entities.""" + + # Apply the Natural Language API to the document. + entities = self.nl_detect(document) + self.extract_and_save_entity_info(entities, locale, filename) + + def extract_entity_info(self, entity): + """Extract information about an entity.""" + type = entity['type'] + name = entity['name'].lower() + metadata = entity['metadata'] + salience = entity['salience'] + wiki_url = metadata.get('wikipedia_url', None) + return (type, name, salience, wiki_url) + + def extract_and_save_entity_info(self, entities, locale, filename): + for entity in entities: + type, name, salience, wiki_url = self.extract_entity_info(entity) + # Because this is a small example, we're using a list to hold + # all the entity information, then we'll insert it into the + # database all at once when we've processed all the files. + # For a larger data set, you would want to write to the database + # in batches. 
+ self.entity_info.append( + (locale, type, name, salience, wiki_url, filename)) + + def write_entity_info_to_db(self): + """Store the info gleaned about the entities in the text, via the + Natural Language API, in an sqlite3 database table, and then print out + some simple analytics. + """ + logging.info('Saving entity info to the sqlite3 database.') + # Create the db. + with contextlib.closing(sqlite3.connect(self.db_filename)) as conn: + with conn as cursor: + # Create table + cursor.execute( + 'CREATE TABLE if not exists entities (locale text, ' + 'type text, name text, salience real, wiki_url text, ' + 'filename text)') + with conn as cursor: + # Load all the data + cursor.executemany( + 'INSERT INTO entities VALUES (?,?,?,?,?,?)', + self.entity_info) + + def output_entity_data(self): + """Output some info about the entities by querying the generated + sqlite3 database. + """ + + with contextlib.closing(sqlite3.connect(self.db_filename)) as conn: + + # This query finds the number of times each entity name was + # detected, in descending order by count, and returns information + # about the first 15 names, including the files in which they were + # found, their detected 'salience' and language (locale), and the + # wikipedia urls (if any) associated with them. + print('\n==============\nTop 15 most frequent entity names:') + + cursor = conn.cursor() + results = cursor.execute( + 'select name, count(name) as wc from entities ' + 'group by name order by wc desc limit 15;') + + for item in results: + cursor2 = conn.cursor() + print(u'\n----Name: {} was found with count {}'.format(*item)) + results2 = cursor2.execute( + 'SELECT name, type, filename, locale, wiki_url, salience ' + 'FROM entities WHERE name=?', (item[0],)) + urls = set() + for elt in results2: + print(('Found in file {}, detected as type {}, with\n' + ' locale {} and salience {}.').format( + elt[2], elt[1], elt[3], elt[5])) + if elt[4]: + urls.add(elt[4]) + if urls: + print('url(s): {}'.format(urls)) + + # This query finds the number of times each wikipedia url was + # detected, in descending order by count, and returns information + # about the first 15 urls, including the files in which they were + # found and the names and 'salience' with which they were + # associated. + print('\n==============\nTop 15 most frequent Wikipedia URLs:') + c = conn.cursor() + results = c.execute( + 'select wiki_url, count(wiki_url) as wc from entities ' + 'group by wiki_url order by wc desc limit 15;') + + for item in results: + cursor2 = conn.cursor() + print('\n----entity: {} was found with count {}'.format(*item)) + results2 = cursor2.execute( + 'SELECT name, type, filename, locale, salience ' + 'FROM entities WHERE wiki_url=?', (item[0],)) + names = set() + salience = set() + for elt in results2: + print(('Found in file {}, detected as type {}, with\n' + ' locale {}.').format(elt[2], elt[1], elt[3])) + names.add(elt[0]) + salience.add(elt[4]) + print('names(s): {}'.format(names)) + print('salience measure(s): {}'.format(salience)) + + +def extract_description(texts): + """Returns text annotations as a single string""" + document = [] + + for text in texts: + try: + document.append(text['description']) + locale = text['locale'] + # Process only the first entry, which contains all + # text detected. 
+ break + except KeyError as e: + logging.error('KeyError: %s\n%s' % (e, text)) + return (locale, ' '.join(document)) + + +def extract_descriptions(input_filename, texts, text_analyzer): + """Gets the text that was detected in the image.""" + if texts: + locale, document = extract_description(texts) + text_analyzer.add_entities(input_filename, locale, document) + sys.stdout.write('.') # Output a progress indicator. + sys.stdout.flush() + elif texts == []: + print('%s had no discernible text.' % input_filename) + + +def get_text_from_files(vision, input_filenames, text_analyzer): + """Call the Vision API on a file and index the results.""" + texts = vision.detect_text(input_filenames) + if texts: + for filename, text in texts.items(): + extract_descriptions(filename, text, text_analyzer) + + +def batch(list_to_batch, batch_size=BATCH_SIZE): + """Group a list into batches of size batch_size. + + >>> tuple(batch([1, 2, 3, 4, 5], batch_size=2)) + ((1, 2), (3, 4), (5)) + """ + for i in range(0, len(list_to_batch), batch_size): + yield tuple(list_to_batch[i:i + batch_size]) + + +def main(input_dir, db_filename=None): + """Walk through all the image files in the given directory, extracting any + text from them and feeding that text to the Natural Language API for + analysis. + """ + # Create a client object for the Vision API + vision_api_client = VisionApi() + # Create an object to analyze our text using the Natural Language API + text_analyzer = TextAnalyzer(db_filename) + + if input_dir: + allfileslist = [] + # Recursively construct a list of all the files in the given input + # directory. + for folder, subs, files in os.walk(input_dir): + for filename in files: + allfileslist.append(os.path.join(folder, filename)) + + # Analyze the text in the files using the Vision and Natural Language + # APIs. + for filenames in batch(allfileslist, batch_size=1): + get_text_from_files(vision_api_client, filenames, text_analyzer) + + # Save the result to a database, then run some queries on the database, + # with output to STDOUT. + text_analyzer.write_entity_info_to_db() + + # now, print some information about the entities detected. + text_analyzer.output_entity_data() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Detects text in the images in the given directory.') + parser.add_argument( + '--input_directory', + help='The image directory you\'d like to detect text in. If left ' + 'unspecified, the --db specified will be queried without being ' + 'updated.') + parser.add_argument( + '--db', help='The filename to use for the sqlite3 database.') + args = parser.parse_args() + + if not (args.input_directory or args.db): + parser.error('Either --input_directory or --db must be specified.') + + main(args.input_directory, args.db) diff --git a/language/ocr_nl/main_test.py b/language/ocr_nl/main_test.py new file mode 100755 index 000000000000..c07ed747ea0f --- /dev/null +++ b/language/ocr_nl/main_test.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for main."""
+
+import re
+import zipfile
+
+import main
+
+
+_TEST_IMAGE_URI = 'gs://{}/language/image8.png'
+
+
+def test_batch_empty():
+    for batch_size in range(1, 10):
+        assert len(
+            list(main.batch([], batch_size=batch_size))) == 0
+
+
+def test_batch_single():
+    for batch_size in range(1, 10):
+        batched = tuple(main.batch([1], batch_size=batch_size))
+        assert batched == ((1,),)
+
+
+def test_single_image_returns_text(cloud_config):
+    vision_api_client = main.VisionApi()
+
+    image_path = _TEST_IMAGE_URI.format(cloud_config.storage_bucket)
+    texts = vision_api_client.detect_text([image_path])
+
+    assert image_path in texts
+    _, document = main.extract_description(texts[image_path])
+    assert "daughter" in document
+    assert "Bennet" in document
+    assert "hat" in document
+
+
+def test_single_nonimage_returns_error():
+    vision_api_client = main.VisionApi()
+    texts = vision_api_client.detect_text(['README.md'])
+    assert "README.md" not in texts
+
+
+def test_text_returns_entities():
+    text = "Holmes and Watson walked to the cafe."
+    text_analyzer = main.TextAnalyzer()
+    entities = text_analyzer.nl_detect(text)
+    assert len(entities) == 2
+    etype, ename, salience, wurl = text_analyzer.extract_entity_info(
+        entities[0])
+    assert ename == 'holmes'
+    assert wurl == 'http://en.wikipedia.org/wiki/Sherlock_Holmes'
+
+
+def test_entities_list(cloud_config):
+    vision_api_client = main.VisionApi()
+    image_path = _TEST_IMAGE_URI.format(cloud_config.storage_bucket)
+    texts = vision_api_client.detect_text([image_path])
+    locale, document = main.extract_description(texts[image_path])
+    text_analyzer = main.TextAnalyzer()
+    entities = text_analyzer.nl_detect(document)
+    assert len(entities) == 4
+    etype, ename, salience, wurl = text_analyzer.extract_entity_info(
+        entities[0])
+    assert ename == 'bennet'
+    assert wurl == 'http://en.wikipedia.org/wiki/Mr_Bennet'
+
+
+def test_main(remote_resource, tmpdir, capsys):
+    images_path = str(tmpdir.mkdir('images'))
+
+    # First, pull down some test data
+    zip_path = remote_resource('language/ocr_nl-images-small.zip', tmpdir)
+
+    # Extract it to the image directory
+    with zipfile.ZipFile(zip_path) as zfile:
+        zfile.extractall(images_path)
+
+    main.main(images_path, str(tmpdir.join('ocr_nl.db')))
+
+    stdout, _ = capsys.readouterr()
+
+    assert re.search(r'google was found with count', stdout)
diff --git a/language/ocr_nl/requirements.txt b/language/ocr_nl/requirements.txt
new file mode 100644
index 000000000000..0b96c82ee4c2
--- /dev/null
+++ b/language/ocr_nl/requirements.txt
@@ -0,0 +1 @@
+google-api-python-client==1.5.1
diff --git a/language/syntax_triples/README.md b/language/syntax_triples/README.md
new file mode 100644
index 000000000000..1342ee65289d
--- /dev/null
+++ b/language/syntax_triples/README.md
@@ -0,0 +1,91 @@
+# Using the Cloud Natural Language API to find subject-verb-object triples in text
+
+This example finds subject-verb-object triples in a given piece of text using
+the syntax analysis capabilities of the
+[Cloud Natural Language API](https://cloud.google.com/natural-language/).
+To do this, it calls the extractSyntax feature of the API
+and uses the dependency parse tree and part-of-speech tags in the response
+to build the subject-verb-object triples. The results are printed to STDOUT.
+This type of analysis can be considered a
+first step towards an information extraction task.
+ +## Set Up to Authenticate With Your Project's Credentials + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +## Running the example + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +``` +$ pip install -r requirements.txt +``` +You must also be set up to authenticate with the Cloud APIs using your +project's service account credentials, as described above. + +Then, run the script on a file containing the text that you wish to analyze. +The text must be encoded in UTF8 or ASCII: + +``` +$ python main.py +``` + +Try this on a sample text in the resources directory: + +``` +$ python main.py resources/obama_wikipedia.txt +``` + +## A walkthrough of the example and its results + +Let's take a look at what the example generates when run on the +`obama_wikipedia.txt` sample file, and how it does it. + +The goal is to find all subject-verb-object +triples in the text. The example first sends the text to the Cloud Natural +Language API to perform extractSyntax analysis. Then, using part-of-speech tags, + it finds all the verbs in the text. For each verb, it uses the dependency +parse tree information to find all the dependent tokens. + +For example, given the following sentence in the `obama_wikipedia.txt` file: + +``` +"He began his presidential campaign in 2007" +``` +The example finds the verb `began`, and `He`, `campaign`, and `in` as its +dependencies. Then the script enumerates the dependencies for each verb and +finds all the subjects and objects. For the sentence above, the found subject +and object are `He` and `campaign`. + +The next step is to complete each subject and object token by adding their +dependencies to them. For example, in the sentence above, `his` and +`presidential` are dependent tokens for `campaign`. This is done using the +dependency parse tree, similar to verb dependencies as explained above. The +final result is (`He`, `began`, `his presidential campaign`) triple for +the example sentence above. + +The script performs this analysis for the entire text and prints the result. +For the `obama_wikipedia.txt` file, the result is the following: + +```sh ++------------------------------+------------+------------------------------+ +| Obama | received | national attention | ++------------------------------+------------+------------------------------+ +| He | began | his presidential campaign | ++------------------------------+------------+------------------------------+ +| he | won | sufficient delegates in the | +| | | Democratic Party primaries | ++------------------------------+------------+------------------------------+ +| He | defeated | Republican nominee John | +| | | McCain | +``` diff --git a/language/syntax_triples/main.py b/language/syntax_triples/main.py new file mode 100644 index 000000000000..1be174bff04c --- /dev/null +++ b/language/syntax_triples/main.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example finds subject-verb-object triples in a given piece of text using +the syntax analysis capabilities of Cloud Natural Language API. The triples are +printed to STDOUT. This can be considered as the first step towards an +information extraction task. + +Run the script on a file containing the text that you wish to analyze. +The text must be encoded in UTF8 or ASCII: + $ python main.py + +Try this on a sample text in the resources directory: + $ python main.py resources/obama_wikipedia.txt +""" + +import argparse +import sys +import textwrap + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials + + +def dependents(tokens, head_index): + """Returns an ordered list of the token indices of the dependents for + the given head.""" + # Create head->dependency index. + head_to_deps = {} + for i, token in enumerate(tokens): + head = token['dependencyEdge']['headTokenIndex'] + if i != head: + head_to_deps.setdefault(head, []).append(i) + return head_to_deps.get(head_index, ()) + + +def phrase_text_for_head(tokens, text, head_index): + """Returns the entire phrase containing the head token + and its dependents. + """ + begin, end = phrase_extent_for_head(tokens, head_index) + return text[begin:end] + + +def phrase_extent_for_head(tokens, head_index): + """Returns the begin and end offsets for the entire phrase + containing the head token and its dependents. + """ + begin = tokens[head_index]['text']['beginOffset'] + end = begin + len(tokens[head_index]['text']['content']) + for child in dependents(tokens, head_index): + child_begin, child_end = phrase_extent_for_head(tokens, child) + begin = min(begin, child_begin) + end = max(end, child_end) + return (begin, end) + + +def analyze_syntax(text): + """Use the NL API to analyze the given text string, and returns the + response from the API. Requests an encodingType that matches + the encoding used natively by Python. Raises an + errors.HTTPError if there is a connection problem. 
+ """ + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + service = discovery.build( + 'language', 'v1beta1', http=http) + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': True, + }, + 'encodingType': get_native_encoding_type(), + } + request = service.documents().annotateText(body=body) + return request.execute() + + +def get_native_encoding_type(): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + +def find_triples(tokens, + left_dependency_label='NSUBJ', + head_part_of_speech='VERB', + right_dependency_label='DOBJ'): + """Generator function that searches the given tokens + with the given part of speech tag, that have dependencies + with the given labels. For each such head found, yields a tuple + (left_dependent, head, right_dependent), where each element of the + tuple is an index into the tokens array. + """ + for head, token in enumerate(tokens): + if token['partOfSpeech']['tag'] == head_part_of_speech: + children = dependents(tokens, head) + left_deps = [] + right_deps = [] + for child in children: + child_token = tokens[child] + child_dep_label = child_token['dependencyEdge']['label'] + if child_dep_label == left_dependency_label: + left_deps.append(child) + elif child_dep_label == right_dependency_label: + right_deps.append(child) + for left_dep in left_deps: + for right_dep in right_deps: + yield (left_dep, head, right_dep) + + +def show_triple(tokens, text, triple): + """Prints the given triple (left, head, right). For left and right, + the entire phrase headed by each token is shown. For head, only + the head token itself is shown. + + """ + nsubj, verb, dobj = triple + + # Extract the text for each element of the triple. + nsubj_text = phrase_text_for_head(tokens, text, nsubj) + verb_text = tokens[verb]['text']['content'] + dobj_text = phrase_text_for_head(tokens, text, dobj) + + # Pretty-print the triple. + left = textwrap.wrap(nsubj_text, width=28) + mid = textwrap.wrap(verb_text, width=10) + right = textwrap.wrap(dobj_text, width=28) + print('+' + 30 * '-' + '+' + 12 * '-' + '+' + 30 * '-' + '+') + for l, m, r in zip(left, mid, right): + print('| {:<28s} | {:<10s} | {:<28s} |'.format( + l or '', m or '', r or '')) + + +def main(text_file): + # Extracts subject-verb-object triples from the given text file, + # and print each one. + + # Read the input file. + text = open(text_file, 'rb').read().decode('utf8') + + analysis = analyze_syntax(text) + tokens = analysis.get('tokens', []) + + for triple in find_triples(tokens): + show_triple(tokens, text, triple) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + 'text_file', + help='A file containing the document to process. ' + 'Should be encoded in UTF8 or ASCII') + args = parser.parse_args() + main(args.text_file) diff --git a/language/syntax_triples/main_test.py b/language/syntax_triples/main_test.py new file mode 100755 index 000000000000..62c2915da02e --- /dev/null +++ b/language/syntax_triples/main_test.py @@ -0,0 +1,50 @@ +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import main + + +def test_dependents(): + text = "I am eating a delicious banana" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + assert [0, 1, 5] == main.dependents(tokens, 2) + assert [3, 4] == main.dependents(tokens, 5) + + +def test_phrase_text_for_head(): + text = "A small collection of words" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + assert "words" == main.phrase_text_for_head(tokens, text, 4) + + +def test_find_triples(): + text = "President Obama won the noble prize" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + triples = main.find_triples(tokens) + for triple in triples: + assert (1, 2, 5) == triple + + +def test_obama_example(resource, capsys): + main.main(resource('obama_wikipedia.txt')) + stdout, _ = capsys.readouterr() + lines = stdout.split('\n') + assert re.match( + r'.*Obama\b.*\| received\b.*\| national attention\b', + lines[1]) diff --git a/language/syntax_triples/requirements.txt b/language/syntax_triples/requirements.txt new file mode 100644 index 000000000000..0b96c82ee4c2 --- /dev/null +++ b/language/syntax_triples/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/syntax_triples/resources/obama_wikipedia.txt b/language/syntax_triples/resources/obama_wikipedia.txt new file mode 100644 index 000000000000..1e89d4ab0818 --- /dev/null +++ b/language/syntax_triples/resources/obama_wikipedia.txt @@ -0,0 +1 @@ +In 2004, Obama received national attention during his campaign to represent Illinois in the United States Senate with his victory in the March Democratic Party primary, his keynote address at the Democratic National Convention in July, and his election to the Senate in November. He began his presidential campaign in 2007 and, after a close primary campaign against Hillary Clinton in 2008, he won sufficient delegates in the Democratic Party primaries to receive the presidential nomination. He then defeated Republican nominee John McCain in the general election, and was inaugurated as president on January 20, 2009. Nine months after his inauguration, Obama was named the 2009 Nobel Peace Prize laureate. 
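The syntax_triples sample above only needs the `text`, `partOfSpeech`, and
`dependencyEdge` fields of each token, so its helpers can be exercised without
calling the API at all. The following is a minimal sketch (an illustration, not
part of the sample itself): it feeds `find_triples` and `phrase_text_for_head`
a hand-built token list containing just the fields that `main.py` reads. The
sentence, offsets, and labels are made up for illustration, and the snippet
assumes it is run from the `language/syntax_triples` directory so that
`import main` resolves.

```
import main

text = 'Obama defeated McCain'
# Three tokens, shaped like the pieces of the annotateText response that
# main.py consumes: 'defeated' is the verb, 'Obama' its NSUBJ, 'McCain' its DOBJ.
tokens = [
    {'text': {'content': 'Obama', 'beginOffset': 0},
     'partOfSpeech': {'tag': 'NOUN'},
     'dependencyEdge': {'headTokenIndex': 1, 'label': 'NSUBJ'}},
    {'text': {'content': 'defeated', 'beginOffset': 6},
     'partOfSpeech': {'tag': 'VERB'},
     'dependencyEdge': {'headTokenIndex': 1, 'label': 'ROOT'}},
    {'text': {'content': 'McCain', 'beginOffset': 15},
     'partOfSpeech': {'tag': 'NOUN'},
     'dependencyEdge': {'headTokenIndex': 1, 'label': 'DOBJ'}},
]

for nsubj, verb, dobj in main.find_triples(tokens):
    # Expand the subject and object to their full phrases and print the triple.
    print('({}, {}, {})'.format(
        main.phrase_text_for_head(tokens, text, nsubj),
        tokens[verb]['text']['content'],
        main.phrase_text_for_head(tokens, text, dobj)))
# Prints: (Obama, defeated, McCain)
```

In the sample itself the same token list is produced by `analyze_syntax`,
which calls the live API on the input file.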
From aff93ad794f89e1bf14b095a5ac372271ae2c24c Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 18:48:17 -0700 Subject: [PATCH 02/36] fixed variable name error --- language/movie_nl/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 380f495f9673..9281e1d6bee9 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -164,8 +164,8 @@ def get_sentiment_entities(service, document): positive_sentiments = [ polarity for polarity, magnitude in sentiments if polarity > 0.0] - negative = sum(negative_sentiment) - positive = sum(positive_sentiment) + negative = sum(negative_sentiments) + positive = sum(positive_sentiments) total = positive + negative return (total, entities) From 03cb67be40be347998e5b199ebbe6fc1f4886d7c Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:06:34 -0700 Subject: [PATCH 03/36] logged error message with exception Change-Id: I8ff59d08f2ae8ce4cb3dd0b57b3548db3d5b8add --- language/movie_nl/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 9281e1d6bee9..066818f4de61 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -208,8 +208,8 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities[ent] = (ent_sent, frequency) - except Exception: - logging.info('Skipping {}'.format(document.doc_id)) + except Exception as e: + logging.exception('Skipping {}'.format(document.doc_id)) for entity, e_tuple in collected_entities.items(): entity_writer.write(to_entity_json(entity, e_tuple)) From e2cc3d548932877d7469ad3a6f0c16eb9e4344a8 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:10:11 -0700 Subject: [PATCH 04/36] fixed variable unused error Change-Id: I1367429c3ceaefd0b0d36d9d7cad6ca29cbfdd2d --- language/movie_nl/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 066818f4de61..d1142b4f1ea0 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -208,7 +208,7 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities[ent] = (ent_sent, frequency) - except Exception as e: + except Exception: logging.exception('Skipping {}'.format(document.doc_id)) for entity, e_tuple in collected_entities.items(): From 17896774eaf49d2ce926dd2a3f9b16c01f215c74 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Tue, 19 Jul 2016 19:20:05 -0700 Subject: [PATCH 05/36] Refactor for clarity. Change-Id: Iecc1327db4aa21eb5ee61ce58bba3d142e600734 --- language/movie_nl/README.md | 6 +- language/movie_nl/main.py | 127 ++++++++++++++++-------------------- 2 files changed, 58 insertions(+), 75 deletions(-) diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md index b651dee8bb73..dab79d20c6d1 100644 --- a/language/movie_nl/README.md +++ b/language/movie_nl/README.md @@ -110,7 +110,7 @@ $ pip install -r requirements.txt ### How to Run ``` -$ python main.py --inp "tokens/*/*" \ +$ python main.py analyze --inp "tokens/*/*" \ --sout sentiment.json \ --eout entity.json \ --sample 5 @@ -145,9 +145,7 @@ In order to sort and rank the entities generated, use the same `main.py` script. 
this will print the top 5 actors with negative sentiment: ``` -$ python main.py --inp entity.json \ - --sout sentiment.json \ - --eout entity.json \ +$ python main.py rank entity.json \ --sentiment neg \ --reverse True \ --sample 5 diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index d1142b4f1ea0..9d6380738c46 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -186,30 +186,26 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities = {} for document in reader: - try: - sentiment_total, entities = get_sentiment_entities( - service, document) - document.label = get_sentiment_label(sentiment_total) - - sentiment_writer.write( - to_sentiment_json( - document.doc_id, - sentiment_total, - document.label - ) + sentiment_total, entities = get_sentiment_entities( + service, document) + document.label = get_sentiment_label(sentiment_total) + + sentiment_writer.write( + to_sentiment_json( + document.doc_id, + sentiment_total, + document.label ) + ) - sentiment_writer.write('\n') + sentiment_writer.write('\n') - for ent in entities: - ent_sent, frequency = collected_entities.get(ent, (0, 0)) - ent_sent += sentiment_total - frequency += 1 + for ent in entities: + ent_sent, frequency = collected_entities.get(ent, (0, 0)) + ent_sent += sentiment_total + frequency += 1 - collected_entities[ent] = (ent_sent, frequency) - - except Exception: - logging.exception('Skipping {}'.format(document.doc_id)) + collected_entities[ent] = (ent_sent, frequency) for entity, e_tuple in collected_entities.items(): entity_writer.write(to_entity_json(entity, e_tuple)) @@ -231,7 +227,7 @@ def document_generator(dir_path_pattern, count=None): try: text = f.read() except UnicodeDecodeError: - text = None + continue yield Document(text, doc_id, item) @@ -277,70 +273,59 @@ def get_service(): return discovery.build('language', 'v1beta1', http=http) -def main(input_dir, sent_out, ent_out, sample, log_file, - operation, sentiment, ent_in, reverse_bool): +def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): """Movie demo main program""" - sample = int(sample) if sample else None - - if operation == 'rank': - with open(ent_in) as reader: - rank_entities(reader, sentiment, sample, reverse_bool) - else: - # Create logger settings - logging.basicConfig(filename=log_file, level=logging.DEBUG) - - # Create a Google Service object - service = get_service() - - # Create a sentiment output writer - sentiment_writer = open(sent_out, 'w') + # Create logger settings + logging.basicConfig(filename=log_file, level=logging.DEBUG) - # Create an entity output writer - entity_writer = open(ent_out, 'w') + # Create a Google Service object + service = get_service() - reader = document_generator(input_dir, sample) + reader = document_generator(input_dir, sample) - # Process the movie documents - process_movie_reviews(service, reader, sentiment_writer, entity_writer) + # Process the movie documents + process_movie_reviews(service, reader, sentiment_writer, entity_writer) - # close reader and writers - sentiment_writer.close() - entity_writer.close() - reader.close() + # close reader and writers + sentiment_writer.close() + entity_writer.close() + reader.close() if __name__ == '__main__': parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('--inp', help='location of the input', required=True) - parser.add_argument( - '--sout', help='location of the sentiment output', 
required=True) - parser.add_argument( - '--eout', help='location of the entity output', required=True) - parser.add_argument('--sample', help='number of top items to process') - parser.add_argument( - '--op', - help='operation to perform "rank" or "analyze"', - default='analyze') - parser.add_argument( + + subparsers = parser.add_subparsers(dest='command') + + rank_parser = subparsers.add_parser('rank') + + rank_parser.add_argument( + 'entity_input', help='location of entity input', type=argparse.FileType('r')) + rank_parser.add_argument( '--sentiment', help='filter sentiment as "neg" or "pos"') - parser.add_argument( - '--ein', help='location of entity input') - parser.add_argument( + rank_parser.add_argument( '--reverse', help='reverse the order of the items') + rank_parser.add_argument('--sample', help='number of top items to process') + + analyze_parser = subparsers.add_parser('analyze') + + analyze_parser.add_argument( + '--inp', help='location of the input', required=True) + analyze_parser.add_argument( + '--sout', help='location of the sentiment output', required=True, + type=argparse.FileType('w')) + analyze_parser.add_argument( + '--eout', help='location of the entity output', required=True, + type=argparse.FileType('w')) + analyze_parser.add_argument('--sample', help='number of top items to process') + analyze_parser.add_argument('--log_file', default='movie.log') args = parser.parse_args() - log_file = 'movie.log' - - main(args.inp, - args.sout, - args.eout, - args.sample, - log_file, - args.op, - args.sentiment, - args.ein, - args.reverse) + if args.command == 'analyze': + analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) + elif args.command == 'rank': + rank_entities(args.entity_input, args.sentiment, args.sample, args.reverse) From 1242524956ffbcede1e824c3b7e2ef2971f0d223 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:47:11 -0700 Subject: [PATCH 06/36] cast samples to int Change-Id: I33a6c3259dc5a9b0cc1ad081cd57acc452fc6b1a --- language/movie_nl/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 9d6380738c46..bebdd59c3fdd 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -326,6 +326,6 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): args = parser.parse_args() if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) + analyze(args.inp, args.sout, args.eout, int(args.sample), args.log_file) elif args.command == 'rank': - rank_entities(args.entity_input, args.sentiment, args.sample, args.reverse) + rank_entities(args.entity_input, args.sentiment, int(args.sample), args.reverse) From 3e4deb12216fd61f7c1acebd4e2d7d167f2ab38d Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:52:20 -0700 Subject: [PATCH 07/36] added sample variable Change-Id: Ic2b721f775601d2dfb718be26d425e7c2e91eb8a --- language/movie_nl/main.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index bebdd59c3fdd..36f36109e272 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -325,7 +325,12 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): args = parser.parse_args() + sample = args.sample + + if args.sample is not None: + sample = int(args.sample) + if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, int(args.sample), 
args.log_file) + analyze(args.inp, args.sout, args.eout, sample, args.log_file) elif args.command == 'rank': - rank_entities(args.entity_input, args.sentiment, int(args.sample), args.reverse) + rank_entities(args.entity_input, args.sentiment, sample, args.reverse) From 4ffc546478f78db358c3981781a9551931946db5 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 20:09:12 -0700 Subject: [PATCH 08/36] fixed indentation bug Change-Id: I2eac2d41e3f8d96c1d049dbbc7b48c96d5fded0a --- language/movie_nl/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 36f36109e272..9238be175698 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -105,15 +105,15 @@ def extract_all_sentences(self, service): sentences = response.get('sentences', []) sent_list = [ - sentence.get('text').get('content')for sentence in sentences + sentence.get('text').get('content') for sentence in sentences ] - for entity in entities: - ent_type = entity.get('type') - wiki_url = entity.get('metadata', {}).get('wikipedia_url') + for entity in entities: + ent_type = entity.get('type') + wiki_url = entity.get('metadata', {}).get('wikipedia_url') - if ent_type == 'PERSON' and wiki_url is not None: - ent_list.append(wiki_url) + if ent_type == 'PERSON' and wiki_url is not None: + ent_list.append(wiki_url) self.sentent_pair = (sent_list, ent_list) From 7ca9918f84f8e71934b24bfe9385563ee8266f17 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Tue, 19 Jul 2016 20:07:40 -0700 Subject: [PATCH 09/36] Fix lint errors Change-Id: I026963be93ca3ef776420807e14abdcfcc5fe95f --- language/movie_nl/main.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 9238be175698..b7b342af9d56 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -303,12 +303,14 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): rank_parser = subparsers.add_parser('rank') rank_parser.add_argument( - 'entity_input', help='location of entity input', type=argparse.FileType('r')) + 'entity_input', help='location of entity input', + type=argparse.FileType('r')) rank_parser.add_argument( '--sentiment', help='filter sentiment as "neg" or "pos"') rank_parser.add_argument( '--reverse', help='reverse the order of the items') - rank_parser.add_argument('--sample', help='number of top items to process') + rank_parser.add_argument( + '--sample', help='number of top items to process', type=int) analyze_parser = subparsers.add_parser('analyze') @@ -320,17 +322,14 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): analyze_parser.add_argument( '--eout', help='location of the entity output', required=True, type=argparse.FileType('w')) - analyze_parser.add_argument('--sample', help='number of top items to process') + analyze_parser.add_argument( + '--sample', help='number of top items to process', type=int) analyze_parser.add_argument('--log_file', default='movie.log') args = parser.parse_args() - sample = args.sample - - if args.sample is not None: - sample = int(args.sample) - if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, sample, args.log_file) + analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) elif args.command == 'rank': - rank_entities(args.entity_input, args.sentiment, sample, args.reverse) + rank_entities( + args.entity_input, args.sentiment, args.sample, 
args.reverse) From 0be30eb388c4f21f667ec5d6e2e3fb5652c433f8 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Tue, 19 Jul 2016 20:13:53 -0700 Subject: [PATCH 10/36] Remove movie_nl sample until it's more stable. Change-Id: Iab58378f40c760e88f2b3cc0806985894ce77e76 --- language/README.md | 3 - language/movie_nl/README.md | 152 ------------- language/movie_nl/main.py | 335 ----------------------------- language/movie_nl/main_test.py | 82 ------- language/movie_nl/requirements.txt | 3 - 5 files changed, 575 deletions(-) delete mode 100644 language/movie_nl/README.md delete mode 100644 language/movie_nl/main.py delete mode 100644 language/movie_nl/main_test.py delete mode 100644 language/movie_nl/requirements.txt diff --git a/language/README.md b/language/README.md index e63d45eb9a6a..130ce66ff83e 100644 --- a/language/README.md +++ b/language/README.md @@ -5,9 +5,6 @@ This directory contains Python examples that use the - [api](api) has a simple command line tool that shows off the API's features. -- [movie_nl](movie_nl) combines sentiment and entity analysis to come up with -actors/directors who are the most and least popular in the imdb movie reviews. - - [ocr_nl](ocr_nl) uses the [Cloud Vision API](https://cloud.google.com/vision/) to extract text from images, then uses the NL API to extract entity information from those texts, and stores the extracted information in a database in support diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md deleted file mode 100644 index dab79d20c6d1..000000000000 --- a/language/movie_nl/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Introduction -This sample is an application of the Google Cloud Platform Natural Language API. -It uses the [imdb movie reviews data set](https://www.cs.cornell.edu/people/pabo/movie-review-data/) -from [Cornell University](http://www.cs.cornell.edu/) and performs sentiment & entity -analysis on it. It combines the capabilities of sentiment analysis and entity recognition -to come up with actors/directors who are the most and least popular. - -### Set Up to Authenticate With Your Project's Credentials - -Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) -steps in the Quickstart doc to create a project and enable the -Cloud Natural Language API. Following those steps, make sure that you -[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), -and export the following environment variable: - -``` -export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json -``` - -**Note:** If you get an error saying your API hasn't been enabled, make sure -that you have correctly set this environment variable, and that the project that -you got the service account from has the Natural Language API enabled. - -## How it works -This sample uses the Natural Language API to annotate the input text. The -movie review document is broken into sentences using the `extract_syntax` feature. -Each sentence is sent to the API for sentiment analysis. The positive and negative -sentiment values are combined to come up with a single overall sentiment of the -movie document. - -In addition to the sentiment, the program also extracts the entities of type -`PERSON`, who are the actors in the movie (including the director and anyone -important). These entities are assigned the sentiment value of the document to -come up with the most and least popular actors/directors. 
- -### Movie document -We define a movie document as a set of reviews. These reviews are individual -sentences and we use the NL API to extract the sentences from the document. See -an example movie document below. - -``` - Sample review sentence 1. Sample review sentence 2. Sample review sentence 3. -``` - -### Sentences and Sentiment -Each sentence from the above document is assigned a sentiment as below. - -``` - Sample review sentence 1 => Sentiment 1 - Sample review sentence 2 => Sentiment 2 - Sample review sentence 3 => Sentiment 3 -``` - -### Sentiment computation -The final sentiment is computed by simply adding the sentence sentiments. - -``` - Total Sentiment = Sentiment 1 + Sentiment 2 + Sentiment 3 -``` - - -### Entity extraction and Sentiment assignment -Entities with type `PERSON` are extracted from the movie document using the NL -API. Since these entities are mentioned in their respective movie document, -they are associated with the document sentiment. - -``` - Document 1 => Sentiment 1 - - Person 1 - Person 2 - Person 3 - - Document 2 => Sentiment 2 - - Person 2 - Person 4 - Person 5 -``` - -Based on the above data we can calculate the sentiment associated with Person 2: - -``` - Person 2 => (Sentiment 1 + Sentiment 2) -``` - -## Movie Data Set -We have used the Cornell Movie Review data as our input. Please follow the instructions below to download and extract the data. - -### Download Instructions - -``` - $ curl -O http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip - $ unzip mix20_rand700_tokens.zip -``` - -## Command Line Usage -In order to use the movie analyzer, follow the instructions below. (Note that the `--sample` parameter below runs the script on -fewer documents, and can be omitted to run it on the entire corpus) - -### Install Dependencies - -Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. - -Then, install dependencies by running the following pip command: - -``` -$ pip install -r requirements.txt -``` -### How to Run - -``` -$ python main.py analyze --inp "tokens/*/*" \ - --sout sentiment.json \ - --eout entity.json \ - --sample 5 -``` - -You should see the log file `movie.log` created. - -## Output Data -The program produces sentiment and entity output in json format. For example: - -### Sentiment Output -``` - { - "doc_id": "cv310_tok-16557.txt", - "sentiment": 3.099, - "label": -1 - } -``` - -### Entity Output - -``` - { - "name": "Sean Patrick Flanery", - "wiki_url": "http://en.wikipedia.org/wiki/Sean_Patrick_Flanery", - "sentiment": 3.099 - } -``` - -### Entity Output Sorting -In order to sort and rank the entities generated, use the same `main.py` script. For example, -this will print the top 5 actors with negative sentiment: - -``` -$ python main.py rank entity.json \ - --sentiment neg \ - --reverse True \ - --sample 5 -``` diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py deleted file mode 100644 index b7b342af9d56..000000000000 --- a/language/movie_nl/main.py +++ /dev/null @@ -1,335 +0,0 @@ -# Copyright 2016 Google, Inc -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import codecs -import glob -import json -import logging -import os - -from googleapiclient import discovery -import httplib2 -from oauth2client.client import GoogleCredentials -import requests - - -def analyze_document(service, document): - """Analyze the document and get the distribution of sentiments and - the movie name.""" - logging.info('Analyzing {}'.format(document.doc_id)) - - sentences, entities = document.extract_all_sentences(service) - sentiments = [get_sentiment(service, sentence) for sentence in sentences] - - return sentiments, entities - - -def get_request_body(text, syntax=True, entities=True, sentiment=True): - """Creates the body of the request to the language api in - order to get an appropriate api response.""" - body = { - 'document': { - 'type': 'PLAIN_TEXT', - 'content': text, - }, - 'features': { - 'extract_syntax': syntax, - 'extract_entities': entities, - 'extract_document_sentiment': sentiment, - }, - 'encoding_type': 'UTF32' - } - - return body - - -def get_sentiment(service, sentence): - """Get the sentence-level sentiment.""" - body = get_request_body( - sentence, syntax=False, entities=True, sentiment=True) - - docs = service.documents() - request = docs.annotateText(body=body) - response = request.execute() - sentiment = response.get("documentSentiment") - - if sentiment is None: - return (None, None) - else: - pol = sentiment.get("polarity") - mag = sentiment.get("magnitude") - - if pol is None and mag is not None: - pol = 0 - return (pol, mag) - - -class Document(object): - """Document class captures a single document of movie reviews.""" - - def __init__(self, text, doc_id, doc_path): - self.text = text - self.doc_id = doc_id - self.doc_path = doc_path - self.sentent_pair = None - self.label = None - - def extract_all_sentences(self, service): - """Extract the sentences in a document.""" - - if self.sentent_pair is None: - docs = service.documents() - request_body = get_request_body( - self.text, - syntax=True, - entities=True, - sentiment=False) - request = docs.annotateText(body=request_body) - - ent_list = [] - - response = request.execute() - entities = response.get('entities', []) - sentences = response.get('sentences', []) - - sent_list = [ - sentence.get('text').get('content') for sentence in sentences - ] - - for entity in entities: - ent_type = entity.get('type') - wiki_url = entity.get('metadata', {}).get('wikipedia_url') - - if ent_type == 'PERSON' and wiki_url is not None: - ent_list.append(wiki_url) - - self.sentent_pair = (sent_list, ent_list) - - return self.sentent_pair - - -def to_sentiment_json(doc_id, sent, label): - """Convert the sentiment info to json.""" - json_doc = {} - - json_doc['doc_id'] = doc_id - json_doc['sentiment'] = float('%.3f' % sent) - json_doc['label'] = label - - return json.dumps(json_doc) - - -def get_wiki_title(wiki_url): - """Get the wikipedia page title for a given wikipedia URL.""" - try: - content = requests.get(wiki_url).text - return content.split('title')[1].split('-')[0].split('>')[1].strip() - except: - return os.path.basename(wiki_url).replace('_', ' ') - - -def to_entity_json(entity, e_tuple): - """Convert the entity info to json.""" - json_doc = {} - - avg_sentiment = float(e_tuple[0]) / float(e_tuple[1]) - - json_doc['wiki_url'] = entity - json_doc['name'] = get_wiki_title(entity) - json_doc['sentiment'] = float('%.3f' % e_tuple[0]) - json_doc['avg_sentiment'] = float('%.3f' % 
avg_sentiment) - - return json.dumps(json_doc) - - -def get_sentiment_entities(service, document): - """Compute the overall sentiment volume in the document""" - sentiments, entities = analyze_document(service, document) - - sentiments = [sent for sent in sentiments if sent[0] is not None] - negative_sentiments = [ - polarity for polarity, magnitude in sentiments if polarity < 0.0] - positive_sentiments = [ - polarity for polarity, magnitude in sentiments if polarity > 0.0] - - negative = sum(negative_sentiments) - positive = sum(positive_sentiments) - total = positive + negative - - return (total, entities) - - -def get_sentiment_label(sentiment): - """Return the sentiment label based on the sentiment quantity.""" - if sentiment < 0: - return -1 - elif sentiment > 0: - return 1 - else: - return 0 - - -def process_movie_reviews(service, reader, sentiment_writer, entity_writer): - """Perform some sentiment math and come up with movie review.""" - collected_entities = {} - - for document in reader: - sentiment_total, entities = get_sentiment_entities( - service, document) - document.label = get_sentiment_label(sentiment_total) - - sentiment_writer.write( - to_sentiment_json( - document.doc_id, - sentiment_total, - document.label - ) - ) - - sentiment_writer.write('\n') - - for ent in entities: - ent_sent, frequency = collected_entities.get(ent, (0, 0)) - ent_sent += sentiment_total - frequency += 1 - - collected_entities[ent] = (ent_sent, frequency) - - for entity, e_tuple in collected_entities.items(): - entity_writer.write(to_entity_json(entity, e_tuple)) - entity_writer.write('\n') - - sentiment_writer.flush() - entity_writer.flush() - - -def document_generator(dir_path_pattern, count=None): - """Generator for the input movie documents.""" - for running_count, item in enumerate(glob.iglob(dir_path_pattern)): - if count and running_count >= count: - raise StopIteration() - - doc_id = os.path.basename(item) - - with codecs.open(item, encoding='utf-8') as f: - try: - text = f.read() - except UnicodeDecodeError: - continue - - yield Document(text, doc_id, item) - - -def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False): - """Rank the entities (actors) based on their sentiment - assigned from the movie.""" - - items = [] - for item in reader: - json_item = json.loads(item) - sent = json_item.get('sentiment') - entity_item = (sent, json_item) - - if sentiment: - if sentiment == 'pos' and sent > 0: - items.append(entity_item) - elif sentiment == 'neg' and sent < 0: - items.append(entity_item) - else: - items.append(entity_item) - - items.sort(reverse=True) - items = [json.dumps(item[1]) for item in items] - - if reverse_bool: - items.reverse() - - if topn: - print('\n'.join(items[:topn])) - else: - print('\n'.join(items)) - - -def get_service(): - """Build a client to the Google Cloud Natural Language API.""" - - credentials = GoogleCredentials.get_application_default() - scoped_credentials = credentials.create_scoped( - ['https://www.googleapis.com/auth/cloud-platform']) - http = httplib2.Http() - scoped_credentials.authorize(http) - return discovery.build('language', 'v1beta1', http=http) - - -def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): - """Movie demo main program""" - - # Create logger settings - logging.basicConfig(filename=log_file, level=logging.DEBUG) - - # Create a Google Service object - service = get_service() - - reader = document_generator(input_dir, sample) - - # Process the movie documents - process_movie_reviews(service, reader, 
sentiment_writer, entity_writer) - - # close reader and writers - sentiment_writer.close() - entity_writer.close() - reader.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - subparsers = parser.add_subparsers(dest='command') - - rank_parser = subparsers.add_parser('rank') - - rank_parser.add_argument( - 'entity_input', help='location of entity input', - type=argparse.FileType('r')) - rank_parser.add_argument( - '--sentiment', help='filter sentiment as "neg" or "pos"') - rank_parser.add_argument( - '--reverse', help='reverse the order of the items') - rank_parser.add_argument( - '--sample', help='number of top items to process', type=int) - - analyze_parser = subparsers.add_parser('analyze') - - analyze_parser.add_argument( - '--inp', help='location of the input', required=True) - analyze_parser.add_argument( - '--sout', help='location of the sentiment output', required=True, - type=argparse.FileType('w')) - analyze_parser.add_argument( - '--eout', help='location of the entity output', required=True, - type=argparse.FileType('w')) - analyze_parser.add_argument( - '--sample', help='number of top items to process', type=int) - analyze_parser.add_argument('--log_file', default='movie.log') - - args = parser.parse_args() - - if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) - elif args.command == 'rank': - rank_entities( - args.entity_input, args.sentiment, args.sample, args.reverse) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py deleted file mode 100644 index 96907908018d..000000000000 --- a/language/movie_nl/main_test.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2016 Google, Inc -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import io -import json - -import main - - -def test_get_request_body(): - text = 'hello world' - body = main.get_request_body(text, syntax=True, entities=True, - sentiment=False) - assert body.get('document').get('content') == text - - assert body.get('features').get('extract_syntax') is True - assert body.get('features').get('extract_entities') is True - assert body.get('features').get('extract_document_sentiment') is False - - -def test_get_sentiment_label(): - assert main.get_sentiment_label(20.50) == 1 - assert main.get_sentiment_label(-42.34) == -1 - - -def test_to_sentiment_json(): - doc_id = '12345' - sentiment = 23.344564 - label = 1 - - sentiment_json = json.loads( - main.to_sentiment_json(doc_id, sentiment, label) - ) - - assert sentiment_json.get('doc_id') == doc_id - assert sentiment_json.get('sentiment') == 23.345 - assert sentiment_json.get('label') == label - - -def test_process_movie_reviews(): - service = main.get_service() - - doc1 = main.Document('Top Gun was awesome and Tom Cruise rocked!', 'doc1', - 'doc1') - doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') - - reader = [doc1, doc2] - swriter = io.StringIO() - ewriter = io.StringIO() - - main.process_movie_reviews(service, reader, swriter, ewriter) - - sentiments = swriter.getvalue().strip().split('\n') - entities = ewriter.getvalue().strip().split('\n') - - sentiments = [json.loads(sentiment) for sentiment in sentiments] - entities = [json.loads(entity) for entity in entities] - - # assert sentiments - assert sentiments[0].get('sentiment') == 1.0 - assert sentiments[0].get('label') == 1 - - assert sentiments[1].get('sentiment') == 1.0 - assert sentiments[1].get('label') == 1 - - # assert entities - assert len(entities) == 1 - assert entities[0].get('name') == 'Tom Cruise' - assert (entities[0].get('wiki_url') == - 'http://en.wikipedia.org/wiki/Tom_Cruise') - assert entities[0].get('sentiment') == 2.0 diff --git a/language/movie_nl/requirements.txt b/language/movie_nl/requirements.txt deleted file mode 100644 index 391be2e98434..000000000000 --- a/language/movie_nl/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -urlparse2==1.1.1 -google-api-python-client==1.5.1 -requests==2.10.0 From 47c7c638f536ee18cf2d9dd458faacdd953cfcb4 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Tue, 19 Jul 2016 20:14:55 -0700 Subject: [PATCH 11/36] Revert "Remove movie_nl sample until it's more stable." This reverts commit 0be30eb388c4f21f667ec5d6e2e3fb5652c433f8. --- language/README.md | 3 + language/movie_nl/README.md | 152 +++++++++++++ language/movie_nl/main.py | 335 +++++++++++++++++++++++++++++ language/movie_nl/main_test.py | 82 +++++++ language/movie_nl/requirements.txt | 3 + 5 files changed, 575 insertions(+) create mode 100644 language/movie_nl/README.md create mode 100644 language/movie_nl/main.py create mode 100644 language/movie_nl/main_test.py create mode 100644 language/movie_nl/requirements.txt diff --git a/language/README.md b/language/README.md index 130ce66ff83e..e63d45eb9a6a 100644 --- a/language/README.md +++ b/language/README.md @@ -5,6 +5,9 @@ This directory contains Python examples that use the - [api](api) has a simple command line tool that shows off the API's features. +- [movie_nl](movie_nl) combines sentiment and entity analysis to come up with +actors/directors who are the most and least popular in the imdb movie reviews. 
+ - [ocr_nl](ocr_nl) uses the [Cloud Vision API](https://cloud.google.com/vision/) to extract text from images, then uses the NL API to extract entity information from those texts, and stores the extracted information in a database in support diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md new file mode 100644 index 000000000000..dab79d20c6d1 --- /dev/null +++ b/language/movie_nl/README.md @@ -0,0 +1,152 @@ +# Introduction +This sample is an application of the Google Cloud Platform Natural Language API. +It uses the [imdb movie reviews data set](https://www.cs.cornell.edu/people/pabo/movie-review-data/) +from [Cornell University](http://www.cs.cornell.edu/) and performs sentiment & entity +analysis on it. It combines the capabilities of sentiment analysis and entity recognition +to come up with actors/directors who are the most and least popular. + +### Set Up to Authenticate With Your Project's Credentials + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +**Note:** If you get an error saying your API hasn't been enabled, make sure +that you have correctly set this environment variable, and that the project that +you got the service account from has the Natural Language API enabled. + +## How it works +This sample uses the Natural Language API to annotate the input text. The +movie review document is broken into sentences using the `extract_syntax` feature. +Each sentence is sent to the API for sentiment analysis. The positive and negative +sentiment values are combined to come up with a single overall sentiment of the +movie document. + +In addition to the sentiment, the program also extracts the entities of type +`PERSON`, who are the actors in the movie (including the director and anyone +important). These entities are assigned the sentiment value of the document to +come up with the most and least popular actors/directors. + +### Movie document +We define a movie document as a set of reviews. These reviews are individual +sentences and we use the NL API to extract the sentences from the document. See +an example movie document below. + +``` + Sample review sentence 1. Sample review sentence 2. Sample review sentence 3. +``` + +### Sentences and Sentiment +Each sentence from the above document is assigned a sentiment as below. + +``` + Sample review sentence 1 => Sentiment 1 + Sample review sentence 2 => Sentiment 2 + Sample review sentence 3 => Sentiment 3 +``` + +### Sentiment computation +The final sentiment is computed by simply adding the sentence sentiments. + +``` + Total Sentiment = Sentiment 1 + Sentiment 2 + Sentiment 3 +``` + + +### Entity extraction and Sentiment assignment +Entities with type `PERSON` are extracted from the movie document using the NL +API. Since these entities are mentioned in their respective movie document, +they are associated with the document sentiment. 
+ +``` + Document 1 => Sentiment 1 + + Person 1 + Person 2 + Person 3 + + Document 2 => Sentiment 2 + + Person 2 + Person 4 + Person 5 +``` + +Based on the above data we can calculate the sentiment associated with Person 2: + +``` + Person 2 => (Sentiment 1 + Sentiment 2) +``` + +## Movie Data Set +We have used the Cornell Movie Review data as our input. Please follow the instructions below to download and extract the data. + +### Download Instructions + +``` + $ curl -O http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip + $ unzip mix20_rand700_tokens.zip +``` + +## Command Line Usage +In order to use the movie analyzer, follow the instructions below. (Note that the `--sample` parameter below runs the script on +fewer documents, and can be omitted to run it on the entire corpus) + +### Install Dependencies + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +Then, install dependencies by running the following pip command: + +``` +$ pip install -r requirements.txt +``` +### How to Run + +``` +$ python main.py analyze --inp "tokens/*/*" \ + --sout sentiment.json \ + --eout entity.json \ + --sample 5 +``` + +You should see the log file `movie.log` created. + +## Output Data +The program produces sentiment and entity output in json format. For example: + +### Sentiment Output +``` + { + "doc_id": "cv310_tok-16557.txt", + "sentiment": 3.099, + "label": -1 + } +``` + +### Entity Output + +``` + { + "name": "Sean Patrick Flanery", + "wiki_url": "http://en.wikipedia.org/wiki/Sean_Patrick_Flanery", + "sentiment": 3.099 + } +``` + +### Entity Output Sorting +In order to sort and rank the entities generated, use the same `main.py` script. For example, +this will print the top 5 actors with negative sentiment: + +``` +$ python main.py rank entity.json \ + --sentiment neg \ + --reverse True \ + --sample 5 +``` diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py new file mode 100644 index 000000000000..b7b342af9d56 --- /dev/null +++ b/language/movie_nl/main.py @@ -0,0 +1,335 @@ +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import codecs +import glob +import json +import logging +import os + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials +import requests + + +def analyze_document(service, document): + """Analyze the document and get the distribution of sentiments and + the movie name.""" + logging.info('Analyzing {}'.format(document.doc_id)) + + sentences, entities = document.extract_all_sentences(service) + sentiments = [get_sentiment(service, sentence) for sentence in sentences] + + return sentiments, entities + + +def get_request_body(text, syntax=True, entities=True, sentiment=True): + """Creates the body of the request to the language api in + order to get an appropriate api response.""" + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': syntax, + 'extract_entities': entities, + 'extract_document_sentiment': sentiment, + }, + 'encoding_type': 'UTF32' + } + + return body + + +def get_sentiment(service, sentence): + """Get the sentence-level sentiment.""" + body = get_request_body( + sentence, syntax=False, entities=True, sentiment=True) + + docs = service.documents() + request = docs.annotateText(body=body) + response = request.execute() + sentiment = response.get("documentSentiment") + + if sentiment is None: + return (None, None) + else: + pol = sentiment.get("polarity") + mag = sentiment.get("magnitude") + + if pol is None and mag is not None: + pol = 0 + return (pol, mag) + + +class Document(object): + """Document class captures a single document of movie reviews.""" + + def __init__(self, text, doc_id, doc_path): + self.text = text + self.doc_id = doc_id + self.doc_path = doc_path + self.sentent_pair = None + self.label = None + + def extract_all_sentences(self, service): + """Extract the sentences in a document.""" + + if self.sentent_pair is None: + docs = service.documents() + request_body = get_request_body( + self.text, + syntax=True, + entities=True, + sentiment=False) + request = docs.annotateText(body=request_body) + + ent_list = [] + + response = request.execute() + entities = response.get('entities', []) + sentences = response.get('sentences', []) + + sent_list = [ + sentence.get('text').get('content') for sentence in sentences + ] + + for entity in entities: + ent_type = entity.get('type') + wiki_url = entity.get('metadata', {}).get('wikipedia_url') + + if ent_type == 'PERSON' and wiki_url is not None: + ent_list.append(wiki_url) + + self.sentent_pair = (sent_list, ent_list) + + return self.sentent_pair + + +def to_sentiment_json(doc_id, sent, label): + """Convert the sentiment info to json.""" + json_doc = {} + + json_doc['doc_id'] = doc_id + json_doc['sentiment'] = float('%.3f' % sent) + json_doc['label'] = label + + return json.dumps(json_doc) + + +def get_wiki_title(wiki_url): + """Get the wikipedia page title for a given wikipedia URL.""" + try: + content = requests.get(wiki_url).text + return content.split('title')[1].split('-')[0].split('>')[1].strip() + except: + return os.path.basename(wiki_url).replace('_', ' ') + + +def to_entity_json(entity, e_tuple): + """Convert the entity info to json.""" + json_doc = {} + + avg_sentiment = float(e_tuple[0]) / float(e_tuple[1]) + + json_doc['wiki_url'] = entity + json_doc['name'] = get_wiki_title(entity) + json_doc['sentiment'] = float('%.3f' % e_tuple[0]) + json_doc['avg_sentiment'] = float('%.3f' % avg_sentiment) + + return json.dumps(json_doc) + + +def get_sentiment_entities(service, document): + 
"""Compute the overall sentiment volume in the document""" + sentiments, entities = analyze_document(service, document) + + sentiments = [sent for sent in sentiments if sent[0] is not None] + negative_sentiments = [ + polarity for polarity, magnitude in sentiments if polarity < 0.0] + positive_sentiments = [ + polarity for polarity, magnitude in sentiments if polarity > 0.0] + + negative = sum(negative_sentiments) + positive = sum(positive_sentiments) + total = positive + negative + + return (total, entities) + + +def get_sentiment_label(sentiment): + """Return the sentiment label based on the sentiment quantity.""" + if sentiment < 0: + return -1 + elif sentiment > 0: + return 1 + else: + return 0 + + +def process_movie_reviews(service, reader, sentiment_writer, entity_writer): + """Perform some sentiment math and come up with movie review.""" + collected_entities = {} + + for document in reader: + sentiment_total, entities = get_sentiment_entities( + service, document) + document.label = get_sentiment_label(sentiment_total) + + sentiment_writer.write( + to_sentiment_json( + document.doc_id, + sentiment_total, + document.label + ) + ) + + sentiment_writer.write('\n') + + for ent in entities: + ent_sent, frequency = collected_entities.get(ent, (0, 0)) + ent_sent += sentiment_total + frequency += 1 + + collected_entities[ent] = (ent_sent, frequency) + + for entity, e_tuple in collected_entities.items(): + entity_writer.write(to_entity_json(entity, e_tuple)) + entity_writer.write('\n') + + sentiment_writer.flush() + entity_writer.flush() + + +def document_generator(dir_path_pattern, count=None): + """Generator for the input movie documents.""" + for running_count, item in enumerate(glob.iglob(dir_path_pattern)): + if count and running_count >= count: + raise StopIteration() + + doc_id = os.path.basename(item) + + with codecs.open(item, encoding='utf-8') as f: + try: + text = f.read() + except UnicodeDecodeError: + continue + + yield Document(text, doc_id, item) + + +def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False): + """Rank the entities (actors) based on their sentiment + assigned from the movie.""" + + items = [] + for item in reader: + json_item = json.loads(item) + sent = json_item.get('sentiment') + entity_item = (sent, json_item) + + if sentiment: + if sentiment == 'pos' and sent > 0: + items.append(entity_item) + elif sentiment == 'neg' and sent < 0: + items.append(entity_item) + else: + items.append(entity_item) + + items.sort(reverse=True) + items = [json.dumps(item[1]) for item in items] + + if reverse_bool: + items.reverse() + + if topn: + print('\n'.join(items[:topn])) + else: + print('\n'.join(items)) + + +def get_service(): + """Build a client to the Google Cloud Natural Language API.""" + + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + return discovery.build('language', 'v1beta1', http=http) + + +def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): + """Movie demo main program""" + + # Create logger settings + logging.basicConfig(filename=log_file, level=logging.DEBUG) + + # Create a Google Service object + service = get_service() + + reader = document_generator(input_dir, sample) + + # Process the movie documents + process_movie_reviews(service, reader, sentiment_writer, entity_writer) + + # close reader and writers + sentiment_writer.close() + 
entity_writer.close() + reader.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + + subparsers = parser.add_subparsers(dest='command') + + rank_parser = subparsers.add_parser('rank') + + rank_parser.add_argument( + 'entity_input', help='location of entity input', + type=argparse.FileType('r')) + rank_parser.add_argument( + '--sentiment', help='filter sentiment as "neg" or "pos"') + rank_parser.add_argument( + '--reverse', help='reverse the order of the items') + rank_parser.add_argument( + '--sample', help='number of top items to process', type=int) + + analyze_parser = subparsers.add_parser('analyze') + + analyze_parser.add_argument( + '--inp', help='location of the input', required=True) + analyze_parser.add_argument( + '--sout', help='location of the sentiment output', required=True, + type=argparse.FileType('w')) + analyze_parser.add_argument( + '--eout', help='location of the entity output', required=True, + type=argparse.FileType('w')) + analyze_parser.add_argument( + '--sample', help='number of top items to process', type=int) + analyze_parser.add_argument('--log_file', default='movie.log') + + args = parser.parse_args() + + if args.command == 'analyze': + analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) + elif args.command == 'rank': + rank_entities( + args.entity_input, args.sentiment, args.sample, args.reverse) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py new file mode 100644 index 000000000000..96907908018d --- /dev/null +++ b/language/movie_nl/main_test.py @@ -0,0 +1,82 @@ +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
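
# Unit tests for main.py. Note that test_process_movie_reviews builds a live
# client via main.get_service(), so running this suite requires working
# application-default credentials and network access.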
+ +import io +import json + +import main + + +def test_get_request_body(): + text = 'hello world' + body = main.get_request_body(text, syntax=True, entities=True, + sentiment=False) + assert body.get('document').get('content') == text + + assert body.get('features').get('extract_syntax') is True + assert body.get('features').get('extract_entities') is True + assert body.get('features').get('extract_document_sentiment') is False + + +def test_get_sentiment_label(): + assert main.get_sentiment_label(20.50) == 1 + assert main.get_sentiment_label(-42.34) == -1 + + +def test_to_sentiment_json(): + doc_id = '12345' + sentiment = 23.344564 + label = 1 + + sentiment_json = json.loads( + main.to_sentiment_json(doc_id, sentiment, label) + ) + + assert sentiment_json.get('doc_id') == doc_id + assert sentiment_json.get('sentiment') == 23.345 + assert sentiment_json.get('label') == label + + +def test_process_movie_reviews(): + service = main.get_service() + + doc1 = main.Document('Top Gun was awesome and Tom Cruise rocked!', 'doc1', + 'doc1') + doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') + + reader = [doc1, doc2] + swriter = io.StringIO() + ewriter = io.StringIO() + + main.process_movie_reviews(service, reader, swriter, ewriter) + + sentiments = swriter.getvalue().strip().split('\n') + entities = ewriter.getvalue().strip().split('\n') + + sentiments = [json.loads(sentiment) for sentiment in sentiments] + entities = [json.loads(entity) for entity in entities] + + # assert sentiments + assert sentiments[0].get('sentiment') == 1.0 + assert sentiments[0].get('label') == 1 + + assert sentiments[1].get('sentiment') == 1.0 + assert sentiments[1].get('label') == 1 + + # assert entities + assert len(entities) == 1 + assert entities[0].get('name') == 'Tom Cruise' + assert (entities[0].get('wiki_url') == + 'http://en.wikipedia.org/wiki/Tom_Cruise') + assert entities[0].get('sentiment') == 2.0 diff --git a/language/movie_nl/requirements.txt b/language/movie_nl/requirements.txt new file mode 100644 index 000000000000..391be2e98434 --- /dev/null +++ b/language/movie_nl/requirements.txt @@ -0,0 +1,3 @@ +urlparse2==1.1.1 +google-api-python-client==1.5.1 +requests==2.10.0 From 0a8d04007441bfca7e833981ec5bf7c2b427a517 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 11:18:26 -0700 Subject: [PATCH 12/36] catch HttpError and log it Change-Id: Ib2f1a2b04c0d00cd62d178d92430379b64ea9781 --- language/movie_nl/main.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 9238be175698..8df497ad7ea5 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -20,6 +20,7 @@ import os from googleapiclient import discovery +from googleapiclient.errors import HttpError import httplib2 from oauth2client.client import GoogleCredentials import requests @@ -31,7 +32,11 @@ def analyze_document(service, document): logging.info('Analyzing {}'.format(document.doc_id)) sentences, entities = document.extract_all_sentences(service) - sentiments = [get_sentiment(service, sentence) for sentence in sentences] + + try: + sentiments = [get_sentiment(service, sentence) for sentence in sentences] + except HttpError as e: + raise e return sentiments, entities @@ -62,7 +67,12 @@ def get_sentiment(service, sentence): docs = service.documents() request = docs.annotateText(body=body) - response = request.execute() + + try: + response = request.execute() + except HttpError as e: + raise e + 
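+    # The annotateText response reports the sentence's sentiment under
+    # 'documentSentiment' as a polarity/magnitude pair, unpacked below.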
sentiment = response.get("documentSentiment") if sentiment is None: @@ -156,7 +166,11 @@ def to_entity_json(entity, e_tuple): def get_sentiment_entities(service, document): """Compute the overall sentiment volume in the document""" - sentiments, entities = analyze_document(service, document) + + try: + sentiments, entities = analyze_document(service, document) + except HttpError as e: + raise e sentiments = [sent for sent in sentiments if sent[0] is not None] negative_sentiments = [ @@ -186,8 +200,13 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities = {} for document in reader: - sentiment_total, entities = get_sentiment_entities( - service, document) + try: + sentiment_total, entities = get_sentiment_entities( + service, document) + except HttpError as e: + logging.error("Error in process_movie_reviews {}".format(e.content)) + continue + document.label = get_sentiment_label(sentiment_total) sentiment_writer.write( From 8076a6303407159d1cfd37d091767d976bbacccf Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 13:57:21 -0700 Subject: [PATCH 13/36] added retry in the request Change-Id: Iea8672b60bd6ebe9602378bb8b2622a44b48e608 --- language/movie_nl/main.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 8df497ad7ea5..e3eda6c6e96e 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -24,7 +24,7 @@ import httplib2 from oauth2client.client import GoogleCredentials import requests - +import socket def analyze_document(service, document): """Analyze the document and get the distribution of sentiments and @@ -59,6 +59,22 @@ def get_request_body(text, syntax=True, entities=True, sentiment=True): return body +def get_response_with_retry(request, tries, retry=3): + """Get the response using re-try""" + try: + response = request.execute() + return response + except HttpError as e: + raise e + except socket.error as se: + if tries > retry: + raise se + + logging.error('Re-trying the request {}'.format(tries)) + + tries+=1 + return get_response_with_retry(request, tries, retry) + def get_sentiment(service, sentence): """Get the sentence-level sentiment.""" @@ -68,10 +84,7 @@ def get_sentiment(service, sentence): docs = service.documents() request = docs.annotateText(body=body) - try: - response = request.execute() - except HttpError as e: - raise e + response = get_response_with_retry(request, 1) sentiment = response.get("documentSentiment") From d4655662c36a6145cd3311f28ce011ce884057d6 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 14:51:27 -0700 Subject: [PATCH 14/36] removed reverse bool --- language/movie_nl/main.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index aa7ebe643288..fa81f51e5e52 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -282,12 +282,9 @@ def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False): else: items.append(entity_item) - items.sort(reverse=True) + items.sort(reverse=reverse_bool) items = [json.dumps(item[1]) for item in items] - if reverse_bool: - items.reverse() - if topn: print('\n'.join(items[:topn])) else: @@ -340,7 +337,9 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): rank_parser.add_argument( '--sentiment', help='filter sentiment as "neg" or "pos"') rank_parser.add_argument( - '--reverse', help='reverse the 
order of the items') + '--reverse', help='reverse the order of the items', type=bool, + default=False + ) rank_parser.add_argument( '--sample', help='number of top items to process', type=int) From cf12fced34b37476b59c14c9931c69010e3b528e Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 15:30:51 -0700 Subject: [PATCH 15/36] fixed PR comments --- language/movie_nl/main.py | 82 +++++++++++++++------------------------ 1 file changed, 31 insertions(+), 51 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index fa81f51e5e52..af7ed782db1c 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -33,10 +33,7 @@ def analyze_document(service, document): sentences, entities = document.extract_all_sentences(service) - try: - sentiments = [get_sentiment(service, sentence) for sentence in sentences] - except HttpError as e: - raise e + sentiments = [get_sentiment(service, sentence) for sentence in sentences] return sentiments, entities @@ -59,22 +56,6 @@ def get_request_body(text, syntax=True, entities=True, sentiment=True): return body -def get_response_with_retry(request, tries, retry=3): - """Get the response using re-try""" - try: - response = request.execute() - return response - except HttpError as e: - raise e - except socket.error as se: - if tries > retry: - raise se - - logging.error('Re-trying the request {}'.format(tries)) - - tries+=1 - return get_response_with_retry(request, tries, retry) - def get_sentiment(service, sentence): """Get the sentence-level sentiment.""" @@ -84,15 +65,15 @@ def get_sentiment(service, sentence): docs = service.documents() request = docs.annotateText(body=body) - response = get_response_with_retry(request, 1) + response = request.execute(num_retries=3) - sentiment = response.get("documentSentiment") + sentiment = response.get('documentSentiment') if sentiment is None: return (None, None) else: - pol = sentiment.get("polarity") - mag = sentiment.get("magnitude") + pol = sentiment.get('polarity') + mag = sentiment.get('magnitude') if pol is None and mag is not None: pol = 0 @@ -106,41 +87,43 @@ def __init__(self, text, doc_id, doc_path): self.text = text self.doc_id = doc_id self.doc_path = doc_path - self.sentent_pair = None + self.sentence_entity_pair = None self.label = None def extract_all_sentences(self, service): """Extract the sentences in a document.""" - if self.sentent_pair is None: - docs = service.documents() - request_body = get_request_body( - self.text, - syntax=True, - entities=True, - sentiment=False) - request = docs.annotateText(body=request_body) + if self.sentence_entity_pair is not None: + return self.sentence_entity_pair + + docs = service.documents() + request_body = get_request_body( + self.text, + syntax=True, + entities=True, + sentiment=False) + request = docs.annotateText(body=request_body) - ent_list = [] + ent_list = [] - response = request.execute() - entities = response.get('entities', []) - sentences = response.get('sentences', []) + response = request.execute() + entities = response.get('entities', []) + sentences = response.get('sentences', []) - sent_list = [ - sentence.get('text').get('content') for sentence in sentences - ] + sent_list = [ + sentence.get('text', {}).get('content') for sentence in sentences + ] - for entity in entities: - ent_type = entity.get('type') - wiki_url = entity.get('metadata', {}).get('wikipedia_url') + for entity in entities: + ent_type = entity.get('type') + wiki_url = entity.get('metadata', {}).get('wikipedia_url') - if ent_type == 
'PERSON' and wiki_url is not None: - ent_list.append(wiki_url) + if ent_type == 'PERSON' and wiki_url is not None: + ent_list.append(wiki_url) - self.sentent_pair = (sent_list, ent_list) + self.sentence_entity_pair = (sent_list, ent_list) - return self.sentent_pair + return self.sentence_entity_pair def to_sentiment_json(doc_id, sent, label): @@ -180,10 +163,7 @@ def to_entity_json(entity, e_tuple): def get_sentiment_entities(service, document): """Compute the overall sentiment volume in the document""" - try: - sentiments, entities = analyze_document(service, document) - except HttpError as e: - raise e + sentiments, entities = analyze_document(service, document) sentiments = [sent for sent in sentiments if sent[0] is not None] negative_sentiments = [ From 729e96ed1cedc6d247635efc6a35ac7bd994d09f Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 15:33:16 -0700 Subject: [PATCH 16/36] fixed nox issues --- language/movie_nl/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index af7ed782db1c..111873ec3ab4 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -26,6 +26,7 @@ import requests import socket + def analyze_document(service, document): """Analyze the document and get the distribution of sentiments and the movie name.""" From 483e66d36d9dc9f77f94ac1be0b15d2dfd53e744 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 15:44:39 -0700 Subject: [PATCH 17/36] changed from io.StringIO to StringIO.StringIO --- language/movie_nl/main_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index 96907908018d..9c7ae9d1efbc 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io +import StringIO import json import main @@ -56,8 +56,8 @@ def test_process_movie_reviews(): doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') reader = [doc1, doc2] - swriter = io.StringIO() - ewriter = io.StringIO() + swriter = StringIO.StringIO() + ewriter = StringIO.StringIO() main.process_movie_reviews(service, reader, swriter, ewriter) From 8d974f39e9fc4477497e14277332d13b34a56147 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 15:58:38 -0700 Subject: [PATCH 18/36] fixed nox issues --- language/movie_nl/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 111873ec3ab4..38416ecab06d 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -24,7 +24,6 @@ import httplib2 from oauth2client.client import GoogleCredentials import requests -import socket def analyze_document(service, document): @@ -198,7 +197,7 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): sentiment_total, entities = get_sentiment_entities( service, document) except HttpError as e: - logging.error("Error in process_movie_reviews {}".format(e.content)) + logging.error("Error process_movie_reviews {}".format(e.content)) continue document.label = get_sentiment_label(sentiment_total) From f2930fe43b9005562b492a2aa854c7472f055986 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 16:43:52 -0700 Subject: [PATCH 19/36] added rank_entities tests --- language/movie_nl/main_test.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index 9c7ae9d1efbc..4ad6e2f0883d 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -13,6 +13,7 @@ # limitations under the License. 
import StringIO +import sys import json import main @@ -80,3 +81,31 @@ def test_process_movie_reviews(): assert (entities[0].get('wiki_url') == 'http://en.wikipedia.org/wiki/Tom_Cruise') assert entities[0].get('sentiment') == 2.0 + +def test_rank_positive_entities(): + reader = [ + '{"avg_sentiment": -12.0, "name": "Patrick Macnee", "sentiment": -12.0}', + '{"avg_sentiment": 5.0, "name": "Paul Rudd", "sentiment": 5.0}', + '{"avg_sentiment": -5.0, "name": "Martha Plimpton", "sentiment": -5.0}', + '{"avg_sentiment": 7.0, "name": "Lucy (2014 film)", "sentiment": 7.0}' + ] + + sys.stdout = writer = StringIO.StringIO() + main.rank_entities(reader, 'pos', topn=1, reverse_bool=False) + + sys.stdout = sys.__stdout__ + assert writer.getvalue().strip() == '{"avg_sentiment": 5.0, "name": "Paul Rudd", "sentiment": 5.0}' + +def test_rank_negative_entities(): + reader = [ + '{"avg_sentiment": -12.0, "name": "Patrick Macnee", "sentiment": -12.0}', + '{"avg_sentiment": 5.0, "name": "Paul Rudd", "sentiment": 5.0}', + '{"avg_sentiment": -5.0, "name": "Martha Plimpton", "sentiment": -5.0}', + '{"avg_sentiment": 7.0, "name": "Lucy (2014 film)", "sentiment": 7.0}' + ] + + sys.stdout = writer = StringIO.StringIO() + main.rank_entities(reader, 'neg', topn=1, reverse_bool=True) + + sys.stdout = sys.__stdout__ + assert writer.getvalue().strip() == '{"avg_sentiment": -5.0, "name": "Martha Plimpton", "sentiment": -5.0}' From ec185c7d1a9bb3697481aa1464728ab04c6636e5 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 16:44:06 -0700 Subject: [PATCH 20/36] removed urlparse --- language/movie_nl/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/language/movie_nl/requirements.txt b/language/movie_nl/requirements.txt index 391be2e98434..c385fb4e4e03 100644 --- a/language/movie_nl/requirements.txt +++ b/language/movie_nl/requirements.txt @@ -1,3 +1,2 @@ -urlparse2==1.1.1 google-api-python-client==1.5.1 requests==2.10.0 From 7a8a962f73d82ef5522edb25a2b8b27bd1dccf8f Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 16:48:43 -0700 Subject: [PATCH 21/36] fixed PR comments --- language/movie_nl/main.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 38416ecab06d..0ba555caed1d 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -21,7 +21,6 @@ from googleapiclient import discovery from googleapiclient.errors import HttpError -import httplib2 from oauth2client.client import GoogleCredentials import requests @@ -265,25 +264,20 @@ def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False): items.sort(reverse=reverse_bool) items = [json.dumps(item[1]) for item in items] - if topn: - print('\n'.join(items[:topn])) - else: - print('\n'.join(items)) + print('\n'.join(items[:topn])) def get_service(): """Build a client to the Google Cloud Natural Language API.""" credentials = GoogleCredentials.get_application_default() - scoped_credentials = credentials.create_scoped( - ['https://www.googleapis.com/auth/cloud-platform']) - http = httplib2.Http() - scoped_credentials.authorize(http) - return discovery.build('language', 'v1beta1', http=http) + + return discovery.build('language', 'v1beta1', + credentials=credentials) def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): - """Movie demo main program""" + """Analyze the document for sentiment and entities""" # Create logger settings logging.basicConfig(filename=log_file, 
level=logging.DEBUG) @@ -321,7 +315,9 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): default=False ) rank_parser.add_argument( - '--sample', help='number of top items to process', type=int) + '--sample', help='number of top items to process', type=int, + default=None + ) analyze_parser = subparsers.add_parser('analyze') From e77231dbeffeac529c21966497b2c60fd4c4b480 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 16:51:28 -0700 Subject: [PATCH 22/36] renamed e_tuple to better name --- language/movie_nl/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 0ba555caed1d..e9958442e68b 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -145,15 +145,15 @@ def get_wiki_title(wiki_url): return os.path.basename(wiki_url).replace('_', ' ') -def to_entity_json(entity, e_tuple): +def to_entity_json(entity, entity_sentiment): """Convert the entity info to json.""" json_doc = {} - avg_sentiment = float(e_tuple[0]) / float(e_tuple[1]) + avg_sentiment = float(entity_sentiment[0]) / float(entity_sentiment[1]) json_doc['wiki_url'] = entity json_doc['name'] = get_wiki_title(entity) - json_doc['sentiment'] = float('%.3f' % e_tuple[0]) + json_doc['sentiment'] = float('%.3f' % entity_sentiment[0]) json_doc['avg_sentiment'] = float('%.3f' % avg_sentiment) return json.dumps(json_doc) @@ -218,8 +218,8 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities[ent] = (ent_sent, frequency) - for entity, e_tuple in collected_entities.items(): - entity_writer.write(to_entity_json(entity, e_tuple)) + for entity, entity_sentiment in collected_entities.items(): + entity_writer.write(to_entity_json(entity, entity_sentiment)) entity_writer.write('\n') sentiment_writer.flush() From 11d6b0baca67b0acc9376ff2327745e24608b459 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 17:16:44 -0700 Subject: [PATCH 23/36] fixed nox issues for main_test --- language/movie_nl/main_test.py | 36 +++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index 4ad6e2f0883d..4e66cb490155 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import StringIO import sys -import json import main @@ -82,30 +82,44 @@ def test_process_movie_reviews(): 'http://en.wikipedia.org/wiki/Tom_Cruise') assert entities[0].get('sentiment') == 2.0 + def test_rank_positive_entities(): reader = [ - '{"avg_sentiment": -12.0, "name": "Patrick Macnee", "sentiment": -12.0}', - '{"avg_sentiment": 5.0, "name": "Paul Rudd", "sentiment": 5.0}', - '{"avg_sentiment": -5.0, "name": "Martha Plimpton", "sentiment": -5.0}', - '{"avg_sentiment": 7.0, "name": "Lucy (2014 film)", "sentiment": 7.0}' + ('{"avg_sentiment": -12.0, ' + '"name": "Patrick Macnee", "sentiment": -12.0}'), + ('{"avg_sentiment": 5.0, ' + '"name": "Paul Rudd", "sentiment": 5.0}'), + ('{"avg_sentiment": -5.0, ' + '"name": "Martha Plimpton", "sentiment": -5.0}'), + ('{"avg_sentiment": 7.0, ' + '"name": "Lucy (2014 film)", "sentiment": 7.0}') ] sys.stdout = writer = StringIO.StringIO() main.rank_entities(reader, 'pos', topn=1, reverse_bool=False) sys.stdout = sys.__stdout__ - assert writer.getvalue().strip() == '{"avg_sentiment": 5.0, "name": "Paul Rudd", "sentiment": 5.0}' + expected = ('{"avg_sentiment": 5.0, ' + '"name": "Paul Rudd", "sentiment": 5.0}') + assert writer.getvalue().strip() == expected + def test_rank_negative_entities(): reader = [ - '{"avg_sentiment": -12.0, "name": "Patrick Macnee", "sentiment": -12.0}', - '{"avg_sentiment": 5.0, "name": "Paul Rudd", "sentiment": 5.0}', - '{"avg_sentiment": -5.0, "name": "Martha Plimpton", "sentiment": -5.0}', - '{"avg_sentiment": 7.0, "name": "Lucy (2014 film)", "sentiment": 7.0}' + ('{"avg_sentiment": -12.0, ' + '"name": "Patrick Macnee", "sentiment": -12.0}'), + ('{"avg_sentiment": 5.0, ' + '"name": "Paul Rudd", "sentiment": 5.0}'), + ('{"avg_sentiment": -5.0, ' + '"name": "Martha Plimpton", "sentiment": -5.0}'), + ('{"avg_sentiment": 7.0, ' + '"name": "Lucy (2014 film)", "sentiment": 7.0}') ] sys.stdout = writer = StringIO.StringIO() main.rank_entities(reader, 'neg', topn=1, reverse_bool=True) sys.stdout = sys.__stdout__ - assert writer.getvalue().strip() == '{"avg_sentiment": -5.0, "name": "Martha Plimpton", "sentiment": -5.0}' + expected = ('{"avg_sentiment": -5.0, ' + '"name": "Martha Plimpton", "sentiment": -5.0}') + assert writer.getvalue().strip() == expected From 0bea72b43b195b0ca3e2b7ff9875945241ec38fb Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 17:50:16 -0700 Subject: [PATCH 24/36] replaced StringIO.StringIO with io.BytesIO --- language/movie_nl/main.py | 2 +- language/movie_nl/main_test.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index e9958442e68b..4d163bca2dbf 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -196,7 +196,7 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): sentiment_total, entities = get_sentiment_entities( service, document) except HttpError as e: - logging.error("Error process_movie_reviews {}".format(e.content)) + logging.error('Error process_movie_reviews {}'.format(e.content)) continue document.label = get_sentiment_label(sentiment_total) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index 4e66cb490155..999995b9b6d4 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
import json -import StringIO +import io import sys import main @@ -57,8 +57,8 @@ def test_process_movie_reviews(): doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') reader = [doc1, doc2] - swriter = StringIO.StringIO() - ewriter = StringIO.StringIO() + swriter = io.BytesIO() + ewriter = io.BytesIO() main.process_movie_reviews(service, reader, swriter, ewriter) @@ -95,7 +95,7 @@ def test_rank_positive_entities(): '"name": "Lucy (2014 film)", "sentiment": 7.0}') ] - sys.stdout = writer = StringIO.StringIO() + sys.stdout = writer = io.BytesIO() main.rank_entities(reader, 'pos', topn=1, reverse_bool=False) sys.stdout = sys.__stdout__ @@ -116,7 +116,7 @@ def test_rank_negative_entities(): '"name": "Lucy (2014 film)", "sentiment": 7.0}') ] - sys.stdout = writer = StringIO.StringIO() + sys.stdout = writer = io.BytesIO() main.rank_entities(reader, 'neg', topn=1, reverse_bool=True) sys.stdout = sys.__stdout__ From 5242b74af411d671bd308af7e0f5064eae7badc2 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 17:51:07 -0700 Subject: [PATCH 25/36] changed order of io --- language/movie_nl/main_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index 999995b9b6d4..1f93a1d7f5d3 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import io +import json import sys import main From f5f1ec1939c874320ffba82b58f725070fdb12f7 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 17:55:07 -0700 Subject: [PATCH 26/36] used capsys to capture stdout output --- language/movie_nl/main_test.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index 1f93a1d7f5d3..b03b121de025 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -14,7 +14,6 @@ import io import json -import sys import main @@ -83,7 +82,7 @@ def test_process_movie_reviews(): assert entities[0].get('sentiment') == 2.0 -def test_rank_positive_entities(): +def test_rank_positive_entities(capsys): reader = [ ('{"avg_sentiment": -12.0, ' '"name": "Patrick Macnee", "sentiment": -12.0}'), @@ -95,16 +94,15 @@ def test_rank_positive_entities(): '"name": "Lucy (2014 film)", "sentiment": 7.0}') ] - sys.stdout = writer = io.BytesIO() main.rank_entities(reader, 'pos', topn=1, reverse_bool=False) + out, err = capsys.readouterr() - sys.stdout = sys.__stdout__ expected = ('{"avg_sentiment": 5.0, ' '"name": "Paul Rudd", "sentiment": 5.0}') - assert writer.getvalue().strip() == expected + assert out.strip() == expected -def test_rank_negative_entities(): +def test_rank_negative_entities(capsys): reader = [ ('{"avg_sentiment": -12.0, ' '"name": "Patrick Macnee", "sentiment": -12.0}'), @@ -116,10 +114,9 @@ def test_rank_negative_entities(): '"name": "Lucy (2014 film)", "sentiment": 7.0}') ] - sys.stdout = writer = io.BytesIO() main.rank_entities(reader, 'neg', topn=1, reverse_bool=True) + out, err = capsys.readouterr() - sys.stdout = sys.__stdout__ expected = ('{"avg_sentiment": -5.0, ' '"name": "Martha Plimpton", "sentiment": -5.0}') - assert writer.getvalue().strip() == expected + assert out.strip() == expected From 6730388f8f6f2c845d9a04a1e33e348d1b885bfa Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 17:59:05 -0700 Subject: [PATCH 
27/36] fixed docstring --- language/movie_nl/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 4d163bca2dbf..b44d770bc5e2 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -146,7 +146,7 @@ def get_wiki_title(wiki_url): def to_entity_json(entity, entity_sentiment): - """Convert the entity info to json.""" + """Convert entities and their associated sentiment to json.""" json_doc = {} avg_sentiment = float(entity_sentiment[0]) / float(entity_sentiment[1]) From 6f01839e4b6ea60892b26a9b5b70120dea736d5c Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 18:04:20 -0700 Subject: [PATCH 28/36] imported six.StringIO --- language/movie_nl/main_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index b03b121de025..d347954a9c28 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import json import main +import six def test_get_request_body(): @@ -56,8 +56,8 @@ def test_process_movie_reviews(): doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') reader = [doc1, doc2] - swriter = io.BytesIO() - ewriter = io.BytesIO() + swriter = six.StringIO() + ewriter = six.StringIO() main.process_movie_reviews(service, reader, swriter, ewriter) From 7e2cde51e57edfc38cf5a68cd553bf132340a9f3 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 18:08:19 -0700 Subject: [PATCH 29/36] fixed ordering of the expected out --- language/movie_nl/main_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index d347954a9c28..2368647ccb09 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -99,7 +99,7 @@ def test_rank_positive_entities(capsys): expected = ('{"avg_sentiment": 5.0, ' '"name": "Paul Rudd", "sentiment": 5.0}') - assert out.strip() == expected + assert out.strip().sort() == expected.sort() def test_rank_negative_entities(capsys): @@ -119,4 +119,4 @@ def test_rank_negative_entities(capsys): expected = ('{"avg_sentiment": -5.0, ' '"name": "Martha Plimpton", "sentiment": -5.0}') - assert out.strip() == expected + assert out.strip().sort() == expected.sort() From b31a76eab994082219796545f7b89c41bbd46c38 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Wed, 20 Jul 2016 18:15:04 -0700 Subject: [PATCH 30/36] fixed sorted string --- language/movie_nl/main_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py index 2368647ccb09..fc69e9bccfea 100644 --- a/language/movie_nl/main_test.py +++ b/language/movie_nl/main_test.py @@ -99,7 +99,10 @@ def test_rank_positive_entities(capsys): expected = ('{"avg_sentiment": 5.0, ' '"name": "Paul Rudd", "sentiment": 5.0}') - assert out.strip().sort() == expected.sort() + + expected = ''.join(sorted(expected)) + out = ''.join(sorted(out.strip())) + assert out == expected def test_rank_negative_entities(capsys): @@ -119,4 +122,7 @@ def test_rank_negative_entities(capsys): expected = ('{"avg_sentiment": -5.0, ' '"name": "Martha Plimpton", "sentiment": -5.0}') - assert out.strip().sort() == expected.sort() + + expected = ''.join(sorted(expected)) + out = ''.join(sorted(out.strip())) 
+ assert out == expected From 0820734c8c4cb581164b2dc86f530df1f390c1ba Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Thu, 21 Jul 2016 10:45:45 -0700 Subject: [PATCH 31/36] added docstrings for the argument --- language/movie_nl/main.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index b44d770bc5e2..aef64a75be58 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -145,15 +145,25 @@ def get_wiki_title(wiki_url): return os.path.basename(wiki_url).replace('_', ' ') -def to_entity_json(entity, entity_sentiment): - """Convert entities and their associated sentiment to json.""" +def to_entity_json(entity, entity_sentiment, entity_frequency): + """Convert entities and their associated sentiment to json. + + Args: + entity: Wikipedia entity name + entity_sentiment: Sentiment associated with the entity + entity_frequency: Frequency of the entity in the corpus + + Returns: + Json string representation of input + + """ json_doc = {} - avg_sentiment = float(entity_sentiment[0]) / float(entity_sentiment[1]) + avg_sentiment = float(entity_sentiment) / float(entity_frequency) json_doc['wiki_url'] = entity json_doc['name'] = get_wiki_title(entity) - json_doc['sentiment'] = float('%.3f' % entity_sentiment[0]) + json_doc['sentiment'] = float('%.3f' % entity_sentiment) json_doc['avg_sentiment'] = float('%.3f' % avg_sentiment) return json.dumps(json_doc) @@ -218,8 +228,9 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities[ent] = (ent_sent, frequency) - for entity, entity_sentiment in collected_entities.items(): - entity_writer.write(to_entity_json(entity, entity_sentiment)) + for entity, sentiment_frequency in collected_entities.items(): + entity_writer.write(to_entity_json(entity, sentiment_frequency[0], + sentiment_frequency[1])) entity_writer.write('\n') sentiment_writer.flush() From 769eaed2ac3275ad91df9d6ee487c3ea47d059d2 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Thu, 21 Jul 2016 10:59:49 -0700 Subject: [PATCH 32/36] added arguments description --- language/movie_nl/main.py | 44 +++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index aef64a75be58..1244842338f5 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -126,7 +126,17 @@ def extract_all_sentences(self, service): def to_sentiment_json(doc_id, sent, label): - """Convert the sentiment info to json.""" + """Convert the sentiment info to json. + + Args: + doc_id: Document id + sent: Overall Sentiment for the document + label: Actual label +1, 0, -1 for the document + + Returns: + String json representation of the input + + """ json_doc = {} json_doc['doc_id'] = doc_id @@ -137,7 +147,15 @@ def to_sentiment_json(doc_id, sent, label): def get_wiki_title(wiki_url): - """Get the wikipedia page title for a given wikipedia URL.""" + """Get the wikipedia page title for a given wikipedia URL. + + Args: + wiki_url: Wikipedia URL + + Returns: + Wikipedia canonical name + + """ try: content = requests.get(wiki_url).text return content.split('title')[1].split('-')[0].split('>')[1].strip() @@ -170,7 +188,16 @@ def to_entity_json(entity, entity_sentiment, entity_frequency): def get_sentiment_entities(service, document): - """Compute the overall sentiment volume in the document""" + """Compute the overall sentiment volume in the document. 
+ + Args: + service: Client to Google Natural Language API + document: Movie review document (See Document object) + + Returns: + Tuple of total sentiment and entities found in the document + + """ sentiments, entities = analyze_document(service, document) @@ -238,7 +265,16 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): def document_generator(dir_path_pattern, count=None): - """Generator for the input movie documents.""" + """Generator for the input movie documents. + + Args: + dir_path_pattern: Input dir pattern e.g., "foo/bar/*/*" + count: Number of documents to read else everything if None + + Returns: + Generator which contains Document (See above) + + """ for running_count, item in enumerate(glob.iglob(dir_path_pattern)): if count and running_count >= count: raise StopIteration() From 850771d816fbe71389ad1e94447400b07069dd0e Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Thu, 21 Jul 2016 11:03:53 -0700 Subject: [PATCH 33/36] added wikipedia url example --- language/movie_nl/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 1244842338f5..121ea80d3077 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -150,10 +150,10 @@ def get_wiki_title(wiki_url): """Get the wikipedia page title for a given wikipedia URL. Args: - wiki_url: Wikipedia URL + wiki_url: Wikipedia URL e.g., http://en.wikipedia.org/wiki/Sean_Connery Returns: - Wikipedia canonical name + Wikipedia canonical name e.g., Sean Connery """ try: From 36ffcb1b69e50b432d2d4b882ffc9575543abfc1 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Thu, 21 Jul 2016 12:37:11 -0700 Subject: [PATCH 34/36] removed File type from argsparse --- language/movie_nl/main.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 121ea80d3077..d5113280115f 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -337,11 +337,6 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): # Process the movie documents process_movie_reviews(service, reader, sentiment_writer, entity_writer) - # close reader and writers - sentiment_writer.close() - entity_writer.close() - reader.close() - if __name__ == '__main__': parser = argparse.ArgumentParser( @@ -371,11 +366,9 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): analyze_parser.add_argument( '--inp', help='location of the input', required=True) analyze_parser.add_argument( - '--sout', help='location of the sentiment output', required=True, - type=argparse.FileType('w')) + '--sout', help='location of the sentiment output', required=True) analyze_parser.add_argument( - '--eout', help='location of the entity output', required=True, - type=argparse.FileType('w')) + '--eout', help='location of the entity output', required=True) analyze_parser.add_argument( '--sample', help='number of top items to process', type=int) analyze_parser.add_argument('--log_file', default='movie.log') @@ -383,7 +376,9 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): args = parser.parse_args() if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) + with open(args.sout, 'w') as sout, open(args.eout, 'w') as eout: + analyze(args.inp, sout, eout, args.sample, args.log_file) elif args.command == 'rank': - rank_entities( - args.entity_input, args.sentiment, args.sample, 
args.reverse) + with open(args.entity_input, 'r') as entity_input: + rank_entities( + entity_input, args.sentiment, args.sample, args.reverse) From 9c610f0d0142bbe1e4ac85a168ca2aae6ea27365 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Thu, 21 Jul 2016 14:59:36 -0700 Subject: [PATCH 35/36] updated README.md to work due to wrong input name --- language/movie_nl/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md index dab79d20c6d1..687a6c4058ab 100644 --- a/language/movie_nl/README.md +++ b/language/movie_nl/README.md @@ -145,7 +145,7 @@ In order to sort and rank the entities generated, use the same `main.py` script. this will print the top 5 actors with negative sentiment: ``` -$ python main.py rank entity.json \ +$ python main.py rank --entity_input entity.json \ --sentiment neg \ --reverse True \ --sample 5 From ae433ee66d4df58c9ad4da194d824e18adacdb64 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Thu, 21 Jul 2016 14:59:48 -0700 Subject: [PATCH 36/36] removed FileType from argsparse --- language/movie_nl/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index d5113280115f..ba5c63b60b98 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -348,8 +348,7 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): rank_parser = subparsers.add_parser('rank') rank_parser.add_argument( - 'entity_input', help='location of entity input', - type=argparse.FileType('r')) + '--entity_input', help='location of entity input') rank_parser.add_argument( '--sentiment', help='filter sentiment as "neg" or "pos"') rank_parser.add_argument(