Skip to content

Commit

Permalink
Add inspect table code sample for DLP and some nit fixes (#1921)
Browse files Browse the repository at this point in the history
* Remove claim that redact.py operates on strings

Reflect in the comments that this particular code sample does not support text redaction.

* Add code sample for inspecting table, fix requirements for running tests, quickstart example refactor

* Remove newline, if -> elif

* formatting

* More formatting
  • Loading branch information
ackul authored and engelke committed Jan 25, 2019
1 parent c1ec40c commit 94ced73
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 25 deletions.
11 changes: 10 additions & 1 deletion dlp/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ Install Dependencies
.. _pip: https://pip.pypa.io/
.. _virtualenv: https://virtualenv.pypa.io/

#. For running *_test.py files, install test dependencies
.. code-block:: bash
$ pip install -r requirements-test.txt
$ pytest inspect_content_test.py
**Note: the *_test.py files are demo wrappers and make API calls. You may get rate limited for making a high number of requests.**

Samples
-------------------------------------------------------------------------------

Expand All @@ -74,7 +83,7 @@ To run this sample:

.. code-block:: bash
$ python quickstart.py
$ python quickstart.py <project-id>
Inspect Content
Expand Down
6 changes: 3 additions & 3 deletions dlp/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def write_data(data):
write_file.writerow(map(write_data, row.values))
# Print status
print('Successfully saved date-shift output to {}'.format(
output_csv_file))
output_csv_file))
# [END dlp_deidentify_date_shift]


Expand Down Expand Up @@ -450,8 +450,8 @@ def write_data(data):
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
fpe_parser.add_argument(
'project',
help='The Google Cloud project id to use as a parent resource.')
'project',
help='The Google Cloud project id to use as a parent resource.')
fpe_parser.add_argument(
'item',
help='The string to deidentify. '
Expand Down
186 changes: 177 additions & 9 deletions dlp/inspect_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import argparse
import os
import json


# [START dlp_inspect_string]
Expand Down Expand Up @@ -77,7 +78,7 @@ def inspect_string(project, content_string, info_types,
'min_likelihood': min_likelihood,
'include_quote': include_quote,
'limits': {'max_findings_per_request': max_findings},
}
}

# Construct the `item`.
item = {'value': content_string}
Expand All @@ -102,8 +103,130 @@ def inspect_string(project, content_string, info_types,
print('No findings.')
# [END dlp_inspect_string]

# [START dlp_inspect_table]


def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Uses the Data Loss Prevention API to analyze table data for
    protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A dict (e.g. parsed from a JSON string) representing table
            data, with a "header" list of column names and a "rows" list of
            row value lists. Non-string cell values are converted with
            str() before inspection.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: A list of comma-separated word-list strings;
            each string becomes one custom dictionary info type.
        custom_regexes: A list of regex pattern strings; each pattern
            becomes one custom regex info type.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    Example:
        data = {
            "header":[
                "email",
                "phone number"
            ],
            "rows":[
                [
                    "robertfrost@xyz.com",
                    "4232342345"
                ],
                [
                    "johndoe@pqr.com",
                    "4253458383"
                ]
            ]
        }

        >> $ python inspect_content.py table \
        '{"header": ["email", "phone number"],
        "rows": [["robertfrost@xyz.com", "4232342345"],
        ["johndoe@pqr.com", "4253458383"]]}'
        >>  Quote: robertfrost@xyz.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
            Quote: johndoe@pqr.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    # Cell values are coerced to str so numeric JSON values don't break the
    # API's string_value field.
    headers = [{'name': val} for val in data['header']]
    rows = [
        {'values': [{'string_value': str(cell_val)} for cell_val in row]}
        for row in data['rows']
    ]

    table = {'headers': headers, 'rows': rows}
    item = {'table': table}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                # Quote may be absent when include_quote is False.
                if finding.quote:
                    print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
# [END dlp_inspect_table]

# [START dlp_inspect_file]


def inspect_file(project, filename, info_types, min_likelihood=None,
custom_dictionaries=None, custom_regexes=None,
max_findings=None, include_quote=True, mime_type=None):
Expand Down Expand Up @@ -284,8 +407,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
storage_config = {
'cloud_storage_options': {
'file_set': {'url': url}
}
}
}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)
Expand All @@ -309,7 +432,6 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
subscriber = google.cloud.pubsub.SubscriberClient()
subscription_path = subscriber.subscription_path(
project, subscription_id)
subscription = subscriber.subscribe(subscription_path)

# Set up a callback to acknowledge a message. This closes around an event
# so that it can signal that it is done and the main thread can continue.
Expand Down Expand Up @@ -341,8 +463,7 @@ def callback(message):
print(e)
raise

# Register the callback and wait on the event.
subscription.open(callback)
subscriber.subscribe(subscription_path, callback=callback)
finished = job_done.wait(timeout=timeout)
if not finished:
print('No event received before the timeout. Please verify that the '
Expand Down Expand Up @@ -460,7 +581,6 @@ def inspect_datastore(project, datastore_project, kind,
subscriber = google.cloud.pubsub.SubscriberClient()
subscription_path = subscriber.subscription_path(
project, subscription_id)
subscription = subscriber.subscribe(subscription_path)

# Set up a callback to acknowledge a message. This closes around an event
# so that it can signal that it is done and the main thread can continue.
Expand Down Expand Up @@ -493,7 +613,8 @@ def callback(message):
raise

# Register the callback and wait on the event.
subscription.open(callback)
subscriber.subscribe(subscription_path, callback=callback)

finished = job_done.wait(timeout=timeout)
if not finished:
print('No event received before the timeout. Please verify that the '
Expand Down Expand Up @@ -609,7 +730,6 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
subscriber = google.cloud.pubsub.SubscriberClient()
subscription_path = subscriber.subscription_path(
project, subscription_id)
subscription = subscriber.subscribe(subscription_path)

# Set up a callback to acknowledge a message. This closes around an event
# so that it can signal that it is done and the main thread can continue.
Expand Down Expand Up @@ -642,7 +762,7 @@ def callback(message):
raise

# Register the callback and wait on the event.
subscription.open(callback)
subscriber.subscribe(subscription_path, callback=callback)
finished = job_done.wait(timeout=timeout)
if not finished:
print('No event received before the timeout. Please verify that the '
Expand Down Expand Up @@ -698,6 +818,46 @@ def callback(message):
'information in the results.',
default=True)

parser_table = subparsers.add_parser('table', help='Inspect a table.')
parser_table.add_argument(
'data', help='Json string representing a table.', type=json.loads)
parser_table.add_argument(
'--project',
help='The Google Cloud project id to use as a parent resource.',
default=default_project)
parser_table.add_argument(
'--info_types', action='append',
help='Strings representing info types to look for. A full list of '
'info categories and types is available from the API. Examples '
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_table.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types. Each string is a comma '
'delimited list of words representing a distinct dictionary.',
default=None)
parser_table.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
' info types.',
default=None)
parser_table.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
'POSSIBLE', 'LIKELY', 'VERY_LIKELY'],
help='A string representing the minimum likelihood threshold that '
'constitutes a match.')
parser_table.add_argument(
'--max_findings', type=int,
help='The maximum number of findings to report; 0 = no maximum.')
parser_table.add_argument(
'--include_quote', type=bool,
help='A boolean for whether to display a quote of the detected '
'information in the results.',
default=True)

parser_file = subparsers.add_parser('file', help='Inspect a local file.')
parser_file.add_argument(
'filename', help='The path to the file to inspect.')
Expand Down Expand Up @@ -923,6 +1083,14 @@ def callback(message):
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote)
elif args.content == 'table':
inspect_table(
args.project, args.data, args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote)
elif args.content == 'file':
inspect_file(
args.project, args.filename, args.info_types,
Expand Down
30 changes: 29 additions & 1 deletion dlp/inspect_content_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import google.cloud.storage

import pytest

import inspect_content


Expand Down Expand Up @@ -170,6 +169,35 @@ def test_inspect_string(capsys):
assert 'Info type: EMAIL_ADDRESS' in out


def test_inspect_table(capsys):
    # A small table containing one email column and one phone-number column.
    sample_table = {
        "header": ["email", "phone number"],
        "rows": [
            ["robertfrost@xyz.com", "4232342345"],
            ["johndoe@pqr.com", "4253458383"],
        ],
    }

    inspect_content.inspect_table(
        GCLOUD_PROJECT,
        sample_table,
        ['PHONE_NUMBER', 'EMAIL_ADDRESS'],
        include_quote=True)

    captured, _ = capsys.readouterr()
    # Both info types should be detected somewhere in the table.
    for expected in ('Info type: PHONE_NUMBER', 'Info type: EMAIL_ADDRESS'):
        assert expected in captured


def test_inspect_string_with_custom_info_types(capsys):
test_string = 'My name is Gary Smith and my email is gary@example.com'
dictionaries = ['Gary Smith']
Expand Down
17 changes: 12 additions & 5 deletions dlp/quickstart.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@

from __future__ import print_function

import sys
import argparse

def quickstart():

def quickstart(project_id):
"""Demonstrates use of the Data Loss Prevention API client library."""

# [START dlp_quickstart]
# Import the client library
import google.cloud.dlp

# Edit this with your Google Cloud Project ID.
project = 'your-project'

# Instantiate a client.
dlp_client = google.cloud.dlp.DlpServiceClient()

Expand Down Expand Up @@ -84,4 +84,11 @@ def quickstart():


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "project_id", help="Enter your GCP project id.", type=str)
    # argparse exits with a usage message on its own when the required
    # positional argument is missing, so no manual sys.argv check is needed
    # (the previous post-parse check was unreachable dead code).
    args = parser.parse_args()
    quickstart(args.project_id)
Loading

0 comments on commit 94ced73

Please sign in to comment.