Skip to content

Commit

Permalink
Add inspect table code sample for DLP and some nit fixes (#1921)
Browse files Browse the repository at this point in the history
* Remove claim that redact.py operates on strings

Reflect in the comments that this particular code sample does not support text redaction.

* Add code sample for inspecting table, fix requirements for running tests, quickstart example refactor

* Remove newline, if -> elif

* formatting

* More formatting
  • Loading branch information
ackul authored and engelke committed Jan 25, 2019
1 parent c1ec40c commit 94ced73
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 25 deletions.
11 changes: 10 additions & 1 deletion dlp/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ Install Dependencies
.. _pip: https://pip.pypa.io/
.. _virtualenv: https://virtualenv.pypa.io/

#. For running *_test.py files, install test dependencies
.. code-block:: bash
$ pip install -r requirements-test.txt
$ pytest inspect_content_test.py
**Note: the *_test.py files are demo wrappers and make API calls. You may get rate limited for making a high number of requests.**

Samples
-------------------------------------------------------------------------------

Expand All @@ -74,7 +83,7 @@ To run this sample:

.. code-block:: bash
$ python quickstart.py
$ python quickstart.py <project-id>
Inspect Content
Expand Down
6 changes: 3 additions & 3 deletions dlp/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def write_data(data):
write_file.writerow(map(write_data, row.values))
# Print status
print('Successfully saved date-shift output to {}'.format(
output_csv_file))
output_csv_file))
# [END dlp_deidentify_date_shift]


Expand Down Expand Up @@ -450,8 +450,8 @@ def write_data(data):
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
fpe_parser.add_argument(
'project',
help='The Google Cloud project id to use as a parent resource.')
'project',
help='The Google Cloud project id to use as a parent resource.')
fpe_parser.add_argument(
'item',
help='The string to deidentify. '
Expand Down
186 changes: 177 additions & 9 deletions dlp/inspect_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import argparse
import os
import json


# [START dlp_inspect_string]
Expand Down Expand Up @@ -77,7 +78,7 @@ def inspect_string(project, content_string, info_types,
'min_likelihood': min_likelihood,
'include_quote': include_quote,
'limits': {'max_findings_per_request': max_findings},
}
}

# Construct the `item`.
item = {'value': content_string}
Expand All @@ -102,8 +103,130 @@ def inspect_string(project, content_string, info_types,
print('No findings.')
# [END dlp_inspect_string]

# [START dlp_inspect_table]


def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Uses the Data Loss Prevention API to analyze table data for
    protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A dict (e.g. parsed from a JSON string) representing table
            data, with a "header" list of column names and a "rows" list of
            row value lists. Non-string cell values are converted with
            str() before inspection.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: A list of comma-separated word-list strings;
            each string becomes one custom dictionary info type.
        custom_regexes: A list of regex pattern strings; each pattern
            becomes one custom regex info type.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    Example:
        data = {
            "header":[
                "email",
                "phone number"
            ],
            "rows":[
                [
                    "robertfrost@xyz.com",
                    "4232342345"
                ],
                [
                    "johndoe@pqr.com",
                    "4253458383"
                ]
            ]
        }

        >> $ python inspect_content.py table \
        '{"header": ["email", "phone number"],
        "rows": [["robertfrost@xyz.com", "4232342345"],
        ["johndoe@pqr.com", "4253458383"]]}'
        >>  Quote: robertfrost@xyz.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
            Quote: johndoe@pqr.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    # Cell values are coerced to str so numeric JSON values don't break the
    # API's string_value field.
    headers = [{'name': val} for val in data['header']]
    rows = [
        {'values': [{'string_value': str(cell_val)} for cell_val in row]}
        for row in data['rows']
    ]

    table = {'headers': headers, 'rows': rows}
    item = {'table': table}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            try:
                # Quote may be absent when include_quote is False.
                if finding.quote:
                    print('Quote: {}'.format(finding.quote))
            except AttributeError:
                pass
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
# [END dlp_inspect_table]

# [START dlp_inspect_file]


def inspect_file(project, filename, info_types, min_likelihood=None,
custom_dictionaries=None, custom_regexes=None,
max_findings=None, include_quote=True, mime_type=None):
Expand Down Expand Up @@ -284,8 +407,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
storage_config = {
'cloud_storage_options': {
'file_set': {'url': url}
}
}
}

# Convert the project id into a full resource id.
parent = dlp.project_path(project)
Expand All @@ -309,7 +432,6 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
subscriber = google.cloud.pubsub.SubscriberClient()
subscription_path = subscriber.subscription_path(
project, subscription_id)
subscription = subscriber.subscribe(subscription_path)

# Set up a callback to acknowledge a message. This closes around an event
# so that it can signal that it is done and the main thread can continue.
Expand Down Expand Up @@ -341,8 +463,7 @@ def callback(message):
print(e)
raise

# Register the callback and wait on the event.
subscription.open(callback)
subscriber.subscribe(subscription_path, callback=callback)
finished = job_done.wait(timeout=timeout)
if not finished:
print('No event received before the timeout. Please verify that the '
Expand Down Expand Up @@ -460,7 +581,6 @@ def inspect_datastore(project, datastore_project, kind,
subscriber = google.cloud.pubsub.SubscriberClient()
subscription_path = subscriber.subscription_path(
project, subscription_id)
subscription = subscriber.subscribe(subscription_path)

# Set up a callback to acknowledge a message. This closes around an event
# so that it can signal that it is done and the main thread can continue.
Expand Down Expand Up @@ -493,7 +613,8 @@ def callback(message):
raise

# Register the callback and wait on the event.
subscription.open(callback)
subscriber.subscribe(subscription_path, callback=callback)

finished = job_done.wait(timeout=timeout)
if not finished:
print('No event received before the timeout. Please verify that the '
Expand Down Expand Up @@ -609,7 +730,6 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
subscriber = google.cloud.pubsub.SubscriberClient()
subscription_path = subscriber.subscription_path(
project, subscription_id)
subscription = subscriber.subscribe(subscription_path)

# Set up a callback to acknowledge a message. This closes around an event
# so that it can signal that it is done and the main thread can continue.
Expand Down Expand Up @@ -642,7 +762,7 @@ def callback(message):
raise

# Register the callback and wait on the event.
subscription.open(callback)
subscriber.subscribe(subscription_path, callback=callback)
finished = job_done.wait(timeout=timeout)
if not finished:
print('No event received before the timeout. Please verify that the '
Expand Down Expand Up @@ -698,6 +818,46 @@ def callback(message):
'information in the results.',
default=True)

parser_table = subparsers.add_parser('table', help='Inspect a table.')
parser_table.add_argument(
'data', help='Json string representing a table.', type=json.loads)
parser_table.add_argument(
'--project',
help='The Google Cloud project id to use as a parent resource.',
default=default_project)
parser_table.add_argument(
'--info_types', action='append',
help='Strings representing info types to look for. A full list of '
'info categories and types is available from the API. Examples '
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_table.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types. Each string is a comma '
'delimited list of words representing a distinct dictionary.',
default=None)
parser_table.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
' info types.',
default=None)
parser_table.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
'POSSIBLE', 'LIKELY', 'VERY_LIKELY'],
help='A string representing the minimum likelihood threshold that '
'constitutes a match.')
parser_table.add_argument(
'--max_findings', type=int,
help='The maximum number of findings to report; 0 = no maximum.')
parser_table.add_argument(
'--include_quote', type=bool,
help='A boolean for whether to display a quote of the detected '
'information in the results.',
default=True)

parser_file = subparsers.add_parser('file', help='Inspect a local file.')
parser_file.add_argument(
'filename', help='The path to the file to inspect.')
Expand Down Expand Up @@ -923,6 +1083,14 @@ def callback(message):
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote)
elif args.content == 'table':
inspect_table(
args.project, args.data, args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote)
elif args.content == 'file':
inspect_file(
args.project, args.filename, args.info_types,
Expand Down
30 changes: 29 additions & 1 deletion dlp/inspect_content_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import google.cloud.storage

import pytest

import inspect_content


Expand Down Expand Up @@ -170,6 +169,35 @@ def test_inspect_string(capsys):
assert 'Info type: EMAIL_ADDRESS' in out


def test_inspect_table(capsys):
    # A small table containing one email column and one phone-number column.
    sample_table = {
        "header": ["email", "phone number"],
        "rows": [
            ["robertfrost@xyz.com", "4232342345"],
            ["johndoe@pqr.com", "4253458383"],
        ],
    }

    inspect_content.inspect_table(
        GCLOUD_PROJECT,
        sample_table,
        ['PHONE_NUMBER', 'EMAIL_ADDRESS'],
        include_quote=True)

    captured, _ = capsys.readouterr()
    # Both info types should be detected somewhere in the table.
    for expected in ('Info type: PHONE_NUMBER', 'Info type: EMAIL_ADDRESS'):
        assert expected in captured


def test_inspect_string_with_custom_info_types(capsys):
test_string = 'My name is Gary Smith and my email is gary@example.com'
dictionaries = ['Gary Smith']
Expand Down
17 changes: 12 additions & 5 deletions dlp/quickstart.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@

from __future__ import print_function

import sys
import argparse

def quickstart():

def quickstart(project_id):
"""Demonstrates use of the Data Loss Prevention API client library."""

# [START dlp_quickstart]
# Import the client library
import google.cloud.dlp

# Edit this with your Google Cloud Project ID.
project = 'your-project'

# Instantiate a client.
dlp_client = google.cloud.dlp.DlpServiceClient()

Expand Down Expand Up @@ -84,4 +84,11 @@ def quickstart():


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "project_id", help="Enter your GCP project id.", type=str)
    # argparse exits with a usage message on its own when the required
    # positional argument is missing, so no manual sys.argv check is needed
    # (the previous post-parse check was unreachable dead code).
    args = parser.parse_args()
    quickstart(args.project_id)
Loading

0 comments on commit 94ced73

Please sign in to comment.