From b6b413660e48027a5b02b41ca12eabad13b326fa Mon Sep 17 00:00:00 2001
From: Seth Moore
Date: Tue, 9 Jun 2020 17:56:04 -0700
Subject: [PATCH] Add DLP sample for redacting all image text (#4018)

The sample shows how to remove all text found in an image with DLP.
The sample is integrated into the existing redact.py CLI application.
---
 dlp/README.rst     |  30 +++---------
 dlp/redact.py      | 116 +++++++++++++++++++++++++++++++++++++--------
 dlp/redact_test.py |  14 ++++++
 3 files changed, 118 insertions(+), 42 deletions(-)

diff --git a/dlp/README.rst b/dlp/README.rst
index 6fcaf09887b8..18958c36e7d9 100644
--- a/dlp/README.rst
+++ b/dlp/README.rst
@@ -136,37 +136,21 @@ To run this sample:

     $ python redact.py

-    usage: redact.py [-h] [--project PROJECT]
-                     [--info_types INFO_TYPES [INFO_TYPES ...]]
-                     [--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}]
-                     [--mime_type MIME_TYPE]
-                     filename output_filename
+    usage: redact.py [-h] {info_types,all_text} ...

     Sample app that uses the Data Loss Prevent API to redact the contents of an
     image file.

     positional arguments:
-      filename              The path to the file to inspect.
-      output_filename       The path to which the redacted image will be written.
+      {info_types,all_text}
+                            Select which content should be redacted.
+        info_types          Redact specific infoTypes from an image.
+        all_text            Redact all text from an image. The file is always
+                            submitted to the API as a generic IMAGE item; no MIME
+                            type is inferred.

     optional arguments:
       -h, --help            show this help message and exit
-      --project PROJECT     The Google Cloud project id to use as a parent
-                            resource.
-      --info_types INFO_TYPES [INFO_TYPES ...]
-                            Strings representing info types to look for. A full
-                            list of info categories and types is available from
-                            the API. Examples include "FIRST_NAME", "LAST_NAME",
-                            "EMAIL_ADDRESS". If unspecified, the three above
-                            examples will be used.
-      --min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}
-                            A string representing the minimum likelihood threshold
-                            that constitutes a match.
-      --mime_type MIME_TYPE
-                            The MIME type of the file. If not specified, the type
-                            is inferred via the Python standard library's
-                            mimetypes module.
-


 Metadata
diff --git a/dlp/redact.py b/dlp/redact.py
index 66072de7b282..8a1650a262db 100644
--- a/dlp/redact.py
+++ b/dlp/redact.py
@@ -121,23 +121,87 @@ def redact_image(

 # [END dlp_redact_image]

+# [START dlp_redact_image_all_text]

-if __name__ == "__main__":
-    default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")

-    parser = argparse.ArgumentParser(description=__doc__)
+def redact_image_all_text(
+    project,
+    filename,
+    output_filename,
+):
+    """Uses the Data Loss Prevention API to redact all text in an image.

-    parser.add_argument("filename", help="The path to the file to inspect.")
-    parser.add_argument(
-        "output_filename",
-        help="The path to which the redacted image will be written.",
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        filename: The path to the file to inspect.
+        output_filename: The path to which the redacted image will be written.
+
+    Returns:
+        None; the redacted image is written to output_filename.
+    """
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct the image_redaction_configs, indicating to DLP that all text in
+    # the input image should be redacted.
+    image_redaction_configs = [{
+        "redact_all_text": True,
+    }]
+
+    # Construct the byte_item, containing the file's byte data.
+    with open(filename, mode="rb") as f:
+        byte_item = {"type": "IMAGE", "data": f.read()}
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.redact_image(
+        parent,
+        image_redaction_configs=image_redaction_configs,
+        byte_item=byte_item,
     )
-    parser.add_argument(
+
+    # Write out the results.
+    with open(output_filename, mode="wb") as f:
+        f.write(response.redacted_image)
+
+    print("Wrote {byte_count} to {filename}".format(
+        byte_count=len(response.redacted_image), filename=output_filename))
+
+
+# [END dlp_redact_image_all_text]
+
+if __name__ == "__main__":
+    default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+    common_args_parser = argparse.ArgumentParser(add_help=False)
+    common_args_parser.add_argument(
         "--project",
         help="The Google Cloud project id to use as a parent resource.",
         default=default_project,
     )
-    parser.add_argument(
+    common_args_parser.add_argument(
+        "filename", help="The path to the file to inspect.")
+    common_args_parser.add_argument(
+        "output_filename",
+        help="The path to which the redacted image will be written.",
+    )
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        dest="content", help="Select which content should be redacted.")
+    subparsers.required = True
+
+    info_types_parser = subparsers.add_parser(
+        "info_types",
+        help="Redact specific infoTypes from an image.",
+        parents=[common_args_parser],
+    )
+    info_types_parser.add_argument(
         "--info_types",
         nargs="+",
         help="Strings representing info types to look for. A full list of "
@@ -146,7 +210,7 @@ def redact_image(
         "If unspecified, the three above examples will be used.",
         default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
     )
-    parser.add_argument(
+    info_types_parser.add_argument(
         "--min_likelihood",
         choices=[
             "LIKELIHOOD_UNSPECIFIED",
@@ -159,19 +223,33 @@ def redact_image(
         help="A string representing the minimum likelihood threshold that "
         "constitutes a match.",
     )
-    parser.add_argument(
+    info_types_parser.add_argument(
         "--mime_type",
         help="The MIME type of the file. If not specified, the type is "
         "inferred via the Python standard library's mimetypes module.",
     )

+    all_text_parser = subparsers.add_parser(
+        "all_text",
+        help="Redact all text from an image. The file is always submitted "
+        "to the API as a generic IMAGE item; no MIME type is inferred.",
+        parents=[common_args_parser],
+    )
+
     args = parser.parse_args()

-    redact_image(
-        args.project,
-        args.filename,
-        args.output_filename,
-        args.info_types,
-        min_likelihood=args.min_likelihood,
-        mime_type=args.mime_type,
-    )
+    if args.content == "info_types":
+        redact_image(
+            args.project,
+            args.filename,
+            args.output_filename,
+            args.info_types,
+            min_likelihood=args.min_likelihood,
+            mime_type=args.mime_type,
+        )
+    elif args.content == "all_text":
+        redact_image_all_text(
+            args.project,
+            args.filename,
+            args.output_filename,
+        )
diff --git a/dlp/redact_test.py b/dlp/redact_test.py
index cb3740353b5b..0cce514eb1a6 100644
--- a/dlp/redact_test.py
+++ b/dlp/redact_test.py
@@ -44,3 +44,17 @@ def test_redact_image_file(tempdir, capsys):

     out, _ = capsys.readouterr()
     assert output_filepath in out
+
+
+def test_redact_image_all_text(tempdir, capsys):
+    test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png")
+    output_filepath = os.path.join(tempdir, "redacted.png")
+
+    redact.redact_image_all_text(
+        GCLOUD_PROJECT,
+        test_filepath,
+        output_filepath,
+    )
+
+    out, _ = capsys.readouterr()
+    assert output_filepath in out