Add DLP sample for redacting all image text (#4018)

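This sample shows how to redact all text found in an image using the Data Loss Prevention (DLP) API. It is integrated into the existing redact.py CLI application as a new `all_text` subcommand, alongside the existing infoType-based redaction, which is now exposed as the `info_types` subcommand.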
GoogleCloudPlatform · Jun 10, 2020 · b6b4136 · b6b4136
1 parent 21a25b9
commit b6b4136
Show file tree

Hide file tree

Showing 3 changed files with 118 additions and 42 deletions.
diff --git a/dlp/README.rst b/dlp/README.rst
@@ -136,37 +136,21 @@ To run this sample:
 
     $ python redact.py
 
-    usage: redact.py [-h] [--project PROJECT]
-                     [--info_types INFO_TYPES [INFO_TYPES ...]]
-                     [--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}]
-                     [--mime_type MIME_TYPE]
-                     filename output_filename
+    usage: redact.py [-h] {info_types,all_text} ...
 
     Sample app that uses the Data Loss Prevent API to redact the contents of an
     image file.
 
     positional arguments:
-      filename              The path to the file to inspect.
-      output_filename       The path to which the redacted image will be written.
+      {info_types,all_text}
+                            Select which content should be redacted.
+        info_types          Redact specific infoTypes from an image.
+        all_text            Redact all text from an image. The MIME type of the
+                            file is inferred via the Python standard library's
+                            mimetypes module.
 
     optional arguments:
       -h, --help            show this help message and exit
-      --project PROJECT     The Google Cloud project id to use as a parent
-                            resource.
-      --info_types INFO_TYPES [INFO_TYPES ...]
-                            Strings representing info types to look for. A full
-                            list of info categories and types is available from
-                            the API. Examples include "FIRST_NAME", "LAST_NAME",
-                            "EMAIL_ADDRESS". If unspecified, the three above
-                            examples will be used.
-      --min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}
-                            A string representing the minimum likelihood threshold
-                            that constitutes a match.
-      --mime_type MIME_TYPE
-                            The MIME type of the file. If not specified, the type
-                            is inferred via the Python standard library's
-                            mimetypes module.
-
 
 
 Metadata

diff --git a/dlp/redact.py b/dlp/redact.py
@@ -121,23 +121,87 @@ def redact_image(
 
 # [END dlp_redact_image]
 
+# [START dlp_redact_image_all_text]
 
-if __name__ == "__main__":
-    default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
 
-    parser = argparse.ArgumentParser(description=__doc__)
+def redact_image_all_text(
+    project,
+    filename,
+    output_filename,
+):
+    """Uses the Data Loss Prevention API to redact all text in an image.
 
-    parser.add_argument("filename", help="The path to the file to inspect.")
-    parser.add_argument(
-        "output_filename",
-        help="The path to which the redacted image will be written.",
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        filename: The path to the file to inspect.
+        output_filename: The path to which the redacted image will be written.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct the image_redaction_configs, indicating to DLP that all text in
+    # the input image should be redacted.
+    image_redaction_configs = [{
+        "redact_all_text": True,
+    }]
+
+    # Construct the byte_item, containing the file's byte data.
+    with open(filename, mode="rb") as f:
+        byte_item = {"type": "IMAGE", "data": f.read()}
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.redact_image(
+        parent,
+        image_redaction_configs=image_redaction_configs,
+        byte_item=byte_item,
     )
-    parser.add_argument(
+
+    # Write out the results.
+    with open(output_filename, mode="wb") as f:
+        f.write(response.redacted_image)
+
+    print("Wrote {byte_count} to {filename}".format(
+        byte_count=len(response.redacted_image), filename=output_filename))
+
+
+# [END dlp_redact_image_all_text]
+
+if __name__ == "__main__":
+    default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+    common_args_parser = argparse.ArgumentParser(add_help=False)
+    common_args_parser.add_argument(
         "--project",
         help="The Google Cloud project id to use as a parent resource.",
         default=default_project,
     )
-    parser.add_argument(
+    common_args_parser.add_argument(
+        "filename", help="The path to the file to inspect.")
+    common_args_parser.add_argument(
+        "output_filename",
+        help="The path to which the redacted image will be written.",
+    )
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        dest="content", help="Select which content should be redacted.")
+    subparsers.required = True
+
+    info_types_parser = subparsers.add_parser(
+        "info_types",
+        help="Redact specific infoTypes from an image.",
+        parents=[common_args_parser],
+    )
+    info_types_parser.add_argument(
         "--info_types",
         nargs="+",
         help="Strings representing info types to look for. A full list of "
@@ -146,7 +210,7 @@ def redact_image(
         "If unspecified, the three above examples will be used.",
         default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
     )
-    parser.add_argument(
+    info_types_parser.add_argument(
         "--min_likelihood",
         choices=[
             "LIKELIHOOD_UNSPECIFIED",
@@ -159,19 +223,33 @@ def redact_image(
         help="A string representing the minimum likelihood threshold that "
         "constitutes a match.",
     )
-    parser.add_argument(
+    info_types_parser.add_argument(
         "--mime_type",
         help="The MIME type of the file. If not specified, the type is "
         "inferred via the Python standard library's mimetypes module.",
     )
 
+    all_text_parser = subparsers.add_parser(
+        "all_text",
+        help="Redact all text from an image. The MIME type of the file is "
+        "inferred via the Python standard library's mimetypes module.",
+        parents=[common_args_parser],
+    )
+
     args = parser.parse_args()
 
-    redact_image(
-        args.project,
-        args.filename,
-        args.output_filename,
-        args.info_types,
-        min_likelihood=args.min_likelihood,
-        mime_type=args.mime_type,
-    )
+    if args.content == "info_types":
+        redact_image(
+            args.project,
+            args.filename,
+            args.output_filename,
+            args.info_types,
+            min_likelihood=args.min_likelihood,
+            mime_type=args.mime_type,
+        )
+    elif args.content == "all_text":
+        redact_image_all_text(
+            args.project,
+            args.filename,
+            args.output_filename,
+        )
diff --git a/dlp/redact_test.py b/dlp/redact_test.py
@@ -44,3 +44,17 @@ def test_redact_image_file(tempdir, capsys):
 
     out, _ = capsys.readouterr()
     assert output_filepath in out
+
+
+def test_redact_image_all_text(tempdir, capsys):
+    test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png")
+    output_filepath = os.path.join(tempdir, "redacted.png")
+
+    redact.redact_image_all_text(
+        GCLOUD_PROJECT,
+        test_filepath,
+        output_filepath,
+    )
+
+    out, _ = capsys.readouterr()
+    assert output_filepath in out