Skip to content
This repository has been archived by the owner on Dec 10, 2023. It is now read-only.

Commit

Permalink
Add text redaction sample using DLP [(#3964)](GoogleCloudPlatform/pyt…
Browse files Browse the repository at this point in the history
…hon-docs-samples#3964)

* Add text redaction sample using DLP

* Update dlp/deid.py

Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>

* Rename string parameter to item

Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>
  • Loading branch information
sethmoo and busunkim96 committed Jun 5, 2020
1 parent 634bd0b commit ae90593
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 4 deletions.
9 changes: 5 additions & 4 deletions samples/snippets/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -339,13 +339,12 @@ To run this sample:
.. code-block:: bash
$ python deid.py
usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift} ...
usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact} ...
Uses of the Data Loss Prevention API for deidentifying sensitive data.
positional arguments:
{deid_mask,deid_fpe,reid_fpe,deid_date_shift}
{deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact}
Select how to submit content to the API.
deid_mask Deidentify sensitive data in a string by masking it
with a character.
Expand All @@ -355,6 +354,8 @@ To run this sample:
Preserving Encryption (FPE).
deid_date_shift Deidentify dates in a CSV file by pseudorandomly
shifting them.
redact Redact sensitive data in a string by replacing it with
the info type of the data.
optional arguments:
-h, --help show this help message and exit
Expand All @@ -378,4 +379,4 @@ to `browse the source`_ and `report issues`_.
https://github.com/GoogleCloudPlatform/google-cloud-python/issues
.. _Google Cloud SDK: https://cloud.google.com/sdk/
.. _Google Cloud SDK: https://cloud.google.com/sdk/
85 changes: 85 additions & 0 deletions samples/snippets/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,61 @@ def write_data(data):
# [END dlp_deidentify_date_shift]


# [START dlp_redact_sensitive_data]
def redact_sensitive_data(project, item, info_types):
    """Redact sensitive data in a string using the Data Loss Prevention API.

    Findings in the text are replaced with their info type.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to redact (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.

    Returns:
        None; the redacted text from the API response is printed to the
        terminal.
    """
    # Client library import.
    # NOTE(review): the client below is reached through google.cloud.dlp_v2,
    # so this import is relied upon to make that subpackage available --
    # confirm against the installed library version.
    import google.cloud.dlp

    client = google.cloud.dlp_v2.DlpServiceClient()

    # Full resource id of the parent project.
    project_resource = client.project_path(project)

    # Inspection settings: one {"name": ...} entry per requested info type.
    requested_types = []
    for type_name in info_types:
        requested_types.append({"name": type_name})
    inspection = {"info_types": requested_types}

    # De-identification settings: a single primitive transformation that
    # swaps each finding for its info type.
    replace_with_type = {"replace_with_info_type_config": {}}
    deidentification = {
        "info_type_transformations": {
            "transformations": [
                {"primitive_transformation": replace_with_type}
            ]
        }
    }

    # Send the request.
    result = client.deidentify_content(
        project_resource,
        inspect_config=inspection,
        deidentify_config=deidentification,
        item={"value": item},
    )

    # Show the redacted text.
    print(result.item.value)


# [END dlp_redact_sensitive_data]


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers(
Expand Down Expand Up @@ -626,6 +681,30 @@ def write_data(data):
"key_name.",
)

# Sub-command "redact": replaces each finding with its info type.
redact_parser = subparsers.add_parser(
    "redact",
    help="Redact sensitive data in a string by replacing it with the "
    "info type of the data.",
)
# NOTE: with action="append", argparse appends command-line values to a
# non-empty default instead of replacing it, so any --info_types given by
# the user are added after the three defaults below.
redact_parser.add_argument(
    "--info_types",
    action="append",
    help="Strings representing info types to look for. A full list of "
    "info categories and types is available from the API. Examples "
    'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
    "If unspecified, the three above examples will be used.",
    default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
)
redact_parser.add_argument(
    "project",
    help="The Google Cloud project id to use as a parent resource.",
)
# The trailing space after the first sentence keeps the two adjacent string
# literals from running together in the rendered help text.
redact_parser.add_argument(
    "item",
    help="The string to redact. "
    "Example: 'My credit card is 4242 4242 4242 4242'",
)

args = parser.parse_args()

if args.content == "deid_mask":
Expand Down Expand Up @@ -667,3 +746,9 @@ def write_data(data):
wrapped_key=args.wrapped_key,
key_name=args.key_name,
)
elif args.content == "redact":
redact_sensitive_data(
args.project,
item=args.item,
info_types=args.info_types,
)
14 changes: 14 additions & 0 deletions samples/snippets/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,17 @@ def test_reidentify_with_fpe(capsys):
out, _ = capsys.readouterr()

assert "731997681" not in out


def test_redact_sensitive_data(capsys):
    """Check that a URL in the input is replaced by its info type marker."""
    secret_url = "https://cloud.google.com"
    sentence = "My favorite site is " + secret_url

    deid.redact_sensitive_data(GCLOUD_PROJECT, sentence, ["URL"])

    # The sample prints its result, so inspect captured stdout.
    printed = capsys.readouterr().out

    assert secret_url not in printed
    assert "My favorite site is [URL]" in printed

0 comments on commit ae90593

Please sign in to comment.