Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update DLP samples to use dlp_v2 client. #2580

Merged
merged 5 commits into from
Dec 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
538 changes: 299 additions & 239 deletions dlp/deid.py

Large diffs are not rendered by default.

107 changes: 60 additions & 47 deletions dlp/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,39 +20,45 @@

import deid

HARMFUL_STRING = 'My SSN is 372819127'
HARMLESS_STRING = 'My favorite color is blue'
GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT')
WRAPPED_KEY = ('CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy'
'uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL'
'rotx7Chxz/4z7SIpXFOBY61z0/U=')
KEY_NAME = ('projects/python-docs-samples-tests/locations/global/keyRings/'
'dlp-test/cryptoKeys/dlp-test')
SURROGATE_TYPE = 'SSN_TOKEN'
CSV_FILE = os.path.join(os.path.dirname(__file__), 'resources/dates.csv')
HARMFUL_STRING = "My SSN is 372819127"
HARMLESS_STRING = "My favorite color is blue"
GCLOUD_PROJECT = os.getenv("GCLOUD_PROJECT")
WRAPPED_KEY = (
"CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy"
"uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL"
"rotx7Chxz/4z7SIpXFOBY61z0/U="
)
KEY_NAME = (
"projects/python-docs-samples-tests/locations/global/keyRings/"
"dlp-test/cryptoKeys/dlp-test"
)
SURROGATE_TYPE = "SSN_TOKEN"
CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv")
DATE_SHIFTED_AMOUNT = 30
DATE_FIELDS = ['birth_date', 'register_date']
CSV_CONTEXT_FIELD = 'name'
DATE_FIELDS = ["birth_date", "register_date"]
CSV_CONTEXT_FIELD = "name"


@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def tempdir():
tempdir = tempfile.mkdtemp()
yield tempdir
shutil.rmtree(tempdir)


def test_deidentify_with_mask(capsys):
deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING,
['US_SOCIAL_SECURITY_NUMBER'])
deid.deidentify_with_mask(
GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"]
)

out, _ = capsys.readouterr()
assert 'My SSN is *********' in out
assert "My SSN is *********" in out


def test_deidentify_with_mask_ignore_insensitive_data(capsys):
deid.deidentify_with_mask(GCLOUD_PROJECT, HARMLESS_STRING,
['US_SOCIAL_SECURITY_NUMBER'])
deid.deidentify_with_mask(
GCLOUD_PROJECT, HARMLESS_STRING, ["US_SOCIAL_SECURITY_NUMBER"]
)

out, _ = capsys.readouterr()
assert HARMLESS_STRING in out
Expand All @@ -62,82 +68,87 @@ def test_deidentify_with_mask_masking_character_specified(capsys):
deid.deidentify_with_mask(
GCLOUD_PROJECT,
HARMFUL_STRING,
['US_SOCIAL_SECURITY_NUMBER'],
masking_character='#')
["US_SOCIAL_SECURITY_NUMBER"],
masking_character="#",
)

out, _ = capsys.readouterr()
assert 'My SSN is #########' in out
assert "My SSN is #########" in out


def test_deidentify_with_mask_masking_number_specified(capsys):
deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING,
['US_SOCIAL_SECURITY_NUMBER'],
number_to_mask=7)
deid.deidentify_with_mask(
GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"], number_to_mask=7
)

out, _ = capsys.readouterr()
assert 'My SSN is *******27' in out
assert "My SSN is *******27" in out


def test_deidentify_with_fpe(capsys):
deid.deidentify_with_fpe(
GCLOUD_PROJECT,
HARMFUL_STRING,
['US_SOCIAL_SECURITY_NUMBER'],
alphabet='NUMERIC',
["US_SOCIAL_SECURITY_NUMBER"],
alphabet="NUMERIC",
wrapped_key=WRAPPED_KEY,
key_name=KEY_NAME)
key_name=KEY_NAME,
)

out, _ = capsys.readouterr()
assert 'My SSN is' in out
assert '372819127' not in out
assert "My SSN is" in out
assert "372819127" not in out


def test_deidentify_with_fpe_uses_surrogate_info_types(capsys):
deid.deidentify_with_fpe(
GCLOUD_PROJECT,
HARMFUL_STRING,
['US_SOCIAL_SECURITY_NUMBER'],
alphabet='NUMERIC',
["US_SOCIAL_SECURITY_NUMBER"],
alphabet="NUMERIC",
wrapped_key=WRAPPED_KEY,
key_name=KEY_NAME,
surrogate_type=SURROGATE_TYPE)
surrogate_type=SURROGATE_TYPE,
)

out, _ = capsys.readouterr()
assert 'My SSN is SSN_TOKEN' in out
assert '372819127' not in out
assert "My SSN is SSN_TOKEN" in out
assert "372819127" not in out


def test_deidentify_with_fpe_ignores_insensitive_data(capsys):
deid.deidentify_with_fpe(
GCLOUD_PROJECT,
HARMLESS_STRING,
['US_SOCIAL_SECURITY_NUMBER'],
alphabet='NUMERIC',
["US_SOCIAL_SECURITY_NUMBER"],
alphabet="NUMERIC",
wrapped_key=WRAPPED_KEY,
key_name=KEY_NAME)
key_name=KEY_NAME,
)

out, _ = capsys.readouterr()
assert HARMLESS_STRING in out


def test_deidentify_with_date_shift(tempdir, capsys):
output_filepath = os.path.join(tempdir, 'dates-shifted.csv')
output_filepath = os.path.join(tempdir, "dates-shifted.csv")

deid.deidentify_with_date_shift(
GCLOUD_PROJECT,
input_csv_file=CSV_FILE,
output_csv_file=output_filepath,
lower_bound_days=DATE_SHIFTED_AMOUNT,
upper_bound_days=DATE_SHIFTED_AMOUNT,
date_fields=DATE_FIELDS)
date_fields=DATE_FIELDS,
)

out, _ = capsys.readouterr()

assert 'Successful' in out
assert "Successful" in out


def test_deidentify_with_date_shift_using_context_field(tempdir, capsys):
output_filepath = os.path.join(tempdir, 'dates-shifted.csv')
output_filepath = os.path.join(tempdir, "dates-shifted.csv")

deid.deidentify_with_date_shift(
GCLOUD_PROJECT,
Expand All @@ -148,24 +159,26 @@ def test_deidentify_with_date_shift_using_context_field(tempdir, capsys):
date_fields=DATE_FIELDS,
context_field_id=CSV_CONTEXT_FIELD,
wrapped_key=WRAPPED_KEY,
key_name=KEY_NAME)
key_name=KEY_NAME,
)

out, _ = capsys.readouterr()

assert 'Successful' in out
assert "Successful" in out


def test_reidentify_with_fpe(capsys):
labeled_fpe_string = 'My SSN is SSN_TOKEN(9):731997681'
labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681"

deid.reidentify_with_fpe(
GCLOUD_PROJECT,
labeled_fpe_string,
surrogate_type=SURROGATE_TYPE,
wrapped_key=WRAPPED_KEY,
key_name=KEY_NAME,
alphabet='NUMERIC')
alphabet="NUMERIC",
)

out, _ = capsys.readouterr()

assert '731997681' not in out
assert "731997681" not in out
Loading