diff --git a/dlp/README.md b/dlp/README.md index 2c976b5a73f..37180819cdc 100644 --- a/dlp/README.md +++ b/dlp/README.md @@ -6,8 +6,10 @@ a powerful detection engine for personally identifiable information and other pr ## Setup - A Google Cloud project with billing enabled - [Enable](https://console.cloud.google.com/launcher/details/google/dlp.googleapis.com) the DLP API. -- (Local testing)[Create a service account](https://cloud.google.com/docs/authentication/getting-started) +- (Local testing) [Create a service account](https://cloud.google.com/docs/authentication/getting-started) and set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable pointing to the downloaded credentials file. +- (Local testing) Set the `DLP_DEID_WRAPPED_KEY` environment variable to an AES-256 key encrypted ('wrapped') [with a Cloud Key Management Service (KMS) key](https://cloud.google.com/kms/docs/encrypt-decrypt). +- (Local testing) Set the `DLP_DEID_KEY_NAME` environment variable to the path-name of the Cloud KMS key you wrapped `DLP_DEID_WRAPPED_KEY` with. ## Build This project uses the [Assembly Plugin](https://maven.apache.org/plugins/maven-assembly-plugin/usage.html) to build an uber jar. diff --git a/dlp/pom.xml b/dlp/pom.xml index 2d59b05f1f5..5d9f75c3cd0 100644 --- a/dlp/pom.xml +++ b/dlp/pom.xml @@ -42,7 +42,7 @@ com.google.cloud google-cloud-dlp - 0.25.0-alpha + 0.26.0-alpha diff --git a/dlp/src/main/java/com/example/dlp/DeIdentification.java b/dlp/src/main/java/com/example/dlp/DeIdentification.java new file mode 100644 index 00000000000..296fc582f78 --- /dev/null +++ b/dlp/src/main/java/com/example/dlp/DeIdentification.java @@ -0,0 +1,274 @@ +/** + * Copyright 2017 Google Inc. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.dlp; + +import com.google.cloud.dlp.v2beta1.DlpServiceClient; +import com.google.common.io.BaseEncoding; +import com.google.privacy.dlp.v2beta1.CharacterMaskConfig; +import com.google.privacy.dlp.v2beta1.ContentItem; +import com.google.privacy.dlp.v2beta1.CryptoKey; +import com.google.privacy.dlp.v2beta1.CryptoReplaceFfxFpeConfig; +import com.google.privacy.dlp.v2beta1.CryptoReplaceFfxFpeConfig.FfxCommonNativeAlphabet; +import com.google.privacy.dlp.v2beta1.DeidentifyConfig; +import com.google.privacy.dlp.v2beta1.DeidentifyContentRequest; +import com.google.privacy.dlp.v2beta1.DeidentifyContentResponse; +import com.google.privacy.dlp.v2beta1.InfoTypeTransformations; +import com.google.privacy.dlp.v2beta1.InfoTypeTransformations.InfoTypeTransformation; +import com.google.privacy.dlp.v2beta1.KmsWrappedCryptoKey; +import com.google.privacy.dlp.v2beta1.PrimitiveTransformation; +import com.google.protobuf.ByteString; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionGroup; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; + +public class DeIdentification { + + private static void deIdentifyWithMask( + String string, + Character maskingCharacter, + int numberToMask) { + // [START dlp_deidentify_mask] + /** + * Deidentify a string by masking sensitive information with a character using the DLP API. + * @param string The string to deidentify. 
+ * @param maskingCharacter (Optional) The character to mask sensitive data with. + * @param numberToMask (Optional) The number of characters' worth of sensitive data to mask. + * Omitting this value or setting it to 0 masks all sensitive chars. + */ + + // instantiate a client + try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { + + // string = "My SSN is 372819127"; + // numberToMask = 5; + // maskingCharacter = 'x'; + + ContentItem contentItem = + ContentItem.newBuilder() + .setType("text/plain") + .setValue(string) + .build(); + + CharacterMaskConfig characterMaskConfig = + CharacterMaskConfig.newBuilder() + .setMaskingCharacter(maskingCharacter.toString()) + .setNumberToMask(numberToMask) + .build(); + + // Create the deidentification transformation configuration + PrimitiveTransformation primitiveTransformation = + PrimitiveTransformation.newBuilder() + .setCharacterMaskConfig(characterMaskConfig) + .build(); + + InfoTypeTransformation infoTypeTransformationObject = + InfoTypeTransformation.newBuilder() + .setPrimitiveTransformation(primitiveTransformation) + .build(); + + InfoTypeTransformations infoTypeTransformationArray = + InfoTypeTransformations.newBuilder() + .addTransformations(infoTypeTransformationObject) + .build(); + + // Create the deidentification request object + DeidentifyConfig deidentifyConfig = + DeidentifyConfig.newBuilder() + .setInfoTypeTransformations(infoTypeTransformationArray) + .build(); + + DeidentifyContentRequest request = + DeidentifyContentRequest.newBuilder() + .setDeidentifyConfig(deidentifyConfig) + .addItems(contentItem) + .build(); + + // Execute the deidentification request + DeidentifyContentResponse response = dlpServiceClient.deidentifyContent(request); + + // Print the character-masked input value + // e.g. 
"My SSN is 123456789" --> "My SSN is *********" + for (ContentItem item : response.getItemsList()) { + System.out.println(item.getValue()); + } + } catch (Exception e) { + System.out.println("Error in deidentifyWithMask: " + e.getMessage()); + } + // [END dlp_deidentify_mask] + } + + private static void deIdentifyWithFpe( + String string, FfxCommonNativeAlphabet alphabet, String keyName, String wrappedKey) { + // [START dlp_deidentify_fpe] + /** + * Deidentify a string by encrypting sensitive information while preserving format. + * @param string The string to deidentify. + * @param alphabet The set of characters to use when encrypting the input. For more information, + * see cloud.google.com/dlp/docs/reference/rest/v2beta1/content/deidentify + * @param keyName The name of the Cloud KMS key to use when decrypting the wrapped key. + * @param wrappedKey The encrypted (or "wrapped") AES-256 encryption key. + */ + + // instantiate a client + try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { + + // string = "My SSN is 372819127"; + // alphabet = FfxCommonNativeAlphabet.ALPHA_NUMERIC; + // keyName = "projects/GCP_PROJECT/locations/REGION/keyRings/KEYRING_ID/cryptoKeys/KEY_NAME"; + // wrappedKey = "YOUR_ENCRYPTED_AES_256_KEY" + + ContentItem contentItem = + ContentItem.newBuilder() + .setType("text/plain") + .setValue(string) + .build(); + + // Create the format-preserving encryption (FPE) configuration + KmsWrappedCryptoKey kmsWrappedCryptoKey = + KmsWrappedCryptoKey.newBuilder() + .setWrappedKey(ByteString.copyFrom(BaseEncoding.base64().decode(wrappedKey))) + .setCryptoKeyName(keyName) + .build(); + + CryptoKey cryptoKey = + CryptoKey.newBuilder() + .setKmsWrapped(kmsWrappedCryptoKey) + .build(); + + CryptoReplaceFfxFpeConfig cryptoReplaceFfxFpeConfig = + CryptoReplaceFfxFpeConfig.newBuilder() + .setCryptoKey(cryptoKey) + .setCommonAlphabet(alphabet) + .build(); + + // Create the deidentification transformation configuration + PrimitiveTransformation 
primitiveTransformation = + PrimitiveTransformation.newBuilder() + .setCryptoReplaceFfxFpeConfig(cryptoReplaceFfxFpeConfig) + .build(); + + InfoTypeTransformation infoTypeTransformationObject = + InfoTypeTransformation.newBuilder() + .setPrimitiveTransformation(primitiveTransformation) + .build(); + + InfoTypeTransformations infoTypeTransformationArray = + InfoTypeTransformations.newBuilder() + .addTransformations(infoTypeTransformationObject) + .build(); + + // Create the deidentification request object + DeidentifyConfig deidentifyConfig = + DeidentifyConfig.newBuilder() + .setInfoTypeTransformations(infoTypeTransformationArray) + .build(); + + DeidentifyContentRequest request = + DeidentifyContentRequest.newBuilder() + .setDeidentifyConfig(deidentifyConfig) + .addItems(contentItem) + .build(); + + // Execute the deidentification request + DeidentifyContentResponse response = dlpServiceClient.deidentifyContent(request); + + // Print the deidentified input value (format-preserving: same length as the input) + // e.g. "My SSN is 123456789" --> "My SSN is 726129862" + for (ContentItem item : response.getItemsList()) { + System.out.println(item.getValue()); + } + } catch (Exception e) { + System.out.println("Error in deidentifyWithFpe: " + e.getMessage()); + } + // [END dlp_deidentify_fpe] + } + + /** + * Command line application to de-identify data using the Data Loss Prevention API. 
+ * Supported data format: strings + */ + public static void main(String[] args) throws Exception { + + OptionGroup optionsGroup = new OptionGroup(); + optionsGroup.setRequired(true); + + Option deidentifyMaskingOption = new Option("m", "mask", true, "deid with character masking"); + optionsGroup.addOption(deidentifyMaskingOption); + + Option deidentifyFpeOption = new Option("f", "fpe", true, "deid with FFX FPE"); + optionsGroup.addOption(deidentifyFpeOption); + + Options commandLineOptions = new Options(); + commandLineOptions.addOptionGroup(optionsGroup); + + Option maskingCharacterOption = + Option.builder("maskingCharacter").hasArg(true).required(false).build(); + commandLineOptions.addOption(maskingCharacterOption); + + Option numberToMaskOption = + Option.builder("numberToMask").hasArg(true).required(false).build(); + commandLineOptions.addOption(numberToMaskOption); + + Option alphabetOption = + Option.builder("commonAlphabet").hasArg(true).required(false).build(); + commandLineOptions.addOption(alphabetOption); + + Option wrappedKeyOption = + Option.builder("wrappedKey").hasArg(true).required(false).build(); + commandLineOptions.addOption(wrappedKeyOption); + + Option keyNameOption = + Option.builder("keyName").hasArg(true).required(false).build(); + commandLineOptions.addOption(keyNameOption); + + CommandLineParser parser = new DefaultParser(); + HelpFormatter formatter = new HelpFormatter(); + CommandLine cmd; + + try { + cmd = parser.parse(commandLineOptions, args); + } catch (ParseException e) { + System.out.println(e.getMessage()); + formatter.printHelp(DeIdentification.class.getName(), commandLineOptions); + System.exit(1); + return; + } + + if (cmd.hasOption("m")) { + // deidentification with character masking + int numberToMask = Integer.parseInt(cmd.getOptionValue(numberToMaskOption.getOpt(), "0")); + char maskingCharacter = cmd.getOptionValue(maskingCharacterOption.getOpt(), "*").charAt(0); + String val = 
cmd.getOptionValue(deidentifyMaskingOption.getOpt()); + deIdentifyWithMask(val, maskingCharacter, numberToMask); + } else if (cmd.hasOption("f")) { + // deidentification with FPE + String wrappedKey = cmd.getOptionValue(wrappedKeyOption.getOpt()); + String keyName = cmd.getOptionValue(keyNameOption.getOpt()); + String val = cmd.getOptionValue(deidentifyFpeOption.getOpt()); + FfxCommonNativeAlphabet alphabet = + FfxCommonNativeAlphabet.valueOf( + cmd.getOptionValue( + alphabetOption.getOpt(), FfxCommonNativeAlphabet.ALPHA_NUMERIC.name())); + deIdentifyWithFpe(val, alphabet, keyName, wrappedKey); + } + } +} diff --git a/dlp/src/main/java/com/example/dlp/Inspect.java b/dlp/src/main/java/com/example/dlp/Inspect.java index 03e3b616b09..b39be30adc7 100644 --- a/dlp/src/main/java/com/example/dlp/Inspect.java +++ b/dlp/src/main/java/com/example/dlp/Inspect.java @@ -20,6 +20,8 @@ import com.google.cloud.ServiceOptions; import com.google.cloud.dlp.v2beta1.DlpServiceClient; import com.google.longrunning.Operation; +import com.google.privacy.dlp.v2beta1.BigQueryOptions; +import com.google.privacy.dlp.v2beta1.BigQueryTable; import com.google.privacy.dlp.v2beta1.CloudStorageOptions; import com.google.privacy.dlp.v2beta1.CloudStorageOptions.FileSet; import com.google.privacy.dlp.v2beta1.ContentItem; @@ -332,9 +334,88 @@ private static void inspectDatastore( // [END dlp_inspect_datastore] } + private static void inspectBigquery( + String projectId, + String datasetId, + String tableId, + Likelihood minLikelihood, + List infoTypes) { + // [START dlp_inspect_bigquery] + // Instantiates a client + try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { + + // (Optional) The project ID to run the API call under + // projectId = my-project-id + + // The ID of the dataset to inspect, e.g. 'my_dataset' + // datasetId = "my_dataset"; + + // The ID of the table to inspect, e.g. 
'my_table' + // tableId = "my_table"; + + // The minimum likelihood required before returning a match + // minLikelihood = LIKELIHOOD_UNSPECIFIED; + + // The infoTypes of information to match + // infoTypes = ['US_MALE_NAME', 'US_FEMALE_NAME']; + + // Reference to the BigQuery table + BigQueryTable tableReference = + BigQueryTable.newBuilder() + .setProjectId(projectId) + .setDatasetId(datasetId) + .setTableId(tableId) + .build(); + BigQueryOptions bigQueryOptions = + BigQueryOptions.newBuilder() + .setTableReference(tableReference) + .build(); + + // Construct BigQuery configuration to be inspected + StorageConfig storageConfig = + StorageConfig.newBuilder() + .setBigQueryOptions(bigQueryOptions) + .build(); + + InspectConfig inspectConfig = + InspectConfig.newBuilder() + .addAllInfoTypes(infoTypes) + .setMinLikelihood(minLikelihood) + .build(); + + // optionally provide an output configuration to store results, default : none + OutputStorageConfig outputConfig = OutputStorageConfig.getDefaultInstance(); + + // asynchronously submit an inspect operation + OperationFuture responseFuture = + dlpServiceClient.createInspectOperationAsync( + inspectConfig, storageConfig, outputConfig); + + // ... 
+ // block on response, returning job id of the operation + InspectOperationResult inspectOperationResult = responseFuture.get(); + ResultName resultName = inspectOperationResult.getNameAsResultName(); + InspectResult inspectResult = dlpServiceClient.listInspectFindings(resultName).getResult(); + + if (inspectResult.getFindingsCount() > 0) { + System.out.println("Findings: "); + for (Finding finding : inspectResult.getFindingsList()) { + System.out.print("\tInfo type: " + finding.getInfoType().getName()); + System.out.println("\tLikelihood: " + finding.getLikelihood()); + } + } else { + System.out.println("No findings."); + } + } catch (Exception e) { + e.printStackTrace(); + System.out.println("Error in inspectBigquery: " + e.getMessage()); + } + // [END dlp_inspect_bigquery] + } + /** * Command line application to inspect data using the Data Loss Prevention API. - * Supported data formats : string, file, text files on GCS and Datastore entities + * Supported data formats: string, file, text file on GCS, BigQuery table, and Datastore entity */ public static void main(String[] args) throws Exception { @@ -352,6 +433,9 @@ public static void main(String[] args) throws Exception { Option datastoreOption = new Option("ds", "Google Datastore", false, "inspect Datastore kind"); optionsGroup.addOption(datastoreOption); + Option bigqueryOption = new Option("bq", "Google BigQuery", false, "inspect BigQuery table"); + optionsGroup.addOption(bigqueryOption); + Options commandLineOptions = new Options(); commandLineOptions.addOptionGroup(optionsGroup); @@ -377,9 +461,15 @@ public static void main(String[] args) throws Exception { Option gcsFileNameOption = Option.builder("fileName").hasArg(true).required(false).build(); commandLineOptions.addOption(gcsFileNameOption); - Option datastoreProjectIdOption = + Option datasetIdOption = Option.builder("datasetId").hasArg(true).required(false).build(); + commandLineOptions.addOption(datasetIdOption); + + Option tableIdOption = 
Option.builder("tableId").hasArg(true).required(false).build(); + commandLineOptions.addOption(tableIdOption); + + Option projectIdOption = Option.builder("projectId").hasArg(true).required(false).build(); - commandLineOptions.addOption(datastoreProjectIdOption); + commandLineOptions.addOption(projectIdOption); Option datastoreNamespaceOption = Option.builder("namespace").hasArg(true).required(false).build(); @@ -436,8 +526,16 @@ public static void main(String[] args) throws Exception { // use default project id when project id is not specified String projectId = cmd.getOptionValue( - datastoreProjectIdOption.getOpt(), ServiceOptions.getDefaultProjectId()); + projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId()); inspectDatastore(projectId, namespaceId, kind, minLikelihood, infoTypesList); + } else if (cmd.hasOption("bq")) { + String datasetId = cmd.getOptionValue(datasetIdOption.getOpt()); + String tableId = cmd.getOptionValue(tableIdOption.getOpt()); + // use default project id when project id is not specified + String projectId = + cmd.getOptionValue( + projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId()); + inspectBigquery(projectId, datasetId, tableId, minLikelihood, infoTypesList); } } } diff --git a/dlp/src/main/java/com/example/dlp/RiskAnalysis.java b/dlp/src/main/java/com/example/dlp/RiskAnalysis.java new file mode 100644 index 00000000000..5fc95bdec86 --- /dev/null +++ b/dlp/src/main/java/com/example/dlp/RiskAnalysis.java @@ -0,0 +1,474 @@ +/** + * Copyright 2017 Google Inc. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.dlp; + +import com.google.api.gax.rpc.OperationFuture; +import com.google.cloud.ServiceOptions; +import com.google.cloud.dlp.v2beta1.DlpServiceClient; +import com.google.longrunning.Operation; +import com.google.privacy.dlp.v2beta1.AnalyzeDataSourceRiskRequest; +import com.google.privacy.dlp.v2beta1.BigQueryTable; +import com.google.privacy.dlp.v2beta1.FieldId; +import com.google.privacy.dlp.v2beta1.PrivacyMetric; +import com.google.privacy.dlp.v2beta1.PrivacyMetric.CategoricalStatsConfig; +import com.google.privacy.dlp.v2beta1.PrivacyMetric.KAnonymityConfig; +import com.google.privacy.dlp.v2beta1.PrivacyMetric.LDiversityConfig; +import com.google.privacy.dlp.v2beta1.PrivacyMetric.NumericalStatsConfig; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationMetadata; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.CategoricalStatsResult.CategoricalStatsHistogramBucket; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.KAnonymityResult.KAnonymityEquivalenceClass; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.KAnonymityResult.KAnonymityHistogramBucket; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.LDiversityResult.LDiversityEquivalenceClass; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.LDiversityResult.LDiversityHistogramBucket; +import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.NumericalStatsResult; +import com.google.privacy.dlp.v2beta1.Value; +import com.google.privacy.dlp.v2beta1.ValueFrequency; +import java.util.Arrays; +import 
java.util.List; +import java.util.stream.Collectors; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionGroup; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; + +public class RiskAnalysis { + + private static void calculateNumericalStats( + String projectId, String datasetId, String tableId, String columnName) + throws Exception { + // [START dlp_numerical_stats_analysis] + + /** + * Calculate numerical statistics for a column in a BigQuery table using the DLP API. + * @param projectId The Google Cloud Platform project ID to run the API call under. + * @param datasetId The BigQuery dataset to analyze. + * @param tableId The BigQuery table to analyze. + * @param columnName The name of the column to analyze, which must contain only numerical data. 
+ */ + + // instantiate a client + try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { + + // projectId = ServiceOptions.getDefaultProjectId(); + // datasetId = "my_dataset"; + // tableId = "my_table"; + // columnName = "age"; // must be a numerical column + + FieldId fieldId = + FieldId.newBuilder() + .setColumnName(columnName) + .build(); + + NumericalStatsConfig numericalStatsConfig = + NumericalStatsConfig.newBuilder() + .setField(fieldId) + .build(); + + BigQueryTable bigQueryTable = + BigQueryTable.newBuilder() + .setProjectId(projectId) + .setDatasetId(datasetId) + .setTableId(tableId) + .build(); + + PrivacyMetric privacyMetric = + PrivacyMetric.newBuilder() + .setNumericalStatsConfig(numericalStatsConfig) + .build(); + + AnalyzeDataSourceRiskRequest request = + AnalyzeDataSourceRiskRequest.newBuilder() + .setPrivacyMetric(privacyMetric) + .setSourceTable(bigQueryTable) + .build(); + + // asynchronously submit a risk analysis operation + OperationFuture + responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request); + + // ... 
+ // block on response + RiskAnalysisOperationResult response = responseFuture.get(); + NumericalStatsResult results = + response.getNumericalStatsResult(); + + System.out.println( + "Value range: [" + results.getMinValue() + ", " + results.getMaxValue() + "]"); + + // Print out unique quantiles + String previousValue = ""; + for (int i = 0; i < results.getQuantileValuesCount(); i++) { + Value valueObj = results.getQuantileValues(i); + String value = valueObj.toString(); + + if (!previousValue.equals(value)) { + System.out.println("Value at " + i + "% quantile: " + value); + previousValue = value; + } + } + } catch (Exception e) { + System.out.println("Error in numericalStatsAnalysis: " + e.getMessage()); + } + // [END dlp_numerical_stats_analysis] + } + + private static void calculateCategoricalStats( + String projectId, String datasetId, String tableId, String columnName) + throws Exception { + // [START dlp_categorical_stats_analysis] + /** + * Calculate categorical statistics for a column in a BigQuery table using the DLP API. + * @param projectId The Google Cloud Platform project ID to run the API call under. + * @param datasetId The BigQuery dataset to analyze. + * @param tableId The BigQuery table to analyze. + * @param columnName The name of the column to analyze, which need not contain numerical data. 
+ */ + + // instantiate a client + try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { + + // projectId = process.env.GCLOUD_PROJECT; + // datasetId = "my_dataset"; + // tableId = "my_table"; + // columnName = "firstName"; + + FieldId fieldId = + FieldId.newBuilder() + .setColumnName(columnName) + .build(); + + CategoricalStatsConfig categoricalStatsConfig = + CategoricalStatsConfig.newBuilder() + .setField(fieldId) + .build(); + + BigQueryTable bigQueryTable = + BigQueryTable.newBuilder() + .setProjectId(projectId) + .setDatasetId(datasetId) + .setTableId(tableId) + .build(); + + PrivacyMetric privacyMetric = + PrivacyMetric.newBuilder() + .setCategoricalStatsConfig(categoricalStatsConfig) + .build(); + + AnalyzeDataSourceRiskRequest request = + AnalyzeDataSourceRiskRequest.newBuilder() + .setPrivacyMetric(privacyMetric) + .setSourceTable(bigQueryTable) + .build(); + + // asynchronously submit a risk analysis operation + OperationFuture + responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request); + + // ... + // block on response + RiskAnalysisOperationResult response = responseFuture.get(); + CategoricalStatsHistogramBucket results = + response.getCategoricalStatsResult().getValueFrequencyHistogramBuckets(0); + + System.out.println( + "Most common value occurs " + results.getValueFrequencyUpperBound() + " time(s)"); + System.out.println( + "Least common value occurs " + results.getValueFrequencyLowerBound() + " time(s)"); + + for (ValueFrequency valueFrequency : results.getBucketValuesList()) { + System.out.println("Value " + + valueFrequency.getValue().toString() + + " occurs " + + valueFrequency.getCount() + + " time(s)." 
+ ); + } + + } catch (Exception e) { + System.out.println("Error in categoricalStatsAnalysis: " + e.getMessage()); + } + // [END dlp_categorical_stats_analysis] + } + + private static void calculateKAnonymity( + String projectId, String datasetId, String tableId, List quasiIds) + throws Exception { + // [START dlp_k_anonymity] + /** + * Calculate k-anonymity for quasi-identifiers in a BigQuery table using the DLP API. + * @param projectId The Google Cloud Platform project ID to run the API call under. + * @param datasetId The BigQuery dataset to analyze. + * @param tableId The BigQuery table to analyze. + * @param quasiIds The names of columns that form a composite key ('quasi-identifiers'). + */ + + // instantiate a client + try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { + + // projectId = process.env.GCLOUD_PROJECT; + // datasetId = 'my_dataset'; + // tableId = 'my_table'; + // quasiIds = [{ columnName: 'age' }, { columnName: 'city' }]; + + List quasiIdFields = + quasiIds + .stream() + .map(columnName -> FieldId.newBuilder().setColumnName(columnName).build()) + .collect(Collectors.toList()); + + KAnonymityConfig kanonymityConfig = + KAnonymityConfig.newBuilder() + .addAllQuasiIds(quasiIdFields) + .build(); + + BigQueryTable bigQueryTable = + BigQueryTable.newBuilder() + .setProjectId(projectId) + .setDatasetId(datasetId) + .setTableId(tableId) + .build(); + + PrivacyMetric privacyMetric = + PrivacyMetric.newBuilder() + .setKAnonymityConfig(kanonymityConfig) + .build(); + + AnalyzeDataSourceRiskRequest request = + AnalyzeDataSourceRiskRequest.newBuilder() + .setPrivacyMetric(privacyMetric) + .setSourceTable(bigQueryTable) + .build(); + + // asynchronously submit a risk analysis operation + OperationFuture + responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request); + + // ... 
+ // block on response + RiskAnalysisOperationResult response = responseFuture.get(); + KAnonymityHistogramBucket results = + response.getKAnonymityResult().getEquivalenceClassHistogramBuckets(0); + + System.out.println("Bucket size range: [" + + results.getEquivalenceClassSizeLowerBound() + + ", " + + results.getEquivalenceClassSizeUpperBound() + + "]" + ); + + for (KAnonymityEquivalenceClass bucket : results.getBucketValuesList()) { + List quasiIdValues = bucket.getQuasiIdsValuesList() + .stream() + .map(v -> v.toString()) + .collect(Collectors.toList()); + + System.out.println("\tQuasi-ID values: " + String.join(", ", quasiIdValues)); + System.out.println("\tClass size: " + bucket.getEquivalenceClassSize()); + } + } catch (Exception e) { + System.out.println("Error in kAnonymityAnalysis: " + e.getMessage()); + } + // [END dlp_k_anonymity] + } + + private static void calculateLDiversity( + String projectId, + String datasetId, + String tableId, + String sensitiveAttribute, + List quasiIds + ) throws Exception { + // [START dlp_l_diversity] + /** + * Calculate l-diversity for an attribute relative to quasi-identifiers in a BigQuery table. + * @param projectId The Google Cloud Platform project ID to run the API call under. + * @param datasetId The BigQuery dataset to analyze. + * @param tableId The BigQuery table to analyze. + * @param sensitiveAttribute The name of the attribute to compare the quasi-ID against + * @param quasiIds A set of column names that form a composite key ('quasi-identifiers'). 
+ */ + + // instantiate a client + try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) { + + // projectId = process.env.GCLOUD_PROJECT; + // datasetId = "my_dataset"; + // tableId = "my_table"; + // sensitiveAttribute = "name"; + // quasiIds = [{ columnName: "age" }, { columnName: "city" }]; + + FieldId sensitiveAttributeField = + FieldId.newBuilder() + .setColumnName(sensitiveAttribute) + .build(); + + List quasiIdFields = + quasiIds + .stream() + .map(columnName -> FieldId.newBuilder().setColumnName(columnName).build()) + .collect(Collectors.toList()); + + LDiversityConfig ldiversityConfig = + LDiversityConfig.newBuilder() + .addAllQuasiIds(quasiIdFields) + .setSensitiveAttribute(sensitiveAttributeField) + .build(); + + BigQueryTable bigQueryTable = + BigQueryTable.newBuilder() + .setProjectId(projectId) + .setDatasetId(datasetId) + .setTableId(tableId) + .build(); + + PrivacyMetric privacyMetric = + PrivacyMetric.newBuilder() + .setLDiversityConfig(ldiversityConfig) + .build(); + + AnalyzeDataSourceRiskRequest request = + AnalyzeDataSourceRiskRequest.newBuilder() + .setPrivacyMetric(privacyMetric) + .setSourceTable(bigQueryTable) + .build(); + + // asynchronously submit a risk analysis operation + OperationFuture + responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request); + + // ... 
+ // block on response + RiskAnalysisOperationResult response = responseFuture.get(); + LDiversityHistogramBucket results = + response.getLDiversityResult().getSensitiveValueFrequencyHistogramBuckets(0); + + for (LDiversityEquivalenceClass bucket : results.getBucketValuesList()) { + List quasiIdValues = bucket.getQuasiIdsValuesList() + .stream() + .map(v -> v.toString()) + .collect(Collectors.toList()); + + System.out.println("\tQuasi-ID values: " + String.join(", ", quasiIdValues)); + System.out.println("\tClass size: " + bucket.getEquivalenceClassSize()); + + for (ValueFrequency valueFrequency : bucket.getTopSensitiveValuesList()) { + System.out.println("\t\tSensitive value " + + valueFrequency.getValue().toString() + + " occurs " + + valueFrequency.getCount() + + " time(s)."); + } + } + } catch (Exception e) { + System.out.println("Error in lDiversityAnalysis: " + e.getMessage()); + } + // [END dlp_l_diversity] + } + + + /** + * Command line application to perform risk analysis using the Data Loss Prevention API. 
+ * Supported data format: BigQuery tables + */ + public static void main(String[] args) throws Exception { + + OptionGroup optionsGroup = new OptionGroup(); + optionsGroup.setRequired(true); + + Option numericalAnalysisOption = new Option("n", "numerical"); + optionsGroup.addOption(numericalAnalysisOption); + + Option categoricalAnalysisOption = new Option("c", "categorical"); + optionsGroup.addOption(categoricalAnalysisOption); + + Option kanonymityOption = new Option("k", "kAnonymity"); + optionsGroup.addOption(kanonymityOption); + + Option ldiversityOption = new Option("l", "lDiversity"); + optionsGroup.addOption(ldiversityOption); + + Options commandLineOptions = new Options(); + commandLineOptions.addOptionGroup(optionsGroup); + + Option datasetIdOption = Option.builder("datasetId").hasArg(true).required(false).build(); + commandLineOptions.addOption(datasetIdOption); + + Option tableIdOption = Option.builder("tableId").hasArg(true).required(false).build(); + commandLineOptions.addOption(tableIdOption); + + Option projectIdOption = Option.builder("projectId").hasArg(true).required(false).build(); + commandLineOptions.addOption(projectIdOption); + + Option columnNameOption = + Option.builder("columnName").hasArg(true).required(false).build(); + commandLineOptions.addOption(columnNameOption); + + Option sensitiveAttributeOption = + Option.builder("sensitiveAttribute").hasArg(true).required(false).build(); + commandLineOptions.addOption(sensitiveAttributeOption); + + Option quasiIdColumnNamesOption = + Option.builder("quasiIdColumnNames").hasArg(true).required(false).build(); + commandLineOptions.addOption(quasiIdColumnNamesOption); + + CommandLineParser parser = new DefaultParser(); + HelpFormatter formatter = new HelpFormatter(); + CommandLine cmd; + + try { + cmd = parser.parse(commandLineOptions, args); + } catch (ParseException e) { + System.out.println(e.getMessage()); + formatter.printHelp(RiskAnalysis.class.getName(), commandLineOptions); + 
System.exit(1); + return; + } + + String datasetId = cmd.getOptionValue(datasetIdOption.getOpt()); + String tableId = cmd.getOptionValue(tableIdOption.getOpt()); + // use default project id when project id is not specified + String projectId = + cmd.getOptionValue( + projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId()); + + if (cmd.hasOption("n")) { + // numerical stats analysis + String columnName = cmd.getOptionValue(columnNameOption.getOpt()); + calculateNumericalStats(projectId, datasetId, tableId, columnName); + } else if (cmd.hasOption("c")) { + // categorical stats analysis + String columnName = cmd.getOptionValue(columnNameOption.getOpt()); + calculateCategoricalStats(projectId, datasetId, tableId, columnName); + } else if (cmd.hasOption("k")) { + // k-anonymity analysis + List quasiIdColumnNames = + Arrays.asList(cmd.getOptionValues(quasiIdColumnNamesOption.getOpt())); + calculateKAnonymity(projectId, datasetId, tableId, quasiIdColumnNames); + } else if (cmd.hasOption("l")) { + // l-diversity analysis + String sensitiveAttribute = cmd.getOptionValue(sensitiveAttributeOption.getOpt()); + List quasiIdColumnNames = + Arrays.asList(cmd.getOptionValues(quasiIdColumnNamesOption.getOpt())); + calculateLDiversity(projectId, datasetId, tableId, sensitiveAttribute, quasiIdColumnNames); + } + } +} diff --git a/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java b/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java new file mode 100644 index 00000000000..0097cee5af1 --- /dev/null +++ b/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java @@ -0,0 +1,84 @@ +/** + * Copyright 2017 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.dlp; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.util.regex.Pattern; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class DeIdentificationIT { + private ByteArrayOutputStream bout; + private PrintStream out; + + // Update to wrapped local encryption key + private String wrappedKey = System.getenv("DLP_DEID_WRAPPED_KEY"); + + // Update to name of KMS key used to wrap local encryption key + private String keyName = System.getenv("DLP_DEID_KEY_NAME"); + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + System.setOut(out); // TODO(b/64541432) DLP currently doesn't support GOOGLE DEFAULT AUTH + assertNotNull(System.getenv("GOOGLE_APPLICATION_CREDENTIALS")); + assertNotNull(System.getenv("DLP_DEID_WRAPPED_KEY")); + assertNotNull(System.getenv("DLP_DEID_KEY_NAME")); + } + + @Test + public void testDeidStringMasksCharacters() throws Exception { + String text = "\"My SSN is 372819127\""; + DeIdentification.main(new String[] { + "-m", text, + "-maskingCharacter", "x", + "-numberToMask", "5" + }); + String output = bout.toString(); + assertEquals(output, "My SSN is xxxxx9127\n"); + } + + @Test + public void 
testDeidStringPerformsFpe() throws Exception { + String text = "\"My SSN is 372819127\""; + DeIdentification.main(new String[] { + "-f", text, + "-wrappedKey", wrappedKey, + "-keyName", keyName + }); + String output = bout.toString(); + assertFalse(output.contains(text)); + assertTrue(Pattern.compile("My SSN is \\w+").matcher(output).find()); + } + + @After + public void tearDown() { + System.setOut(null); + bout.reset(); + } +} diff --git a/dlp/src/test/java/com/example/dlp/InspectIT.java b/dlp/src/test/java/com/example/dlp/InspectIT.java index 788236a72fb..618c96d0a5c 100644 --- a/dlp/src/test/java/com/example/dlp/InspectIT.java +++ b/dlp/src/test/java/com/example/dlp/InspectIT.java @@ -87,7 +87,7 @@ public void testGcsFileInspectionReturnsInfoTypes() throws Exception { assertTrue(output.contains("EMAIL_ADDRESS")); } - // Requires a Datastore kind containing an entity + // Requires a Datastore kind containing an entity // with phone number and email address properties. @Test public void testDatastoreInspectionReturnsInfoTypes() throws Exception { @@ -97,6 +97,17 @@ public void testDatastoreInspectionReturnsInfoTypes() throws Exception { assertTrue(output.contains("EMAIL_ADDRESS")); } + @Test + public void testBigqueryInspectionReturnsInfoTypes() throws Exception { + Inspect.main(new String[] { + "-bq", + "-datasetId", "integration_tests_dlp", + "-tableId", "harmful" + }); + String output = bout.toString(); + assertTrue(output.contains("PHONE_NUMBER")); + } + @After public void tearDown() { System.setOut(null); diff --git a/dlp/src/test/java/com/example/dlp/RiskAnalysisIT.java b/dlp/src/test/java/com/example/dlp/RiskAnalysisIT.java new file mode 100644 index 00000000000..de5fa22c722 --- /dev/null +++ b/dlp/src/test/java/com/example/dlp/RiskAnalysisIT.java @@ -0,0 +1,108 @@ +/** + * Copyright 2017 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.dlp; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.util.regex.Pattern; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RiskAnalysisIT { + private ByteArrayOutputStream bout; + private PrintStream out; + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + System.setOut(out); // TODO(b/64541432) DLP currently doesn't support GOOGLE DEFAULT AUTH + assertNotNull(System.getenv("GOOGLE_APPLICATION_CREDENTIALS")); + assertNotNull(System.getenv("DLP_DEID_WRAPPED_KEY")); + assertNotNull(System.getenv("DLP_DEID_KEY_NAME")); + } + + @Test + public void testNumericalStats() throws Exception { + RiskAnalysis.main(new String[] { + "-n", + "-datasetId", "integration_tests_dlp", + "-tableId", "harmful", + "-columnName", "Age" + }); + String output = bout.toString(); + assertTrue(Pattern.compile( + "Value at 0% quantile: integer_value: \\d{2}").matcher(output).find()); + assertTrue(Pattern.compile( + "Value at \\d{2}% quantile: integer_value: \\d{2}").matcher(output).find()); + } + + @Test + public void testCategoricalStats() throws Exception { + RiskAnalysis.main(new String[] { + "-c", + "-datasetId", "integration_tests_dlp", + "-tableId", "harmful", + "-columnName", "Mystery" + }); + String output = bout.toString(); + 
assertTrue(Pattern.compile( + "Most common value occurs \\d time\\(s\\)").matcher(output).find()); + } + + @Test + public void testKAnonymity() throws Exception { + RiskAnalysis.main(new String[] { + "-k", + "-datasetId", "integration_tests_dlp", + "-tableId", "harmful", + "-quasiIdColumnNames", "Age", "Mystery" + }); + String output = bout.toString(); + assertTrue(Pattern.compile("Bucket size range: \\[\\d, \\d\\]").matcher(output).find()); + assertTrue(output.contains("Quasi-ID values: integer_value: 19")); + assertTrue(output.contains("Class size: 1")); + } + + @Test + public void testLDiversity() throws Exception { + RiskAnalysis.main(new String[] { + "-l", + "-datasetId", "integration_tests_dlp", + "-tableId", "harmful", + "-sensitiveAttribute", "Name", + "-quasiIdColumnNames", "Age", "Mystery" + }); + String output = bout.toString(); + assertTrue(output.contains("Quasi-ID values: integer_value: 19")); + assertTrue(output.contains("Class size: 1")); + assertTrue(output.contains("Sensitive value string_value: \"James\"")); + } + + @After + public void tearDown() { + System.setOut(null); + bout.reset(); + } +}