diff --git a/dlp/src/main/java/com/example/dlp/DeIdentification.java b/dlp/src/main/java/com/example/dlp/DeIdentification.java
new file mode 100644
index 00000000000..296fc582f78
--- /dev/null
+++ b/dlp/src/main/java/com/example/dlp/DeIdentification.java
@@ -0,0 +1,274 @@
+/**
+ * Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.dlp;
+
+import com.google.cloud.dlp.v2beta1.DlpServiceClient;
+import com.google.common.io.BaseEncoding;
+import com.google.privacy.dlp.v2beta1.CharacterMaskConfig;
+import com.google.privacy.dlp.v2beta1.ContentItem;
+import com.google.privacy.dlp.v2beta1.CryptoKey;
+import com.google.privacy.dlp.v2beta1.CryptoReplaceFfxFpeConfig;
+import com.google.privacy.dlp.v2beta1.CryptoReplaceFfxFpeConfig.FfxCommonNativeAlphabet;
+import com.google.privacy.dlp.v2beta1.DeidentifyConfig;
+import com.google.privacy.dlp.v2beta1.DeidentifyContentRequest;
+import com.google.privacy.dlp.v2beta1.DeidentifyContentResponse;
+import com.google.privacy.dlp.v2beta1.InfoTypeTransformations;
+import com.google.privacy.dlp.v2beta1.InfoTypeTransformations.InfoTypeTransformation;
+import com.google.privacy.dlp.v2beta1.KmsWrappedCryptoKey;
+import com.google.privacy.dlp.v2beta1.PrimitiveTransformation;
+import com.google.protobuf.ByteString;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionGroup;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+public class DeIdentification {
+
+  private static void deIdentifyWithMask(
+      String string,
+      Character maskingCharacter,
+      int numberToMask) {
+    // [START dlp_deidentify_mask]
+    /**
+     * Replace sensitive data in a string with a masking character using the DLP API.
+     * @param string The text to de-identify.
+     * @param maskingCharacter The character written over masked data.
+     * @param numberToMask (Optional) How many characters of sensitive data to mask.
+     *                     Omitting this value or setting it to 0 masks all sensitive chars.
+     */
+
+    // create the client; try-with-resources closes it on exit
+    try (DlpServiceClient dlp = DlpServiceClient.create()) {
+
+      // string = "My SSN is 372819127";
+      // numberToMask = 5;
+      // maskingCharacter = 'x';
+
+      ContentItem textItem =
+          ContentItem.newBuilder()
+              .setValue(string)
+              .setType("text/plain")
+              .build();
+
+      CharacterMaskConfig maskConfig =
+          CharacterMaskConfig.newBuilder()
+              .setNumberToMask(numberToMask)
+              .setMaskingCharacter(maskingCharacter.toString())
+              .build();
+
+      // Wrap the mask settings in a transformation
+      PrimitiveTransformation maskTransformation =
+          PrimitiveTransformation.newBuilder()
+              .setCharacterMaskConfig(maskConfig)
+              .build();
+
+      InfoTypeTransformation perInfoType =
+          InfoTypeTransformation.newBuilder()
+              .setPrimitiveTransformation(maskTransformation)
+              .build();
+
+      InfoTypeTransformations allTransformations =
+          InfoTypeTransformations.newBuilder()
+              .addTransformations(perInfoType)
+              .build();
+
+      // Assemble the de-identification config and the request that carries the text
+      DeidentifyConfig config =
+          DeidentifyConfig.newBuilder()
+              .setInfoTypeTransformations(allTransformations)
+              .build();
+
+      DeidentifyContentRequest deidRequest =
+          DeidentifyContentRequest.newBuilder()
+              .addItems(textItem)
+              .setDeidentifyConfig(config)
+              .build();
+
+      // Send the request
+      DeidentifyContentResponse deidResponse = dlp.deidentifyContent(deidRequest);
+
+      // Print the value of each returned item
+      // e.g. "My SSN is 123456789" --> "My SSN is *********"
+      for (ContentItem masked : deidResponse.getItemsList()) {
+        System.out.println(masked.getValue());
+      }
+    } catch (Exception e) {
+      System.out.println("Error in deidentifyWithMask: " + e.getMessage());
+    }
+    // [END dlp_deidentify_mask]
+  }
+
+  private static void deIdentifyWithFpe(
+      String string, FfxCommonNativeAlphabet alphabet, String keyName, String wrappedKey) {
+    // [START dlp_deidentify_fpe]
+    /**
+     * Encrypt sensitive data in a string with format-preserving encryption via the DLP API.
+     * @param string The text to de-identify.
+     * @param alphabet The set of characters to use when encrypting the input. For details,
+     *                 see cloud.google.com/dlp/docs/reference/rest/v2beta1/content/deidentify
+     * @param keyName The name of the Cloud KMS key to use when decrypting the wrapped key.
+     * @param wrappedKey The base64-encoded, encrypted ("wrapped") AES-256 encryption key.
+     */
+
+    // create the client; try-with-resources closes it on exit
+    try (DlpServiceClient dlp = DlpServiceClient.create()) {
+
+      // string = "My SSN is 372819127";
+      // alphabet = FfxCommonNativeAlphabet.ALPHA_NUMERIC;
+      // keyName = "projects/GCP_PROJECT/locations/REGION/keyRings/KEYRING_ID/cryptoKeys/KEY_NAME";
+      // wrappedKey = "YOUR_ENCRYPTED_AES_256_KEY";
+
+      ContentItem textItem =
+          ContentItem.newBuilder()
+              .setValue(string)
+              .setType("text/plain")
+              .build();
+
+      // Describe the KMS-wrapped key used for format-preserving encryption (FPE)
+      KmsWrappedCryptoKey wrappedCryptoKey =
+          KmsWrappedCryptoKey.newBuilder()
+              .setWrappedKey(ByteString.copyFrom(BaseEncoding.base64().decode(wrappedKey)))
+              .setCryptoKeyName(keyName)
+              .build();
+
+      CryptoKey key =
+          CryptoKey.newBuilder()
+              .setKmsWrapped(wrappedCryptoKey)
+              .build();
+
+      CryptoReplaceFfxFpeConfig fpeConfig =
+          CryptoReplaceFfxFpeConfig.newBuilder()
+              .setCommonAlphabet(alphabet)
+              .setCryptoKey(key)
+              .build();
+
+      // Wrap the FPE settings in a transformation
+      PrimitiveTransformation fpeTransformation =
+          PrimitiveTransformation.newBuilder()
+              .setCryptoReplaceFfxFpeConfig(fpeConfig)
+              .build();
+
+      InfoTypeTransformation perInfoType =
+          InfoTypeTransformation.newBuilder()
+              .setPrimitiveTransformation(fpeTransformation)
+              .build();
+
+      InfoTypeTransformations allTransformations =
+          InfoTypeTransformations.newBuilder()
+              .addTransformations(perInfoType)
+              .build();
+
+      // Assemble the de-identification config and the request that carries the text
+      DeidentifyConfig config =
+          DeidentifyConfig.newBuilder()
+              .setInfoTypeTransformations(allTransformations)
+              .build();
+
+      DeidentifyContentRequest deidRequest =
+          DeidentifyContentRequest.newBuilder()
+              .addItems(textItem)
+              .setDeidentifyConfig(config)
+              .build();
+
+      // Send the request
+      DeidentifyContentResponse deidResponse = dlp.deidentifyContent(deidRequest);
+
+      // Print the value of each returned item
+      // e.g. "My SSN is 123456789" --> "My SSN is 726129862"
+      for (ContentItem encrypted : deidResponse.getItemsList()) {
+        System.out.println(encrypted.getValue());
+      }
+    } catch (Exception e) {
+      System.out.println("Error in deidentifyWithFpe: " + e.getMessage());
+    }
+    // [END dlp_deidentify_fpe]
+  }
+
+  /**
+   * Command line entry point for de-identifying data with the Data Loss Prevention API.
+   * Supported data format: strings
+   */
+  public static void main(String[] args) throws Exception {
+
+    OptionGroup modeGroup = new OptionGroup();
+    modeGroup.setRequired(true);
+
+    Option maskModeOption = new Option("m", "mask", true, "deid with character masking");
+    modeGroup.addOption(maskModeOption);
+
+    Option fpeModeOption = new Option("f", "fpe", true, "deid with FFX FPE");
+    modeGroup.addOption(fpeModeOption);
+
+    Options cliOptions = new Options();
+    cliOptions.addOptionGroup(modeGroup);
+
+    Option maskCharOption =
+        Option.builder("maskingCharacter").hasArg(true).required(false).build();
+    cliOptions.addOption(maskCharOption);
+
+    Option maskCountOption =
+        Option.builder("numberToMask").hasArg(true).required(false).build();
+    cliOptions.addOption(maskCountOption);
+
+    Option alphabetOption =
+        Option.builder("commonAlphabet").hasArg(true).required(false).build();
+    cliOptions.addOption(alphabetOption);
+
+    Option wrappedKeyOption =
+        Option.builder("wrappedKey").hasArg(true).required(false).build();
+    cliOptions.addOption(wrappedKeyOption);
+
+    Option keyNameOption =
+        Option.builder("keyName").hasArg(true).required(false).build();
+    cliOptions.addOption(keyNameOption);
+
+    // Parse the arguments; on a parse error print the message plus usage and exit with 1
+    CommandLineParser argParser = new DefaultParser();
+    CommandLine cmd;
+
+    try {
+      cmd = argParser.parse(cliOptions, args);
+    } catch (ParseException e) {
+      System.out.println(e.getMessage());
+      new HelpFormatter().printHelp(DeIdentification.class.getName(), cliOptions);
+      System.exit(1);
+      return;
+    }
+
+    if (cmd.hasOption(maskModeOption.getOpt())) {
+      // character masking; numberToMask falls back to "0" and the mask character to "*"
+      int numberToMask = Integer.parseInt(cmd.getOptionValue(maskCountOption.getOpt(), "0"));
+      char maskingCharacter = cmd.getOptionValue(maskCharOption.getOpt(), "*").charAt(0);
+      String text = cmd.getOptionValue(maskModeOption.getOpt());
+      deIdentifyWithMask(text, maskingCharacter, numberToMask);
+    } else if (cmd.hasOption(fpeModeOption.getOpt())) {
+      // format-preserving encryption; the alphabet falls back to ALPHA_NUMERIC
+      String wrappedKey = cmd.getOptionValue(wrappedKeyOption.getOpt());
+      String keyName = cmd.getOptionValue(keyNameOption.getOpt());
+      String text = cmd.getOptionValue(fpeModeOption.getOpt());
+      String alphabetName =
+          cmd.getOptionValue(
+              alphabetOption.getOpt(), FfxCommonNativeAlphabet.ALPHA_NUMERIC.name());
+      FfxCommonNativeAlphabet alphabet = FfxCommonNativeAlphabet.valueOf(alphabetName);
+      deIdentifyWithFpe(text, alphabet, keyName, wrappedKey);
+    }
+  }
+}
diff --git a/dlp/src/main/java/com/example/dlp/Inspect.java b/dlp/src/main/java/com/example/dlp/Inspect.java
index 03e3b616b09..b39be30adc7 100644
--- a/dlp/src/main/java/com/example/dlp/Inspect.java
+++ b/dlp/src/main/java/com/example/dlp/Inspect.java
@@ -20,6 +20,8 @@
import com.google.cloud.ServiceOptions;
import com.google.cloud.dlp.v2beta1.DlpServiceClient;
import com.google.longrunning.Operation;
+import com.google.privacy.dlp.v2beta1.BigQueryOptions;
+import com.google.privacy.dlp.v2beta1.BigQueryTable;
import com.google.privacy.dlp.v2beta1.CloudStorageOptions;
import com.google.privacy.dlp.v2beta1.CloudStorageOptions.FileSet;
import com.google.privacy.dlp.v2beta1.ContentItem;
@@ -332,9 +334,88 @@ private static void inspectDatastore(
// [END dlp_inspect_datastore]
}
+ private static void inspectBigquery(
+ String projectId,
+ String datasetId,
+ String tableId,
+ Likelihood minLikelihood,
+ List infoTypes) {
+ // [START dlp_inspect_bigquery]
+ // Instantiates a client
+ try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
+
+ // (Optional) The project ID to run the API call under
+ // projectId = my-project-id
+
+ // The ID of the dataset to inspect, e.g. 'my_dataset'
+ // datasetId = "my_dataset";
+
+ // The ID of the table to inspect, e.g. 'my_table'
+ // tableId = "my_table";
+
+ // The minimum likelihood required before returning a match
+ // minLikelihood = LIKELIHOOD_UNSPECIFIED;
+
+ // The infoTypes of information to match
+ // infoTypes = ['US_MALE_NAME', 'US_FEMALE_NAME'];
+
+ // Reference to the BigQuery table
+ BigQueryTable tableReference =
+ BigQueryTable.newBuilder()
+ .setProjectId(projectId)
+ .setDatasetId(datasetId)
+ .setTableId(tableId)
+ .build();
+ BigQueryOptions bigQueryOptions =
+ BigQueryOptions.newBuilder()
+ .setTableReference(tableReference)
+ .build();
+
+ // Construct BigQuery configuration to be inspected
+ StorageConfig storageConfig =
+ StorageConfig.newBuilder()
+ .setBigQueryOptions(bigQueryOptions)
+ .build();
+
+ InspectConfig inspectConfig =
+ InspectConfig.newBuilder()
+ .addAllInfoTypes(infoTypes)
+ .setMinLikelihood(minLikelihood)
+ .build();
+
+ // optionally provide an output configuration to store results, default : none
+ OutputStorageConfig outputConfig = OutputStorageConfig.getDefaultInstance();
+
+ // asynchronously submit an inspect operation
+ OperationFuture responseFuture =
+ dlpServiceClient.createInspectOperationAsync(
+ inspectConfig, storageConfig, outputConfig);
+
+ // ...
+ // block on response, returning job id of the operation
+ InspectOperationResult inspectOperationResult = responseFuture.get();
+ ResultName resultName = inspectOperationResult.getNameAsResultName();
+ InspectResult inspectResult = dlpServiceClient.listInspectFindings(resultName).getResult();
+
+ if (inspectResult.getFindingsCount() > 0) {
+ System.out.println("Findings: ");
+ for (Finding finding : inspectResult.getFindingsList()) {
+ System.out.print("\tInfo type: " + finding.getInfoType().getName());
+ System.out.println("\tLikelihood: " + finding.getLikelihood());
+ }
+ } else {
+ System.out.println("No findings.");
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.out.println("Error in inspectBigguery: " + e.getMessage());
+ }
+ // [END dlp_inspect_bigquery]
+ }
+
/**
* Command line application to inspect data using the Data Loss Prevention API.
- * Supported data formats : string, file, text files on GCS and Datastore entities
+ * Supported data formats: string, file, text file on GCS, BigQuery table, and Datastore entity
*/
public static void main(String[] args) throws Exception {
@@ -352,6 +433,9 @@ public static void main(String[] args) throws Exception {
Option datastoreOption = new Option("ds", "Google Datastore", false, "inspect Datastore kind");
optionsGroup.addOption(datastoreOption);
+ Option bigqueryOption = new Option("bq", "Google BigQuery", false, "inspect BigQuery table");
+ optionsGroup.addOption(bigqueryOption);
+
Options commandLineOptions = new Options();
commandLineOptions.addOptionGroup(optionsGroup);
@@ -377,9 +461,15 @@ public static void main(String[] args) throws Exception {
Option gcsFileNameOption = Option.builder("fileName").hasArg(true).required(false).build();
commandLineOptions.addOption(gcsFileNameOption);
- Option datastoreProjectIdOption =
+ Option datasetIdOption = Option.builder("datasetId").hasArg(true).required(false).build();
+ commandLineOptions.addOption(datasetIdOption);
+
+ Option tableIdOption = Option.builder("tableId").hasArg(true).required(false).build();
+ commandLineOptions.addOption(tableIdOption);
+
+ Option projectIdOption =
Option.builder("projectId").hasArg(true).required(false).build();
- commandLineOptions.addOption(datastoreProjectIdOption);
+ commandLineOptions.addOption(projectIdOption);
Option datastoreNamespaceOption =
Option.builder("namespace").hasArg(true).required(false).build();
@@ -436,8 +526,16 @@ public static void main(String[] args) throws Exception {
// use default project id when project id is not specified
String projectId =
cmd.getOptionValue(
- datastoreProjectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
+ projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
inspectDatastore(projectId, namespaceId, kind, minLikelihood, infoTypesList);
+ } else if (cmd.hasOption("bq")) {
+ String datasetId = cmd.getOptionValue(datasetIdOption.getOpt());
+ String tableId = cmd.getOptionValue(tableIdOption.getOpt());
+ // use default project id when project id is not specified
+ String projectId =
+ cmd.getOptionValue(
+ projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
+ inspectBigquery(projectId, datasetId, tableId, minLikelihood, infoTypesList);
}
}
}
diff --git a/dlp/src/main/java/com/example/dlp/RiskAnalysis.java b/dlp/src/main/java/com/example/dlp/RiskAnalysis.java
new file mode 100644
index 00000000000..5fc95bdec86
--- /dev/null
+++ b/dlp/src/main/java/com/example/dlp/RiskAnalysis.java
@@ -0,0 +1,474 @@
+/**
+ * Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.dlp;
+
+import com.google.api.gax.rpc.OperationFuture;
+import com.google.cloud.ServiceOptions;
+import com.google.cloud.dlp.v2beta1.DlpServiceClient;
+import com.google.longrunning.Operation;
+import com.google.privacy.dlp.v2beta1.AnalyzeDataSourceRiskRequest;
+import com.google.privacy.dlp.v2beta1.BigQueryTable;
+import com.google.privacy.dlp.v2beta1.FieldId;
+import com.google.privacy.dlp.v2beta1.PrivacyMetric;
+import com.google.privacy.dlp.v2beta1.PrivacyMetric.CategoricalStatsConfig;
+import com.google.privacy.dlp.v2beta1.PrivacyMetric.KAnonymityConfig;
+import com.google.privacy.dlp.v2beta1.PrivacyMetric.LDiversityConfig;
+import com.google.privacy.dlp.v2beta1.PrivacyMetric.NumericalStatsConfig;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationMetadata;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.CategoricalStatsResult.CategoricalStatsHistogramBucket;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.KAnonymityResult.KAnonymityEquivalenceClass;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.KAnonymityResult.KAnonymityHistogramBucket;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.LDiversityResult.LDiversityEquivalenceClass;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.LDiversityResult.LDiversityHistogramBucket;
+import com.google.privacy.dlp.v2beta1.RiskAnalysisOperationResult.NumericalStatsResult;
+import com.google.privacy.dlp.v2beta1.Value;
+import com.google.privacy.dlp.v2beta1.ValueFrequency;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionGroup;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+public class RiskAnalysis {
+
+  private static void calculateNumericalStats(
+      String projectId, String datasetId, String tableId, String columnName)
+      throws Exception {
+    // [START dlp_numerical_stats_analysis]
+    /**
+     * Calculate numerical statistics for a column in a BigQuery table using the DLP API.
+     * @param projectId The project ID set on the BigQuery table reference.
+     * @param datasetId The BigQuery dataset to analyze.
+     * @param tableId The BigQuery table to analyze.
+     * @param columnName The name of the column to analyze, which must contain only numerical data.
+     */
+
+    // instantiate a client
+    try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
+
+      // projectId = "my-project-id";
+      // datasetId = "my_dataset";
+      // tableId = "my_table";
+      // columnName = "age";
+
+      FieldId fieldId =
+          FieldId.newBuilder()
+              .setColumnName(columnName)
+              .build();
+
+      NumericalStatsConfig numericalStatsConfig =
+          NumericalStatsConfig.newBuilder()
+              .setField(fieldId)
+              .build();
+
+      BigQueryTable bigQueryTable =
+          BigQueryTable.newBuilder()
+              .setProjectId(projectId)
+              .setDatasetId(datasetId)
+              .setTableId(tableId)
+              .build();
+
+      PrivacyMetric privacyMetric =
+          PrivacyMetric.newBuilder()
+              .setNumericalStatsConfig(numericalStatsConfig)
+              .build();
+
+      AnalyzeDataSourceRiskRequest request =
+          AnalyzeDataSourceRiskRequest.newBuilder()
+              .setPrivacyMetric(privacyMetric)
+              .setSourceTable(bigQueryTable)
+              .build();
+
+      // asynchronously submit a risk analysis operation
+      OperationFuture<RiskAnalysisOperationResult, RiskAnalysisOperationMetadata, Operation>
+          responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request);
+
+      // ...
+      // block on response
+      RiskAnalysisOperationResult response = responseFuture.get();
+      NumericalStatsResult results =
+          response.getNumericalStatsResult();
+
+      // a range reads low-to-high: minimum first, then maximum
+      System.out.println(
+          "Value range: [" + results.getMinValue() + ", " + results.getMaxValue() + "]");
+
+      // Print a quantile only when its value differs from the previous quantile's value
+      String previousValue = "";
+      for (int i = 0; i < results.getQuantileValuesCount(); i++) {
+        Value valueObj = results.getQuantileValues(i);
+        String value = valueObj.toString();
+
+        if (!previousValue.equals(value)) {
+          System.out.println("Value at " + i + "% quantile: " + value);
+          previousValue = value;
+        }
+      }
+    } catch (Exception e) {
+      System.out.println("Error in numericalStatsAnalysis: " + e.getMessage());
+    }
+    // [END dlp_numerical_stats_analysis]
+  }
+
+  private static void calculateCategoricalStats(
+      String projectId, String datasetId, String tableId, String columnName)
+      throws Exception {
+    // [START dlp_categorical_stats_analysis]
+    /**
+     * Calculate categorical statistics for a column in a BigQuery table using the DLP API.
+     * @param projectId The project ID set on the BigQuery table reference.
+     * @param datasetId The BigQuery dataset to analyze.
+     * @param tableId The BigQuery table to analyze.
+     * @param columnName The name of the column to analyze, which need not contain numerical data.
+     */
+
+    // instantiate a client
+    try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
+
+      // projectId = "my-project-id";
+      // datasetId = "my_dataset";
+      // tableId = "my_table";
+      // columnName = "firstName";
+
+      FieldId fieldId =
+          FieldId.newBuilder()
+              .setColumnName(columnName)
+              .build();
+
+      CategoricalStatsConfig categoricalStatsConfig =
+          CategoricalStatsConfig.newBuilder()
+              .setField(fieldId)
+              .build();
+
+      BigQueryTable bigQueryTable =
+          BigQueryTable.newBuilder()
+              .setProjectId(projectId)
+              .setDatasetId(datasetId)
+              .setTableId(tableId)
+              .build();
+
+      PrivacyMetric privacyMetric =
+          PrivacyMetric.newBuilder()
+              .setCategoricalStatsConfig(categoricalStatsConfig)
+              .build();
+
+      AnalyzeDataSourceRiskRequest request =
+          AnalyzeDataSourceRiskRequest.newBuilder()
+              .setPrivacyMetric(privacyMetric)
+              .setSourceTable(bigQueryTable)
+              .build();
+
+      // asynchronously submit a risk analysis operation
+      OperationFuture<RiskAnalysisOperationResult, RiskAnalysisOperationMetadata, Operation>
+          responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request);
+
+      // block on response
+      // only the first histogram bucket (index 0) of the result is reported below
+      RiskAnalysisOperationResult response = responseFuture.get();
+      CategoricalStatsHistogramBucket results =
+          response.getCategoricalStatsResult().getValueFrequencyHistogramBuckets(0);
+
+      System.out.println(
+          "Most common value occurs " + results.getValueFrequencyUpperBound() + " time(s)");
+      System.out.println(
+          "Least common value occurs " + results.getValueFrequencyLowerBound() + " time(s)");
+
+      for (ValueFrequency valueFrequency : results.getBucketValuesList()) {
+        System.out.println("Value "
+            + valueFrequency.getValue().toString()
+            + " occurs "
+            + valueFrequency.getCount()
+            + " time(s)."
+        );
+      }
+
+    } catch (Exception e) {
+      System.out.println("Error in categoricalStatsAnalysis: " + e.getMessage());
+    }
+    // [END dlp_categorical_stats_analysis]
+  }
+
+  private static void calculateKAnonymity(
+      String projectId, String datasetId, String tableId, List<String> quasiIds)
+      throws Exception {
+    // [START dlp_k_anonymity]
+    /**
+     * Calculate k-anonymity for quasi-identifiers in a BigQuery table using the DLP API.
+     * @param projectId The project ID set on the BigQuery table reference.
+     * @param datasetId The BigQuery dataset to analyze.
+     * @param tableId The BigQuery table to analyze.
+     * @param quasiIds The names of columns that form a composite key ('quasi-identifiers').
+     */
+
+    // instantiate a client
+    try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
+
+      // projectId = "my-project-id";
+      // datasetId = "my_dataset";
+      // tableId = "my_table";
+      // quasiIds = Arrays.asList("age", "city");
+
+      List<FieldId> quasiIdFields =
+          quasiIds
+              .stream()
+              .map(columnName -> FieldId.newBuilder().setColumnName(columnName).build())
+              .collect(Collectors.toList());
+
+      KAnonymityConfig kanonymityConfig =
+          KAnonymityConfig.newBuilder()
+              .addAllQuasiIds(quasiIdFields)
+              .build();
+
+      BigQueryTable bigQueryTable =
+          BigQueryTable.newBuilder()
+              .setProjectId(projectId)
+              .setDatasetId(datasetId)
+              .setTableId(tableId)
+              .build();
+
+      PrivacyMetric privacyMetric =
+          PrivacyMetric.newBuilder()
+              .setKAnonymityConfig(kanonymityConfig)
+              .build();
+
+      AnalyzeDataSourceRiskRequest request =
+          AnalyzeDataSourceRiskRequest.newBuilder()
+              .setPrivacyMetric(privacyMetric)
+              .setSourceTable(bigQueryTable)
+              .build();
+
+      // asynchronously submit a risk analysis operation
+      OperationFuture<RiskAnalysisOperationResult, RiskAnalysisOperationMetadata, Operation>
+          responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request);
+
+      // block on response
+      // only the first histogram bucket (index 0) of the result is reported below
+      RiskAnalysisOperationResult response = responseFuture.get();
+      KAnonymityHistogramBucket results =
+          response.getKAnonymityResult().getEquivalenceClassHistogramBuckets(0);
+
+      System.out.println("Bucket size range: ["
+          + results.getEquivalenceClassSizeLowerBound()
+          + ", "
+          + results.getEquivalenceClassSizeUpperBound()
+          + "]"
+      );
+
+      for (KAnonymityEquivalenceClass bucket : results.getBucketValuesList()) {
+        List<String> quasiIdValues = bucket.getQuasiIdsValuesList()
+            .stream()
+            .map(v -> v.toString())
+            .collect(Collectors.toList());
+
+        System.out.println("\tQuasi-ID values: " + String.join(", ", quasiIdValues));
+        System.out.println("\tClass size: " + bucket.getEquivalenceClassSize());
+      }
+    } catch (Exception e) {
+      System.out.println("Error in kAnonymityAnalysis: " + e.getMessage());
+    }
+    // [END dlp_k_anonymity]
+  }
+
+  private static void calculateLDiversity(
+      String projectId,
+      String datasetId,
+      String tableId,
+      String sensitiveAttribute,
+      List<String> quasiIds)
+      throws Exception {
+    // [START dlp_l_diversity]
+    /**
+     * Calculate l-diversity for an attribute relative to quasi-identifiers in a BigQuery table.
+     * @param projectId The project ID set on the BigQuery table reference.
+     * @param datasetId The BigQuery dataset to analyze.
+     * @param tableId The BigQuery table to analyze.
+     * @param sensitiveAttribute The name of the attribute to compare the quasi-ID against
+     * @param quasiIds A set of column names that form a composite key ('quasi-identifiers').
+     */
+
+    // instantiate a client
+    try (DlpServiceClient dlpServiceClient = DlpServiceClient.create()) {
+
+      // projectId = "my-project-id";
+      // datasetId = "my_dataset";
+      // tableId = "my_table";
+      // sensitiveAttribute = "name";
+      // quasiIds = Arrays.asList("age", "city");
+
+      FieldId sensitiveAttributeField =
+          FieldId.newBuilder()
+              .setColumnName(sensitiveAttribute)
+              .build();
+
+      List<FieldId> quasiIdFields =
+          quasiIds
+              .stream()
+              .map(columnName -> FieldId.newBuilder().setColumnName(columnName).build())
+              .collect(Collectors.toList());
+
+      LDiversityConfig ldiversityConfig =
+          LDiversityConfig.newBuilder()
+              .addAllQuasiIds(quasiIdFields)
+              .setSensitiveAttribute(sensitiveAttributeField)
+              .build();
+
+      BigQueryTable bigQueryTable =
+          BigQueryTable.newBuilder()
+              .setProjectId(projectId)
+              .setDatasetId(datasetId)
+              .setTableId(tableId)
+              .build();
+
+      PrivacyMetric privacyMetric =
+          PrivacyMetric.newBuilder()
+              .setLDiversityConfig(ldiversityConfig)
+              .build();
+
+      AnalyzeDataSourceRiskRequest request =
+          AnalyzeDataSourceRiskRequest.newBuilder()
+              .setPrivacyMetric(privacyMetric)
+              .setSourceTable(bigQueryTable)
+              .build();
+
+      // asynchronously submit a risk analysis operation
+      OperationFuture<RiskAnalysisOperationResult, RiskAnalysisOperationMetadata, Operation>
+          responseFuture = dlpServiceClient.analyzeDataSourceRiskAsync(request);
+
+      // block on response
+      // only the first histogram bucket (index 0) of the result is reported below
+      RiskAnalysisOperationResult response = responseFuture.get();
+      LDiversityHistogramBucket results =
+          response.getLDiversityResult().getSensitiveValueFrequencyHistogramBuckets(0);
+
+      for (LDiversityEquivalenceClass bucket : results.getBucketValuesList()) {
+        List<String> quasiIdValues = bucket.getQuasiIdsValuesList()
+            .stream()
+            .map(v -> v.toString())
+            .collect(Collectors.toList());
+
+        System.out.println("\tQuasi-ID values: " + String.join(", ", quasiIdValues));
+        System.out.println("\tClass size: " + bucket.getEquivalenceClassSize());
+
+        for (ValueFrequency valueFrequency : bucket.getTopSensitiveValuesList()) {
+          System.out.println("\t\tSensitive value "
+              + valueFrequency.getValue().toString()
+              + " occurs "
+              + valueFrequency.getCount()
+              + " time(s).");
+        }
+      }
+    } catch (Exception e) {
+      System.out.println("Error in lDiversityAnalysis: " + e.getMessage());
+    }
+    // [END dlp_l_diversity]
+  }
+
+
+ /**
+ * Command line application to perform risk analysis using the Data Loss Prevention API.
+ * Supported data format: BigQuery tables
+ */
+ public static void main(String[] args) throws Exception {
+
+ OptionGroup optionsGroup = new OptionGroup();
+ optionsGroup.setRequired(true);
+
+ Option numericalAnalysisOption = new Option("n", "numerical");
+ optionsGroup.addOption(numericalAnalysisOption);
+
+ Option categoricalAnalysisOption = new Option("c", "categorical");
+ optionsGroup.addOption(categoricalAnalysisOption);
+
+ Option kanonymityOption = new Option("k", "kAnonymity");
+ optionsGroup.addOption(kanonymityOption);
+
+ Option ldiversityOption = new Option("l", "lDiversity");
+ optionsGroup.addOption(ldiversityOption);
+
+ Options commandLineOptions = new Options();
+ commandLineOptions.addOptionGroup(optionsGroup);
+
+ Option datasetIdOption = Option.builder("datasetId").hasArg(true).required(false).build();
+ commandLineOptions.addOption(datasetIdOption);
+
+ Option tableIdOption = Option.builder("tableId").hasArg(true).required(false).build();
+ commandLineOptions.addOption(tableIdOption);
+
+ Option projectIdOption = Option.builder("projectId").hasArg(true).required(false).build();
+ commandLineOptions.addOption(projectIdOption);
+
+ Option columnNameOption =
+ Option.builder("columnName").hasArg(true).required(false).build();
+ commandLineOptions.addOption(columnNameOption);
+
+ Option sensitiveAttributeOption =
+ Option.builder("sensitiveAttribute").hasArg(true).required(false).build();
+ commandLineOptions.addOption(sensitiveAttributeOption);
+
+ Option quasiIdColumnNamesOption =
+ Option.builder("quasiIdColumnNames").hasArg(true).required(false).build();
+ commandLineOptions.addOption(quasiIdColumnNamesOption);
+
+ CommandLineParser parser = new DefaultParser();
+ HelpFormatter formatter = new HelpFormatter();
+ CommandLine cmd;
+
+ try {
+ cmd = parser.parse(commandLineOptions, args);
+ } catch (ParseException e) {
+ System.out.println(e.getMessage());
+ formatter.printHelp(RiskAnalysis.class.getName(), commandLineOptions);
+ System.exit(1);
+ return;
+ }
+
+ String datasetId = cmd.getOptionValue(datasetIdOption.getOpt());
+ String tableId = cmd.getOptionValue(tableIdOption.getOpt());
+ // use default project id when project id is not specified
+ String projectId =
+ cmd.getOptionValue(
+ projectIdOption.getOpt(), ServiceOptions.getDefaultProjectId());
+
+ if (cmd.hasOption("n")) {
+ // numerical stats analysis
+ String columnName = cmd.getOptionValue(columnNameOption.getOpt());
+ calculateNumericalStats(projectId, datasetId, tableId, columnName);
+ } else if (cmd.hasOption("c")) {
+ // categorical stats analysis
+ String columnName = cmd.getOptionValue(columnNameOption.getOpt());
+ calculateCategoricalStats(projectId, datasetId, tableId, columnName);
+ } else if (cmd.hasOption("k")) {
+ // k-anonymity analysis
+ List quasiIdColumnNames =
+ Arrays.asList(cmd.getOptionValues(quasiIdColumnNamesOption.getOpt()));
+ calculateKAnonymity(projectId, datasetId, tableId, quasiIdColumnNames);
+ } else if (cmd.hasOption("l")) {
+ // l-diversity analysis
+ String sensitiveAttribute = cmd.getOptionValue(sensitiveAttributeOption.getOpt());
+ List quasiIdColumnNames =
+ Arrays.asList(cmd.getOptionValues(quasiIdColumnNamesOption.getOpt()));
+ calculateLDiversity(projectId, datasetId, tableId, sensitiveAttribute, quasiIdColumnNames);
+ }
+ }
+}
diff --git a/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java b/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java
new file mode 100644
index 00000000000..0097cee5af1
--- /dev/null
+++ b/dlp/src/test/java/com/example/dlp/DeIdentificationIT.java
@@ -0,0 +1,84 @@
+/**
+ * Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.dlp;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
+import java.util.regex.Pattern;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Integration tests that run {@link DeIdentification#main} and inspect what it prints. */
+@RunWith(JUnit4.class)
+public class DeIdentificationIT {
+  private ByteArrayOutputStream bout;
+  private PrintStream originalOut; // real System.out, put back in tearDown()
+
+  // Wrapped local encryption key, taken from the DLP_DEID_WRAPPED_KEY environment variable
+  private String wrappedKey = System.getenv("DLP_DEID_WRAPPED_KEY");
+
+  // Name of the KMS key used to wrap the local encryption key (DLP_DEID_KEY_NAME)
+  private String keyName = System.getenv("DLP_DEID_KEY_NAME");
+
+  @Before
+  public void setUp() {
+    originalOut = System.out;
+    bout = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(bout)); // capture the sample's stdout for the assertions
+    // TODO(b/64541432) DLP currently doesn't support GOOGLE DEFAULT AUTH
+    assertNotNull(System.getenv("GOOGLE_APPLICATION_CREDENTIALS"));
+    assertNotNull(wrappedKey);
+    assertNotNull(keyName);
+  }
+
+  @Test
+  public void testDeidStringMasksCharacters() throws Exception {
+    String text = "\"My SSN is 372819127\"";
+    DeIdentification.main(new String[] {
+        "-m", text,
+        "-maskingCharacter", "x",
+        "-numberToMask", "5"
+    });
+    // JUnit's signature is assertEquals(expected, actual); the literal goes first.
+    assertEquals("My SSN is xxxxx9127\n", bout.toString());
+  }
+
+  @Test
+  public void testDeidStringPerformsFpe() throws Exception {
+    String text = "\"My SSN is 372819127\"";
+    DeIdentification.main(
+        new String[] {"-f", text, "-wrappedKey", wrappedKey, "-keyName", keyName});
+    String output = bout.toString();
+    assertFalse(output.contains(text));
+    // Only the shape of the transformed text is checked, not an exact value.
+    assertTrue(Pattern.compile("My SSN is \\w+").matcher(output).find());
+  }
+
+  @After
+  public void tearDown() {
+    System.setOut(originalOut);
+    bout.reset();
+  }
+}
diff --git a/dlp/src/test/java/com/example/dlp/InspectIT.java b/dlp/src/test/java/com/example/dlp/InspectIT.java
index 788236a72fb..618c96d0a5c 100644
--- a/dlp/src/test/java/com/example/dlp/InspectIT.java
+++ b/dlp/src/test/java/com/example/dlp/InspectIT.java
@@ -87,7 +87,7 @@ public void testGcsFileInspectionReturnsInfoTypes() throws Exception {
assertTrue(output.contains("EMAIL_ADDRESS"));
}
- // Requires a Datastore kind containing an entity
+ // Requires a Datastore kind containing an entity
// with phone number and email address properties.
@Test
public void testDatastoreInspectionReturnsInfoTypes() throws Exception {
@@ -97,6 +97,17 @@ public void testDatastoreInspectionReturnsInfoTypes() throws Exception {
assertTrue(output.contains("EMAIL_ADDRESS"));
}
+  // Runs Inspect.main with the -bq flag against dataset "integration_tests_dlp",
+  // table "harmful", and expects PHONE_NUMBER to appear in the captured output.
+  @Test
+  public void testBigqueryInspectionReturnsInfoTypes() throws Exception {
+    final String[] bigQueryArgs =
+        {"-bq", "-datasetId", "integration_tests_dlp", "-tableId", "harmful"};
+    Inspect.main(bigQueryArgs);
+    final String captured = bout.toString();
+    assertTrue(captured.contains("PHONE_NUMBER"));
+  }
+
@After
public void tearDown() {
System.setOut(null);
diff --git a/dlp/src/test/java/com/example/dlp/RiskAnalysisIT.java b/dlp/src/test/java/com/example/dlp/RiskAnalysisIT.java
new file mode 100644
index 00000000000..de5fa22c722
--- /dev/null
+++ b/dlp/src/test/java/com/example/dlp/RiskAnalysisIT.java
@@ -0,0 +1,108 @@
+/**
+ * Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.example.dlp;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
+import java.util.regex.Pattern;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Integration tests that run {@link RiskAnalysis#main} and check the text it prints. */
+@RunWith(JUnit4.class)
+public class RiskAnalysisIT {
+  private ByteArrayOutputStream bout;
+  private PrintStream originalOut; // real System.out, put back in tearDown()
+
+  @Before
+  public void setUp() {
+    originalOut = System.out;
+    bout = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(bout)); // capture the sample's stdout for the assertions
+    // TODO(b/64541432) DLP currently doesn't support GOOGLE DEFAULT AUTH
+    assertNotNull(System.getenv("GOOGLE_APPLICATION_CREDENTIALS"));
+  }
+
+  @Test
+  public void testNumericalStats() throws Exception {
+    RiskAnalysis.main(new String[] {
+        "-n",
+        "-datasetId", "integration_tests_dlp",
+        "-tableId", "harmful",
+        "-columnName", "Age"
+    });
+    String output = bout.toString();
+    assertTrue(Pattern.compile(
+        "Value at 0% quantile: integer_value: \\d{2}").matcher(output).find());
+    assertTrue(Pattern.compile(
+        "Value at \\d{2}% quantile: integer_value: \\d{2}").matcher(output).find());
+  }
+
+  @Test
+  public void testCategoricalStats() throws Exception {
+    RiskAnalysis.main(new String[] {
+        "-c",
+        "-datasetId", "integration_tests_dlp",
+        "-tableId", "harmful",
+        "-columnName", "Mystery"
+    });
+    String output = bout.toString();
+    assertTrue(Pattern.compile(
+        "Most common value occurs \\d time\\(s\\)").matcher(output).find());
+  }
+
+  @Test
+  public void testKAnonymity() throws Exception {
+    RiskAnalysis.main(new String[] {
+        "-k",
+        "-datasetId", "integration_tests_dlp",
+        "-tableId", "harmful",
+        "-quasiIdColumnNames", "Age", "Mystery"
+    });
+    String output = bout.toString();
+    assertTrue(Pattern.compile("Bucket size range: \\[\\d, \\d\\]").matcher(output).find());
+    assertTrue(output.contains("Quasi-ID values: integer_value: 19"));
+    assertTrue(output.contains("Class size: 1"));
+  }
+
+  @Test
+  public void testLDiversity() throws Exception {
+    RiskAnalysis.main(new String[] {
+        "-l",
+        "-datasetId", "integration_tests_dlp",
+        "-tableId", "harmful",
+        "-sensitiveAttribute", "Name",
+        "-quasiIdColumnNames", "Age", "Mystery"
+    });
+    String output = bout.toString();
+    assertTrue(output.contains("Quasi-ID values: integer_value: 19"));
+    assertTrue(output.contains("Class size: 1"));
+    assertTrue(output.contains("Sensitive value string_value: \"James\""));
+  }
+
+  @After
+  public void tearDown() {
+    System.setOut(originalOut);
+    bout.reset();
+  }
+}