Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 127 additions & 120 deletions dlp/src/categorical_stats.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,9 @@
* @see https://github.com/GoogleCloudPlatform/php-docs-samples/tree/master/dlp/README.md
*/

// Include Google Cloud dependencies using Composer
require_once __DIR__ . '/../vendor/autoload.php';

// The script name plus seven positional arguments are required.
if (8 != count($argv)) {
    print("Usage: php categorical_stats.php CALLING_PROJECT DATA_PROJECT TOPIC SUBSCRIPTION DATASET TABLE COLUMN\n");
    // print always evaluates to 1, so this matches the original `return print(...)`.
    return 1;
}
// Unpack the positional arguments; index 0 (the script name) is discarded into $_.
[$_, $callingProjectId, $dataProjectId, $topicId, $subscriptionId, $datasetId, $tableId, $columnName] = $argv;
namespace Google\Cloud\Samples\Dlp;

# [START dlp_categorical_stats]
/**
* Computes risk metrics of a column of data in a Google BigQuery table.
*/
use Google\Cloud\Dlp\V2\DlpServiceClient;
use Google\Cloud\Dlp\V2\RiskAnalysisJobConfig;
use Google\Cloud\Dlp\V2\BigQueryTable;
Expand All @@ -45,118 +36,134 @@
use Google\Cloud\Dlp\V2\FieldId;
use Google\Cloud\PubSub\PubSubClient;

/** Uncomment and populate these variables in your code */
// $callingProjectId = 'The project ID to run the API call under';
// $dataProjectId = 'The project ID containing the target BigQuery table';
// $topicId = 'The name of the Pub/Sub topic to notify once the job completes';
// $subscriptionId = 'The name of the Pub/Sub subscription to use when listening for job';
// $datasetId = 'The ID of the dataset to inspect';
// $tableId = 'The ID of the table to inspect';
// $columnName = 'The name of the column to compute risk metrics for, e.g. "age"';

// Instantiate a client.
$dlp = new DlpServiceClient([
'projectId' => $callingProjectId,
]);
$pubsub = new PubSubClient([
'projectId' => $callingProjectId,
]);
$topic = $pubsub->topic($topicId);

// Construct risk analysis config
$columnField = (new FieldId())
->setName($columnName);

$statsConfig = (new CategoricalStatsConfig())
->setField($columnField);

$privacyMetric = (new PrivacyMetric())
->setCategoricalStatsConfig($statsConfig);

// Construct items to be analyzed
$bigqueryTable = (new BigQueryTable())
->setProjectId($dataProjectId)
->setDatasetId($datasetId)
->setTableId($tableId);

// Construct the action to run when job completes
$pubSubAction = (new PublishToPubSub())
->setTopic($topic->name());

$action = (new Action())
->setPubSub($pubSubAction);

// Construct risk analysis job config to run
$riskJob = (new RiskAnalysisJobConfig())
->setPrivacyMetric($privacyMetric)
->setSourceTable($bigqueryTable)
->setActions([$action]);

// Submit request
$parent = "projects/$callingProjectId/locations/global";
$job = $dlp->createDlpJob($parent, [
'riskJob' => $riskJob
]);

// Listen for job notifications via an existing topic/subscription.
$subscription = $topic->subscription($subscriptionId);

// Poll Pub/Sub using exponential backoff until job finishes
// Consider using an asynchronous execution model such as Cloud Functions
$attempt = 1;
$startTime = time();
do {
foreach ($subscription->pull() as $message) {
if (isset($message->attributes()['DlpJobName']) &&
$message->attributes()['DlpJobName'] === $job->getName()) {
$subscription->acknowledge($message);
// Get the updated job. Loop to avoid race condition with DLP API.
do {
$job = $dlp->getDlpJob($job->getName());
} while ($job->getState() == JobState::RUNNING);
break 2; // break from parent do while
}
}
printf('Waiting for job to complete' . PHP_EOL);
// Exponential backoff with max delay of 60 seconds
sleep(min(60, pow(2, ++$attempt)));
} while (time() - $startTime < 600); // 10 minute timeout

// Print finding counts
printf('Job %s status: %s' . PHP_EOL, $job->getName(), JobState::name($job->getState()));
switch ($job->getState()) {
case JobState::DONE:
$histBuckets = $job->getRiskDetails()->getCategoricalStatsResult()->getValueFrequencyHistogramBuckets();

foreach ($histBuckets as $bucketIndex => $histBucket) {
// Print bucket stats
printf('Bucket %s:' . PHP_EOL, $bucketIndex);
printf(' Most common value occurs %s time(s)' . PHP_EOL, $histBucket->getValueFrequencyUpperBound());
printf(' Least common value occurs %s time(s)' . PHP_EOL, $histBucket->getValueFrequencyLowerBound());
printf(' %s unique value(s) total.', $histBucket->getBucketSize());

// Print bucket values
foreach ($histBucket->getBucketValues() as $percent => $quantile) {
printf(
' Value %s occurs %s time(s).' . PHP_EOL,
$quantile->getValue()->serializeToJsonString(),
$quantile->getCount()
);
/**
 * Computes risk metrics of a column of data in a Google BigQuery table.
 *
 * Submits a categorical-stats risk analysis job, waits (for up to 10 minutes)
 * for the job's completion notification on an existing Pub/Sub subscription,
 * then prints the value-frequency histogram on success or the job errors on
 * failure.
 *
 * @param string $callingProjectId  The project ID to run the API call under
 * @param string $dataProjectId     The project ID containing the target BigQuery table
 * @param string $topicId           The name of the Pub/Sub topic to notify once the job completes
 * @param string $subscriptionId    The name of the Pub/Sub subscription to use when listening for job
 * @param string $datasetId         The ID of the dataset to inspect
 * @param string $tableId           The ID of the table to inspect
 * @param string $columnName        The name of the column to compute risk metrics for, e.g. "age"
 */
function categorical_stats(
    string $callingProjectId,
    string $dataProjectId,
    string $topicId,
    string $subscriptionId,
    string $datasetId,
    string $tableId,
    string $columnName
): void {
    // Instantiate the clients.
    $dlp = new DlpServiceClient([
        'projectId' => $callingProjectId,
    ]);
    $pubsub = new PubSubClient([
        'projectId' => $callingProjectId,
    ]);
    $topic = $pubsub->topic($topicId);

    // Construct risk analysis config
    $columnField = (new FieldId())
        ->setName($columnName);

    $statsConfig = (new CategoricalStatsConfig())
        ->setField($columnField);

    $privacyMetric = (new PrivacyMetric())
        ->setCategoricalStatsConfig($statsConfig);

    // Construct items to be analyzed
    $bigqueryTable = (new BigQueryTable())
        ->setProjectId($dataProjectId)
        ->setDatasetId($datasetId)
        ->setTableId($tableId);

    // Construct the action to run when job completes
    $pubSubAction = (new PublishToPubSub())
        ->setTopic($topic->name());

    $action = (new Action())
        ->setPubSub($pubSubAction);

    // Construct risk analysis job config to run
    $riskJob = (new RiskAnalysisJobConfig())
        ->setPrivacyMetric($privacyMetric)
        ->setSourceTable($bigqueryTable)
        ->setActions([$action]);

    // Submit request
    $parent = "projects/$callingProjectId/locations/global";
    $job = $dlp->createDlpJob($parent, [
        'riskJob' => $riskJob
    ]);

    // Listen for job notifications via an existing topic/subscription.
    $subscription = $topic->subscription($subscriptionId);

    // Poll Pub/Sub using exponential backoff until job finishes
    // Consider using an asynchronous execution model such as Cloud Functions
    $attempt = 1;
    $startTime = time();
    do {
        foreach ($subscription->pull() as $message) {
            if (isset($message->attributes()['DlpJobName']) &&
                $message->attributes()['DlpJobName'] === $job->getName()) {
                $subscription->acknowledge($message);
                // Get the updated job. Loop to avoid race condition with DLP API.
                // NOTE(review): this inner loop re-fetches without any delay or
                // timeout while the job still reports RUNNING.
                do {
                    $job = $dlp->getDlpJob($job->getName());
                } while ($job->getState() === JobState::RUNNING);
                break 2; // break from parent do while
            }
        }
        printf('Waiting for job to complete' . PHP_EOL);
        // Exponential backoff with max delay of 60 seconds
        sleep(min(60, pow(2, ++$attempt)));
    } while (time() - $startTime < 600); // 10 minute timeout

    // Print finding counts
    printf('Job %s status: %s' . PHP_EOL, $job->getName(), JobState::name($job->getState()));
    switch ($job->getState()) {
        case JobState::DONE:
            $histBuckets = $job->getRiskDetails()->getCategoricalStatsResult()->getValueFrequencyHistogramBuckets();

            foreach ($histBuckets as $bucketIndex => $histBucket) {
                // Print bucket stats
                printf('Bucket %s:' . PHP_EOL, $bucketIndex);
                printf(' Most common value occurs %s time(s)' . PHP_EOL, $histBucket->getValueFrequencyUpperBound());
                printf(' Least common value occurs %s time(s)' . PHP_EOL, $histBucket->getValueFrequencyLowerBound());
                // Terminate the line so the per-value lines below start on their own row.
                printf(' %s unique value(s) total.' . PHP_EOL, $histBucket->getBucketSize());

                // Print bucket values
                foreach ($histBucket->getBucketValues() as $bucketValue) {
                    printf(
                        ' Value %s occurs %s time(s).' . PHP_EOL,
                        $bucketValue->getValue()->serializeToJsonString(),
                        $bucketValue->getCount()
                    );
                }
            }
            break;
        case JobState::FAILED:
            $errors = $job->getErrors();
            printf('Job %s had errors:' . PHP_EOL, $job->getName());
            foreach ($errors as $error) {
                var_dump($error->getDetails());
            }
            break;
        case JobState::PENDING:
            printf('Job has not completed. Consider a longer timeout or an asynchronous execution model' . PHP_EOL);
            break;
        default:
            printf('Unexpected job state.' . PHP_EOL);
    }
}
# [END dlp_categorical_stats]

// The following 2 lines are only needed to run the samples.
// They load the shared sample helper file and pass this file's path, its
// namespace, and the command-line arguments to execute_sample().
require_once __DIR__ . '/../../testing/sample_helpers.php';
\Google\Cloud\Samples\execute_sample(__FILE__, __NAMESPACE__, $argv);
110 changes: 56 additions & 54 deletions dlp/src/create_inspect_template.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,75 +22,77 @@
* @see https://github.com/GoogleCloudPlatform/php-docs-samples/tree/master/dlp/README.md
*/

// Include Google Cloud dependencies using Composer
require_once __DIR__ . '/../vendor/autoload.php';

// Two required arguments plus up to three optional ones (script name included in the count).
if (count($argv) < 3 || count($argv) > 6) {
    return print("Usage: php create_inspect_template.php CALLING_PROJECT TEMPLATE [DISPLAY_NAME] [DESCRIPTION] [MAX_FINDINGS]\n");
}
// Destructure only the required arguments: reading indexes 3 and 4 here would
// raise undefined-offset diagnostics whenever the optional arguments are omitted.
list($_, $callingProjectId, $templateId) = $argv;
// Optional arguments fall back to their defaults when absent.
$displayName = isset($argv[3]) ? $argv[3] : '';
$description = isset($argv[4]) ? $argv[4] : '';
$maxFindings = isset($argv[5]) ? (int) $argv[5] : 0;
namespace Google\Cloud\Samples\Dlp;

// [START dlp_create_inspect_template]
/**
* Create a new DLP inspection configuration template.
*/
use Google\Cloud\Dlp\V2\DlpServiceClient;
use Google\Cloud\Dlp\V2\InfoType;
use Google\Cloud\Dlp\V2\InspectConfig;
use Google\Cloud\Dlp\V2\InspectTemplate;
use Google\Cloud\Dlp\V2\Likelihood;
use Google\Cloud\Dlp\V2\InspectConfig\FindingLimits;

/** Uncomment and populate these variables in your code */
// $callingProjectId = 'The project ID to run the API call under';
// $templateId = 'The name of the template to be created';
// $displayName = ''; // (Optional) The human-readable name to give the template
// $description = ''; // (Optional) A description for the template to be created
// $maxFindings = 0; // (Optional) The maximum number of findings to report per request (0 = server maximum)

// Instantiate a client.
$dlp = new DlpServiceClient();
/**
 * Create a new DLP inspection configuration template.
 *
 * The template matches the PERSON_NAME and PHONE_NUMBER infoTypes, includes
 * the matching quote in findings, and is created under the "global" location
 * of the calling project. The created template's name is printed on success.
 *
 * @param string $callingProjectId  The project ID to run the API call under
 * @param string $templateId        The name of the template to be created
 * @param string $displayName       (Optional) The human-readable name to give the template
 * @param string $description       (Optional) A description for the template to be created
 * @param int    $maxFindings       (Optional) The maximum number of findings to report per request (0 = server maximum)
 */
function create_inspect_template(
    string $callingProjectId,
    string $templateId,
    string $displayName = '',
    string $description = '',
    int $maxFindings = 0
): void {
    // Instantiate a client.
    $dlp = new DlpServiceClient();

    // ----- Construct inspection config -----
    // The infoTypes of information to match
    $personNameInfoType = (new InfoType())
        ->setName('PERSON_NAME');
    $phoneNumberInfoType = (new InfoType())
        ->setName('PHONE_NUMBER');
    $infoTypes = [$personNameInfoType, $phoneNumberInfoType];

    // Whether to include the matching string in the response
    $includeQuote = true;

    // The minimum likelihood required before returning a match
    $minLikelihood = Likelihood::LIKELIHOOD_UNSPECIFIED;

    // Specify finding limits
    $limits = (new FindingLimits())
        ->setMaxFindingsPerRequest($maxFindings);

    // Create the configuration object
    $inspectConfig = (new InspectConfig())
        ->setMinLikelihood($minLikelihood)
        ->setLimits($limits)
        ->setInfoTypes($infoTypes)
        ->setIncludeQuote($includeQuote);

    // Construct inspection template
    $inspectTemplate = (new InspectTemplate())
        ->setInspectConfig($inspectConfig)
        ->setDisplayName($displayName)
        ->setDescription($description);

    // Run request (issued exactly once; the template ID is supplied as an option)
    $parent = "projects/$callingProjectId/locations/global";
    $template = $dlp->createInspectTemplate($parent, $inspectTemplate, [
        'templateId' => $templateId
    ]);

    // Print results
    printf('Successfully created template %s' . PHP_EOL, $template->getName());
}
// [END dlp_create_inspect_template]

// The following 2 lines are only needed to run the samples.
// They load the shared sample helper file and pass this file's path, its
// namespace, and the command-line arguments to execute_sample().
require_once __DIR__ . '/../../testing/sample_helpers.php';
\Google\Cloud\Samples\execute_sample(__FILE__, __NAMESPACE__, $argv);
Loading