Skip to content

Commit fee6e59

Browse files
authored
feat(dlp): Sample for Inspect BigQuery for sensitive data with sampling (GoogleCloudPlatform#1835)
1 parent 24de8f1 commit fee6e59

File tree

3 files changed

+433
-146
lines changed

3 files changed

+433
-146
lines changed
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
<?php
2+
3+
/**
4+
* Copyright 2023 Google Inc.
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
/**
20+
* For instructions on how to run the samples:
21+
*
22+
* @see https://github.com/GoogleCloudPlatform/php-docs-samples/tree/main/dlp/README.md
23+
*/
24+
25+
namespace Google\Cloud\Samples\Dlp;
26+
27+
# [START dlp_inspect_bigquery_with_sampling]
28+
29+
use Google\Cloud\Dlp\V2\DlpServiceClient;
30+
use Google\Cloud\Dlp\V2\BigQueryOptions;
31+
use Google\Cloud\Dlp\V2\InfoType;
32+
use Google\Cloud\Dlp\V2\InspectConfig;
33+
use Google\Cloud\Dlp\V2\StorageConfig;
34+
use Google\Cloud\Dlp\V2\BigQueryTable;
35+
use Google\Cloud\Dlp\V2\DlpJob\JobState;
36+
use Google\Cloud\Dlp\V2\Action;
37+
use Google\Cloud\Dlp\V2\Action\PublishToPubSub;
38+
use Google\Cloud\Dlp\V2\BigQueryOptions\SampleMethod;
39+
use Google\Cloud\Dlp\V2\FieldId;
40+
use Google\Cloud\Dlp\V2\InspectJobConfig;
41+
use Google\Cloud\PubSub\PubSubClient;
42+
43+
/**
 * Inspect BigQuery for sensitive data with sampling.
 * The following examples demonstrate using the Cloud Data Loss Prevention
 * API to scan a 1000-row subset of a BigQuery table. The scan starts from
 * a random row.
 *
 * The function creates an inspect job, waits (for up to 10 minutes) for the
 * job's completion message on the given Pub/Sub subscription, and then prints
 * the job status together with the per-infoType finding counts.
 *
 * @param string $callingProjectId  The project ID to run the API call under.
 * @param string $topicId           The Pub/Sub topic ID to notify once the job is completed.
 * @param string $subscriptionId    The Pub/Sub subscription ID to use when listening for job.
 * @param string $projectId         The Google Cloud Project ID.
 * @param string $datasetId         The BigQuery Dataset ID.
 * @param string $tableId           The BigQuery Table ID to be inspected.
 */
function inspect_bigquery_with_sampling(
    string $callingProjectId,
    string $topicId,
    string $subscriptionId,
    string $projectId,
    string $datasetId,
    string $tableId
): void {
    // Instantiate a client.
    $dlp = new DlpServiceClient();
    $pubsub = new PubSubClient();
    $topic = $pubsub->topic($topicId);

    // Specify the BigQuery table to be inspected.
    $bigqueryTable = (new BigQueryTable())
        ->setProjectId($projectId)
        ->setDatasetId($datasetId)
        ->setTableId($tableId);

    // Limit the scan to 1000 rows, beginning at a random row. This sample
    // hard-codes a column called 'name' as the identifying field.
    $bigQueryOptions = (new BigQueryOptions())
        ->setTableReference($bigqueryTable)
        ->setRowsLimit(1000)
        ->setSampleMethod(SampleMethod::RANDOM_START)
        ->setIdentifyingFields([
            (new FieldId())
                ->setName('name')
        ]);

    $storageConfig = (new StorageConfig())
        ->setBigQueryOptions($bigQueryOptions);

    // Specify the type of info the inspection will look for.
    // See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
    $personNameInfoType = (new InfoType())
        ->setName('PERSON_NAME');
    $infoTypes = [$personNameInfoType];

    // Specify how the content should be inspected.
    $inspectConfig = (new InspectConfig())
        ->setInfoTypes($infoTypes)
        ->setIncludeQuote(true);

    // Specify the action that is triggered when the job completes.
    $pubSubAction = (new PublishToPubSub())
        ->setTopic($topic->name());

    $action = (new Action())
        ->setPubSub($pubSubAction);

    // Configure the long running job we want the service to perform.
    $inspectJob = (new InspectJobConfig())
        ->setInspectConfig($inspectConfig)
        ->setStorageConfig($storageConfig)
        ->setActions([$action]);

    // Listen for job notifications via an existing topic/subscription.
    $subscription = $topic->subscription($subscriptionId);

    // Submit request
    $parent = "projects/$callingProjectId/locations/global";
    $job = $dlp->createDlpJob($parent, [
        'inspectJob' => $inspectJob
    ]);

    // Poll Pub/Sub using exponential backoff until job finishes
    // Consider using an asynchronous execution model such as Cloud Functions
    $attempt = 1;
    $startTime = time();
    do {
        foreach ($subscription->pull() as $message) {
            if (
                isset($message->attributes()['DlpJobName']) &&
                $message->attributes()['DlpJobName'] === $job->getName()
            ) {
                $subscription->acknowledge($message);
                // Get the updated job. The notification can arrive while the
                // DLP API still reports RUNNING, so re-query until it does not,
                // pausing between calls rather than busy-looping on the API.
                $job = $dlp->getDlpJob($job->getName());
                while ($job->getState() === JobState::RUNNING) {
                    sleep(1);
                    $job = $dlp->getDlpJob($job->getName());
                }
                break 2; // break from parent do while
            }
        }
        printf('Waiting for job to complete' . PHP_EOL);
        // Exponential backoff with max delay of 60 seconds
        sleep(min(60, pow(2, ++$attempt)));
    } while (time() - $startTime < 600); // 10 minute timeout

    // Print finding counts
    printf('Job %s status: %s' . PHP_EOL, $job->getName(), JobState::name($job->getState()));
    switch ($job->getState()) {
        case JobState::DONE:
            $infoTypeStats = $job->getInspectDetails()->getResult()->getInfoTypeStats();
            if (count($infoTypeStats) === 0) {
                printf('No findings.' . PHP_EOL);
            } else {
                foreach ($infoTypeStats as $infoTypeStat) {
                    printf(
                        '  Found %s instance(s) of infoType %s' . PHP_EOL,
                        $infoTypeStat->getCount(),
                        $infoTypeStat->getInfoType()->getName()
                    );
                }
            }
            break;
        case JobState::FAILED:
            printf('Job %s had errors:' . PHP_EOL, $job->getName());
            $errors = $job->getErrors();
            foreach ($errors as $error) {
                var_dump($error->getDetails());
            }
            break;
        case JobState::PENDING:
            // Reached when the polling loop timed out before a completion
            // message arrived: $job is still the object returned at creation.
            printf('Job has not completed. Consider a longer timeout or an asynchronous execution model' . PHP_EOL);
            break;
        default:
            printf('Unexpected job state. Most likely, the job is either running or has not yet started.' . PHP_EOL);
    }
}
174+
# [END dlp_inspect_bigquery_with_sampling]
175+
176+
// The following 2 lines are only needed to run the samples
177+
require_once __DIR__ . '/../../testing/sample_helpers.php';
178+
\Google\Cloud\Samples\execute_sample(__FILE__, __NAMESPACE__, $argv);

dlp/test/dlpLongRunningTest.php

Lines changed: 61 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,47 @@ public static function tearDownAfterClass(): void
6363
self::$subscription->delete();
6464
}
6565

66+
/**
 * Writes a patched copy of a sample file into the system temp directory.
 *
 * The sample is read from ../src/<sampleName>.php relative to this file, the
 * given replacements are applied with strtr(), and every occurrence of the
 * sample name is replaced by "dlp_<sampleName>" (the temp file's base name).
 *
 * @param string $sampleName Base name of the sample file, without ".php".
 * @param array<string, string> $replacements Map of search string => replacement.
 *
 * @return string Path of the temporary file that was written.
 *
 * @throws \RuntimeException If the sample cannot be read or the temporary
 *                           file cannot be written.
 */
private function writeTempSample(string $sampleName, array $replacements): string
{
    $sampleFile = sprintf('%s/../src/%s.php', __DIR__, $sampleName);
    $tmpFileName = 'dlp_' . basename($sampleFile, '.php');
    $tmpFilePath = sys_get_temp_dir() . '/' . $tmpFileName . '.php';

    $fileContent = file_get_contents($sampleFile);
    if ($fileContent === false) {
        // Fail here with the path, instead of silently producing an empty
        // temp file that only shows up later as a missing-output assertion.
        throw new \RuntimeException(sprintf('Unable to read sample file "%s".', $sampleFile));
    }

    $replacements[$sampleName] = $tmpFileName;
    $fileContent = strtr($fileContent, $replacements);

    if (file_put_contents($tmpFilePath, $fileContent) === false) {
        throw new \RuntimeException(sprintf('Unable to write temporary sample file "%s".', $tmpFilePath));
    }

    return $tmpFilePath;
}
83+
84+
/**
 * Builds the canned DLP job responses shared by the mocked tests.
 *
 * 'createDlpJob' is a real DlpJob in the PENDING state; 'getDlpJob' is a
 * DlpJob prophecy in the DONE state whose inspect details report a single
 * PERSON_NAME infoType stat with a count of 5. Both carry the same job name.
 *
 * @return array Responses keyed by 'createDlpJob' and 'getDlpJob'.
 */
public function dlpJobResponse()
{
    $jobName = 'projects/' . self::$projectId . '/dlpJobs/i-3208317104051988812';

    // Build the mocked result graph from the leaf upwards:
    // InfoTypeStats -> Result -> InspectDataSourceDetails -> DlpJob.
    $personNameStats = $this->prophesize(InfoTypeStats::class);
    $personNameStats->getInfoType()->shouldBeCalled()->willReturn((new InfoType())->setName('PERSON_NAME'));
    $personNameStats->getCount()->shouldBeCalled()->willReturn(5);

    $inspectResult = $this->prophesize(Result::class);
    $inspectResult->getInfoTypeStats()->shouldBeCalled()->willReturn([$personNameStats->reveal()]);

    $details = $this->prophesize(InspectDataSourceDetails::class);
    $details->getResult()->shouldBeCalled()->willReturn($inspectResult->reveal());

    $finishedJob = $this->prophesize(DlpJob::class);
    $finishedJob->getName()->shouldBeCalled()->willReturn($jobName);
    $finishedJob->getState()->shouldBeCalled()->willReturn(JobState::DONE);
    $finishedJob->getInspectDetails()->shouldBeCalled()->willReturn($details->reveal());

    // The job as it looks immediately after creation.
    $pendingJob = new DlpJob();
    $pendingJob->setName($jobName);
    $pendingJob->setState(JobState::PENDING);

    return [
        'createDlpJob' => $pendingJob,
        'getDlpJob' => $finishedJob,
    ];
}
106+
66107
public function testInspectDatastore()
67108
{
68109
$kind = 'Person';
@@ -102,31 +143,14 @@ public function testInspectGCS()
102143
// Mock the necessary objects and methods
103144
$dlpServiceClientMock = $this->prophesize(DlpServiceClient::class);
104145

105-
$createDlpJobResponse = (new DlpJob())
106-
->setName('projects/' . self::$projectId . '/dlpJobs/job-name-123')
107-
->setState(JobState::PENDING);
108-
109-
$getDlpJobResponse = (new DlpJob())
110-
->setName('projects/' . self::$projectId . '/dlpJobs/job-name-123')
111-
->setState(JobState::DONE)
112-
->setInspectDetails((new InspectDataSourceDetails())
113-
->setResult((new Result())
114-
->setInfoTypeStats([
115-
(new InfoTypeStats())
116-
->setInfoType((new InfoType())->setName('PERSON_NAME'))
117-
->setCount(3),
118-
(new InfoTypeStats())
119-
->setInfoType((new InfoType())->setName('CREDIT_CARD_NUMBER'))
120-
->setCount(3)
121-
])));
122-
146+
$dlpJobResponse = $this->dlpJobResponse();
123147
$dlpServiceClientMock->createDlpJob(Argument::any(), Argument::any())
124148
->shouldBeCalled()
125-
->willReturn($createDlpJobResponse);
149+
->willReturn($dlpJobResponse['createDlpJob']);
126150

127151
$dlpServiceClientMock->getDlpJob(Argument::any())
128152
->shouldBeCalled()
129-
->willReturn($getDlpJobResponse);
153+
->willReturn($dlpJobResponse['getDlpJob']);
130154

131155
$pubSubClientMock = $this->prophesize(PubSubClient::class);
132156
$topicMock = $this->prophesize(Topic::class);
@@ -152,50 +176,42 @@ public function testInspectGCS()
152176

153177
$messageMock->attributes()
154178
->shouldBeCalledTimes(2)
155-
->willReturn(['DlpJobName' => 'projects/' . self::$projectId . '/dlpJobs/job-name-123']);
179+
->willReturn(['DlpJobName' => 'projects/' . self::$projectId . '/dlpJobs/i-3208317104051988812']);
156180

157181
$subscriptionMock->acknowledge(Argument::any())
158182
->shouldBeCalled()
159183
->willReturn($messageMock->reveal());
160184

161185
// Creating a temp file for testing.
162-
$sampleFile = __DIR__ . '/../src/inspect_gcs.php';
163-
$tmpFileName = basename($sampleFile, '.php') . '_temp';
164-
$tmpFilePath = __DIR__ . '/../src/' . $tmpFileName . '.php';
186+
$callFunction = sprintf(
187+
"dlp_inspect_gcs('%s','%s','%s','%s','%s');",
188+
self::$projectId,
189+
$topicId,
190+
$subscriptionId,
191+
$bucketName,
192+
$objectName,
193+
);
165194

166-
$fileContent = file_get_contents($sampleFile);
167-
$replacements = [
195+
$tmpFile = $this->writeTempSample('inspect_gcs', [
168196
'$dlp = new DlpServiceClient();' => 'global $dlp;',
169197
'$pubsub = new PubSubClient();' => 'global $pubsub;',
170-
'inspect_gcs' => $tmpFileName
171-
];
172-
$fileContent = strtr($fileContent, $replacements);
173-
$tmpFile = file_put_contents(
174-
$tmpFilePath,
175-
$fileContent
176-
);
198+
"require_once __DIR__ . '/../../testing/sample_helpers.php';" => '',
199+
'\Google\Cloud\Samples\execute_sample(__FILE__, __NAMESPACE__, $argv);' => $callFunction
200+
]);
177201
global $dlp;
178202
global $pubsub;
179203

180204
$dlp = $dlpServiceClientMock->reveal();
181205
$pubsub = $pubSubClientMock->reveal();
182206

183-
// Call the method under test
184-
$output = $this->runFunctionSnippet($tmpFileName, [
185-
self::$projectId,
186-
$topicId,
187-
$subscriptionId,
188-
$bucketName,
189-
$objectName,
190-
]);
191-
192-
// delete topic , subscription , and temp file
193-
unlink($tmpFilePath);
207+
// Invoke file and capture output
208+
ob_start();
209+
include $tmpFile;
210+
$output = ob_get_clean();
194211

195212
// Assert the expected behavior or outcome
196213
$this->assertStringContainsString('Job projects/' . self::$projectId . '/dlpJobs/', $output);
197214
$this->assertStringContainsString('infoType PERSON_NAME', $output);
198-
$this->assertStringContainsString('infoType CREDIT_CARD_NUMBER', $output);
199215
}
200216

201217
public function testNumericalStats()

0 commit comments

Comments
 (0)