Skip to content

Commit ec929af

Browse files
authored
feat(dlp): DLP de-identify cloud storage (GoogleCloudPlatform#1856)
1 parent 12c4dfc commit ec929af

File tree

2 files changed

+268
-0
lines changed

2 files changed

+268
-0
lines changed
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
<?php
2+
/**
3+
* Copyright 2023 Google Inc.
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
/**
19+
* For instructions on how to run the samples:
20+
*
21+
* @see https://github.com/GoogleCloudPlatform/php-docs-samples/tree/main/dlp/README.md
22+
*/
23+
24+
namespace Google\Cloud\Samples\Dlp;
25+
26+
# [START dlp_deidentify_cloud_storage]
27+
use Google\Cloud\Dlp\V2\CloudStorageOptions;
28+
use Google\Cloud\Dlp\V2\CloudStorageOptions\FileSet;
29+
use Google\Cloud\Dlp\V2\DlpServiceClient;
30+
use Google\Cloud\Dlp\V2\InfoType;
31+
use Google\Cloud\Dlp\V2\InspectConfig;
32+
use Google\Cloud\Dlp\V2\StorageConfig;
33+
use Google\Cloud\Dlp\V2\Action;
34+
use Google\Cloud\Dlp\V2\Action\Deidentify;
35+
use Google\Cloud\Dlp\V2\BigQueryTable;
36+
use Google\Cloud\Dlp\V2\FileType;
37+
use Google\Cloud\Dlp\V2\InspectJobConfig;
38+
use Google\Cloud\Dlp\V2\TransformationConfig;
39+
use Google\Cloud\Dlp\V2\TransformationDetailsStorageConfig;
40+
use Google\Cloud\Dlp\V2\Client\BaseClient\DlpServiceBaseClient;
41+
use Google\Cloud\Dlp\V2\DlpJob\JobState;
42+
43+
/**
 * De-identify sensitive data stored in Cloud Storage using the API.
 *
 * Creates an inspection job that has a de-identification action, polls the job
 * (up to 10 times, 30 seconds apart) until it is DONE or FAILED, then prints
 * the per-infoType finding counts or the job errors.
 *
 * Each template argument may be either a full resource name
 * (`projects/projectName/(locations/locationId)/deidentifyTemplates/templateName`)
 * or a bare template ID. A bare ID is expanded against $callingProjectId.
 *
 * @param string $callingProjectId                  The project ID to run the API call under.
 * @param string $inputgcsPath                      The Cloud Storage directory that you want to de-identify.
 * @param string $outgcsPath                        The Cloud Storage directory where you want to store the
 *                                                  de-identified files.
 * @param string $deidentifyTemplateName            The default de-identify template (used for unstructured and
 *                                                  structured files), as a full resource name or a template ID.
 * @param string $structuredDeidentifyTemplateName  The de-identify template for structured files, as a full
 *                                                  resource name or a template ID.
 * @param string $imageRedactTemplateName           The image redaction template for images, as a full resource
 *                                                  name or a template ID.
 * @param string $datasetId                         The ID of the BigQuery dataset where you want to store
 *                                                  the transformation details.
 * @param string $tableId                           The ID of the BigQuery table where you want to store the
 *                                                  transformation details.
 */
function deidentify_cloud_storage(
    // TODO(developer): Replace sample parameters before running the code.
    string $callingProjectId,
    string $inputgcsPath = 'gs://YOUR_GOOGLE_STORAGE_BUCKET',
    string $outgcsPath = 'gs://YOUR_GOOGLE_STORAGE_BUCKET',
    string $deidentifyTemplateName = 'YOUR_DEIDENTIFY_TEMPLATE_NAME',
    string $structuredDeidentifyTemplateName = 'YOUR_STRUCTURED_DEIDENTIFY_TEMPLATE_NAME',
    string $imageRedactTemplateName = 'YOUR_IMAGE_REDACT_DEIDENTIFY_TEMPLATE_NAME',
    string $datasetId = 'YOUR_DATASET_ID',
    string $tableId = 'YOUR_TABLE_ID'
): void {
    // Instantiate a client.
    $dlp = new DlpServiceClient();

    $parent = "projects/$callingProjectId/locations/global";

    // Specify the Cloud Storage path to be de-identified.
    $cloudStorageOptions = (new CloudStorageOptions())
        ->setFileSet((new FileSet())
            ->setUrl($inputgcsPath));
    $storageConfig = (new StorageConfig())
        ->setCloudStorageOptions($cloudStorageOptions);

    // Specify the type of info the inspection will look for.
    $inspectConfig = (new InspectConfig())
        ->setInfoTypes([
            (new InfoType())->setName('PERSON_NAME'),
            (new InfoType())->setName('EMAIL_ADDRESS')
        ]);

    // Specify the BigQuery table to store the transformation details.
    $transformationDetailsStorageConfig = (new TransformationDetailsStorageConfig())
        ->setTable((new BigQueryTable())
            ->setProjectId($callingProjectId)
            ->setDatasetId($datasetId)
            ->setTableId($tableId));

    // A value containing '/' is already a full resource name and is passed
    // through untouched; wrapping it again would yield a malformed, doubled
    // path. A bare template ID is expanded under the calling project.
    $resolveTemplateName = static function (string $template) use ($callingProjectId): string {
        if (strpos($template, '/') !== false) {
            return $template;
        }
        return DlpServiceBaseClient::projectDeidentifyTemplateName($callingProjectId, $template);
    };

    // Specify the de-identify templates used for the transformation.
    $transformationConfig = (new TransformationConfig())
        ->setDeidentifyTemplate($resolveTemplateName($deidentifyTemplateName))
        ->setStructuredDeidentifyTemplate($resolveTemplateName($structuredDeidentifyTemplateName))
        ->setImageRedactTemplate($resolveTemplateName($imageRedactTemplateName));

    $deidentify = (new Deidentify())
        ->setCloudStorageOutput($outgcsPath)
        ->setTransformationConfig($transformationConfig)
        ->setTransformationDetailsStorageConfig($transformationDetailsStorageConfig)
        ->setFileTypesToTransform([FileType::TEXT_FILE, FileType::IMAGE, FileType::CSV]);

    $action = (new Action())
        ->setDeidentify($deidentify);

    // Configure the inspection job we want the service to perform.
    $inspectJobConfig = (new InspectJobConfig())
        ->setInspectConfig($inspectConfig)
        ->setStorageConfig($storageConfig)
        ->setActions([$action]);

    // Send the job creation request and process the response.
    $job = $dlp->createDlpJob($parent, [
        'inspectJob' => $inspectJobConfig
    ]);

    // Poll until the job finishes or the attempts run out. A FAILED job will
    // never become DONE, so stop on it too instead of waiting out the budget.
    $finishedStates = [JobState::DONE, JobState::FAILED];
    $numOfAttempts = 10;
    do {
        printf('Waiting for job to complete' . PHP_EOL);
        sleep(30);
        $job = $dlp->getDlpJob($job->getName());
        if (in_array($job->getState(), $finishedStates, true)) {
            break;
        }
        $numOfAttempts--;
    } while ($numOfAttempts > 0);

    // Print finding counts.
    printf('Job %s status: %s' . PHP_EOL, $job->getName(), JobState::name($job->getState()));
    switch ($job->getState()) {
        case JobState::DONE:
            $infoTypeStats = $job->getInspectDetails()->getResult()->getInfoTypeStats();
            if (count($infoTypeStats) === 0) {
                printf('No findings.' . PHP_EOL);
            } else {
                foreach ($infoTypeStats as $infoTypeStat) {
                    printf(
                        '  Found %s instance(s) of infoType %s' . PHP_EOL,
                        $infoTypeStat->getCount(),
                        $infoTypeStat->getInfoType()->getName()
                    );
                }
            }
            break;
        case JobState::FAILED:
            printf('Job %s had errors:' . PHP_EOL, $job->getName());
            $errors = $job->getErrors();
            foreach ($errors as $error) {
                var_dump($error->getDetails());
            }
            break;
        case JobState::PENDING:
            printf('Job has not completed. Consider a longer timeout or an asynchronous execution model' . PHP_EOL);
            break;
        default:
            // Terminate the line like every other message printed by this sample.
            printf('Unexpected job state. Most likely, the job is either running or has not yet started.' . PHP_EOL);
    }
}
178+
# [END dlp_deidentify_cloud_storage]
179+
// The following 2 lines are only needed to run the samples: the first loads
// the shared sample helpers, the second passes this file, its namespace and
// the command-line arguments to execute_sample() (presumably it dispatches to
// the sample function defined above; confirm in sample_helpers.php).
180+
require_once __DIR__ . '/../../testing/sample_helpers.php';
181+
\Google\Cloud\Samples\execute_sample(__FILE__, __NAMESPACE__, $argv);

dlp/test/dlpTest.php

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,18 @@
1818

1919
namespace Google\Cloud\Samples\Dlp;
2020

21+
use Google\Cloud\Dlp\V2\DlpJob;
22+
use Google\Cloud\Dlp\V2\DlpJob\JobState;
2123
use Google\Cloud\TestUtils\TestTrait;
2224
use PHPUnit\Framework\TestCase;
25+
use Prophecy\Argument;
26+
use Prophecy\PhpUnit\ProphecyTrait;
2327
use PHPUnitRetry\RetryTrait;
28+
use Google\Cloud\Dlp\V2\DlpServiceClient;
29+
use Google\Cloud\Dlp\V2\InfoType;
30+
use Google\Cloud\Dlp\V2\InfoTypeStats;
31+
use Google\Cloud\Dlp\V2\InspectDataSourceDetails;
32+
use Google\Cloud\Dlp\V2\InspectDataSourceDetails\Result;
2433

2534
/**
2635
* Unit Tests for dlp commands.
@@ -29,6 +38,7 @@ class dlpTest extends TestCase
2938
{
3039
use TestTrait;
3140
use RetryTrait;
41+
use ProphecyTrait;
3242

3343
public function testInspectImageFile()
3444
{
@@ -995,4 +1005,81 @@ public function testDeidentifyTableWithMultipleCryptoHash()
9951005
$this->assertStringContainsString('abbyabernathy1', $csvLines_ouput[2]);
9961006
unlink($outputCsvFile);
9971007
}
1008+
1009+
/**
 * Runs the deidentify_cloud_storage sample against a mocked DlpServiceClient.
 *
 * The sample source is copied to a temporary file in which the client
 * construction is replaced by `global $dlp;`, so the sample picks up the
 * Prophecy mock installed below instead of calling the real service.
 */
public function testDeidentifyCloudStorage()
{
    $bucketName = $this->requireEnv('GOOGLE_STORAGE_BUCKET');
    $inputgcsPath = 'gs://' . $bucketName;
    $outgcsPath = 'gs://' . $bucketName;
    $deidentifyTemplateName = $this->requireEnv('DLP_DEIDENTIFY_TEMPLATE');
    $structuredDeidentifyTemplateName = $this->requireEnv('DLP_STRUCTURED_DEIDENTIFY_TEMPLATE');
    $imageRedactTemplateName = $this->requireEnv('DLP_IMAGE_REDACT_DEIDENTIFY_TEMPLATE');
    $datasetId = $this->requireEnv('DLP_DATASET_ID');
    $tableId = $this->requireEnv('DLP_TABLE_ID');

    $dlpServiceClientMock = $this->prophesize(DlpServiceClient::class);

    // Response returned when the sample creates the job.
    $createDlpJobResponse = (new DlpJob())
        ->setName('projects/' . self::$projectId . '/dlpJobs/1234')
        ->setState(JobState::PENDING);

    // Response returned when the sample polls the job: finished, with findings.
    $getDlpJobResponse = (new DlpJob())
        ->setName('projects/' . self::$projectId . '/dlpJobs/1234')
        ->setState(JobState::DONE)
        ->setInspectDetails((new InspectDataSourceDetails())
            ->setResult((new Result())
                ->setInfoTypeStats([
                    (new InfoTypeStats())
                        ->setInfoType((new InfoType())->setName('PERSON_NAME'))
                        ->setCount(6),
                    (new InfoTypeStats())
                        ->setInfoType((new InfoType())->setName('EMAIL_ADDRESS'))
                        ->setCount(9)
                ])));

    $dlpServiceClientMock->createDlpJob(Argument::any(), Argument::any())
        ->shouldBeCalled()
        ->willReturn($createDlpJobResponse);

    $dlpServiceClientMock->getDlpJob(Argument::any())
        ->shouldBeCalled()
        ->willReturn($getDlpJobResponse);

    // Creating a temp file for testing.
    $sampleFile = __DIR__ . '/../src/deidentify_cloud_storage.php';
    $tmpFileName = basename($sampleFile, '.php') . '_temp';
    $tmpFilePath = __DIR__ . '/../src/' . $tmpFileName . '.php';

    $fileContent = file_get_contents($sampleFile);
    $this->assertNotFalse($fileContent, 'Unable to read sample file ' . $sampleFile);

    $replacements = [
        '$dlp = new DlpServiceClient();' => 'global $dlp;',
        'deidentify_cloud_storage' => $tmpFileName
    ];
    $fileContent = strtr($fileContent, $replacements);
    $this->assertNotFalse(
        file_put_contents($tmpFilePath, $fileContent),
        'Unable to write temporary sample file ' . $tmpFilePath
    );

    // The rewritten sample reads the client from this global.
    global $dlp;
    $dlp = $dlpServiceClientMock->reveal();

    try {
        $output = $this->runFunctionSnippet($tmpFileName, [
            self::$projectId,
            $inputgcsPath,
            $outgcsPath,
            $deidentifyTemplateName,
            $structuredDeidentifyTemplateName,
            $imageRedactTemplateName,
            $datasetId,
            $tableId
        ]);
    } finally {
        // Delete the temp file even when the sample throws, so a failing run
        // does not leave a generated PHP file behind in src/.
        unlink($tmpFilePath);
    }

    $this->assertStringContainsString('projects/' . self::$projectId . '/dlpJobs', $output);
    $this->assertStringContainsString('infoType PERSON_NAME', $output);
    $this->assertStringContainsString('infoType EMAIL_ADDRESS', $output);
}
9981085
}

0 commit comments

Comments
 (0)