Skip to content

Commit 6c5de4d

Browse files
[DLP] Implemented dlp_inspect_column_values_w_custom_hotwords and dlp_inspect_image_file (GoogleCloudPlatform#10278)
This PR includes two region tags:

[dlp_inspect_image_file](https://cloud.google.com/dlp/docs/samples/dlp-inspect-image-file)

[dlp_inspect_column_values_w_custom_hotwords](https://cloud.google.com/dlp/docs/creating-custom-infotypes-likelihood.md#match-column-values)

## Description

Fixes #<ISSUE-NUMBER>

Note: Before submitting a pull request, please open an issue for discussion if you are not associated with Google.

## Checklist

- [x] I have followed [Sample Guidelines from AUTHORING_GUIDE.MD](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md)
- [ ] README is updated to include [all relevant information](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#readme-file)
- [x] **Tests** pass: `nox -s py-3.9` (see [Test Environment Setup](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#test-environment-setup))
- [x] **Lint** pass: `nox -s lint` (see [Test Environment Setup](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#test-environment-setup))
- [ ] These samples need a new **API enabled** in testing projects to pass (let us know which ones)
- [ ] These samples need a new/updated **env vars** in testing projects set to pass (let us know which ones)
- [ ] This sample adds a new sample directory, and I updated the [CODEOWNERS file](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/.github/CODEOWNERS) with the codeowners for this sample
- [ ] This sample adds a new **Product API**, and I updated the [Blunderbuss issue/PR auto-assigner](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/.github/blunderbuss.yml) with the codeowners for this sample
- [x] Please **merge** this PR for me once it is approved
1 parent 1b5b52a commit 6c5de4d

File tree

2 files changed

+254
-0
lines changed

2 files changed

+254
-0
lines changed

dlp/snippets/inspect_content.py

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,103 @@ def inspect_table(
303303

304304
# [END dlp_inspect_table]
305305

306+
307+
# [START dlp_inspect_column_values_w_custom_hotwords]
308+
from typing import List # noqa: E402, I100
309+
310+
import google.cloud.dlp # noqa: F811, E402
311+
312+
313+
def inspect_column_values_w_custom_hotwords(
    project: str,
    table_header: List[str],
    table_rows: List[List[str]],
    info_types: List[str],
    custom_hotword: str,
) -> None:
    """Uses the Data Loss Prevention API to inspect table data using built-in
    infoType detectors, excluding columns that match a custom hot-word.

    Findings covered by the hot-word rule are assigned the fixed likelihood
    VERY_UNLIKELY. Because the request asks only for findings of likelihood
    POSSIBLE or higher, those findings are left out of the printed results.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        table_header: List of strings representing table field names.
        table_rows: List of rows representing table values.
        info_types: Names of the infoTypes to look for. The hot-word rule is
            applied to every one of them.
        custom_hotword: The regular expression used as the hot-word pattern
            for the likelihood adjustment.

    Returns:
        None. Each finding (or "No findings.") is printed to stdout.
    """

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    table = {
        "headers": [{"name": name} for name in table_header],
        "rows": [
            {"values": [{"string_value": cell} for cell in row]}
            for row in table_rows
        ],
    }

    # Construct the `item` for table to be inspected.
    item = {"table": table}

    # Convert the infoType names into the dictionaries expected by the API.
    # A separate name is used so the `info_types` parameter keeps its
    # declared `List[str]` type.
    info_type_dicts = [{"name": name} for name in info_types]

    # Construct a rule with the caller-provided hot-word that lowers the
    # likelihood of affected findings to VERY_UNLIKELY.
    hotword_rule = {
        "hotword_regex": {"pattern": custom_hotword},
        "likelihood_adjustment": {
            "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_UNLIKELY
        },
        # Per the DLP `Proximity` reference, a `window_before` of 1 is the
        # setting used on tabular data to adjust a whole column of findings.
        "proximity": {"window_before": 1},
    }

    rule_set = [
        {
            "info_types": info_type_dicts,
            "rules": [{"hotword_rule": hotword_rule}],
        }
    ]

    # Construct the configuration dictionary, which defines the entire inspect
    # content task.
    inspect_config = {
        "info_types": info_type_dicts,
        "rule_set": rule_set,
        "min_likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE,
        "include_quote": True,
    }

    # Convert the project id into a full resource id.
    parent = f"projects/{project}/locations/global"

    # Call the API.
    response = dlp.inspect_content(
        request={
            "parent": parent,
            "inspect_config": inspect_config,
            "item": item,
        }
    )

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        # The quote is printed only when the API returned a non-empty one.
        if finding.quote:
            print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
399+
400+
# [END dlp_inspect_column_values_w_custom_hotwords]
401+
402+
306403
# [START dlp_inspect_file]
307404
import mimetypes # noqa: I100, E402
308405
from typing import Optional # noqa: I100, E402
@@ -969,6 +1066,65 @@ def inspect_image_file_all_infotypes(
9691066
# [END dlp_inspect_image_all_infotypes]
9701067

9711068

1069+
# [START dlp_inspect_image_file]
1070+
import google.cloud.dlp # noqa: F811, E402, I100
1071+
1072+
1073+
def inspect_image_file(
    project: str,
    filename: str,
    include_quote: bool = True,
) -> None:
    """Uses the Data Loss Prevention API to analyze strings for
    protected data in image file.

    The image is inspected for a fixed set of infoTypes: PHONE_NUMBER,
    EMAIL_ADDRESS and CREDIT_CARD_NUMBER.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None. Each finding (or "No findings.") is printed to stdout.
    """
    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types as the list of dictionaries expected by the API.
    info_types = [
        {"name": name}
        for name in ("PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD_NUMBER")
    ]

    # Construct the configuration for the Inspect request.
    inspect_config = {
        "info_types": info_types,
        "include_quote": include_quote,
    }

    # Construct the byte_item, containing the image file's byte data.
    with open(filename, mode="rb") as f:
        byte_item = {"type_": "IMAGE", "data": f.read()}

    # Convert the project id into a full resource id.
    parent = f"projects/{project}/locations/global"

    # Call the API.
    response = dlp.inspect_content(
        request={
            "parent": parent,
            "inspect_config": inspect_config,
            "item": {"byte_item": byte_item},
        }
    )

    # Parse the response and process results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        # Quotes were only requested when `include_quote` is set, so skip the
        # line otherwise instead of printing an empty "Quote: ".
        if include_quote:
            print(f"Quote: {finding.quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")
1124+
1125+
# [END dlp_inspect_image_file]
1126+
1127+
9721128
# [START dlp_inspect_image_listed_infotypes]
9731129
import google.cloud.dlp # noqa: F811, E402
9741130

@@ -1539,6 +1695,42 @@ def inspect_data_to_hybrid_job_trigger(
15391695
default=True,
15401696
)
15411697

1698+
parser_table_hotword = subparsers.add_parser(
1699+
"table_w_custom_hotword",
1700+
help="Inspect a table and exclude column values when matched "
1701+
"with custom hot-word.",
1702+
)
1703+
parser_table_hotword.add_argument(
1704+
"--project",
1705+
help="The Google Cloud project id to use as a parent resource.",
1706+
default=default_project,
1707+
)
1708+
parser_table_hotword.add_argument(
1709+
"--table_header",
1710+
help="List of strings representing table field names."
1711+
"Example include '['Fake_Email_Address', 'Real_Email_Address]'. "
1712+
"The method can be used to exclude matches from entire column"
1713+
'"Fake_Email_Address".',
1714+
)
1715+
parser_table_hotword.add_argument(
1716+
"--table_rows",
1717+
help="List of rows representing table values."
1718+
'Example: '
1719+
1720+
1721+
)
1722+
parser_table_hotword.add_argument(
1723+
"--info_types",
1724+
action="append",
1725+
help="Strings representing info types to look for. A full list of "
1726+
"info categories and types is available from the API. Examples "
1727+
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
1728+
)
1729+
parser_table_hotword.add_argument(
1730+
"custom_hotword",
1731+
help="The custom regular expression used for likelihood boosting.",
1732+
)
1733+
15421734
parser_file = subparsers.add_parser("file", help="Inspect a local file.")
15431735
parser_file.add_argument("filename", help="The path to the file to inspect.")
15441736
parser_file.add_argument(
@@ -1904,6 +2096,22 @@ def inspect_data_to_hybrid_job_trigger(
19042096
default=True,
19052097
)
19062098

2099+
parser_image_default_infotypes = subparsers.add_parser(
2100+
"image_default_infotypes", help="Inspect a local file with default info types."
2101+
)
2102+
parser_image_default_infotypes.add_argument(
2103+
"--project",
2104+
help="The Google Cloud project id to use as a parent resource.",
2105+
default=default_project,
2106+
)
2107+
parser_image_default_infotypes.add_argument("filename", help="The path to the file to inspect.")
2108+
parser_image_default_infotypes.add_argument(
2109+
"--include_quote",
2110+
help="A Boolean for whether to display a quote of the detected"
2111+
"information in the results.",
2112+
default=True,
2113+
)
2114+
19072115
parser_image_infotypes = subparsers.add_parser(
19082116
"image_listed_infotypes", help="Inspect a local file with listed info types."
19092117
)
@@ -2039,6 +2247,14 @@ def inspect_data_to_hybrid_job_trigger(
20392247
max_findings=args.max_findings,
20402248
include_quote=args.include_quote,
20412249
)
2250+
elif args.content == "table_w_custom_hotword":
2251+
inspect_column_values_w_custom_hotwords(
2252+
args.project,
2253+
args.table_header,
2254+
args.table_rows,
2255+
args.info_types,
2256+
args.custom_hotword,
2257+
)
20422258
elif args.content == "file":
20432259
inspect_file(
20442260
args.project,
@@ -2124,6 +2340,12 @@ def inspect_data_to_hybrid_job_trigger(
21242340
args.filename,
21252341
include_quote=args.include_quote,
21262342
)
2343+
elif args.content == "image_default_infotypes":
2344+
inspect_image_file(
2345+
args.project,
2346+
args.filename,
2347+
include_quote=args.include_quote,
2348+
)
21272349
elif args.content == "image_listed_infotypes":
21282350
inspect_image_file_listed_infotypes(
21292351
args.project,

dlp/snippets/inspect_content_test.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,28 @@ def test_inspect_table(capsys: pytest.CaptureFixture) -> None:
226226
assert "Info type: EMAIL_ADDRESS" in out
227227

228228

229+
def test_inspect_column_values_w_custom_hotwords(
    capsys: pytest.CaptureFixture,
) -> None:
    """Checks that values under the hot-word column are left out of the output.

    The hot-word pattern is the exact header of the first column, so only the
    second column's values are expected among the printed findings.
    """
    table_data = {
        "header": ["Fake Social Security Number", "Real Social Security Number"],
        "rows": [
            ["111-11-1111", "222-22-2222"],
            ["987-23-1234", "333-33-3333"],
            ["678-12-0909", "444-44-4444"],
        ],
    }
    inspect_content.inspect_column_values_w_custom_hotwords(
        GCLOUD_PROJECT,
        table_data["header"],
        table_data["rows"],
        ["US_SOCIAL_SECURITY_NUMBER"],
        "Fake Social Security Number",
    )
    out, _ = capsys.readouterr()
    assert "Info type: US_SOCIAL_SECURITY_NUMBER" in out
    assert "222-22-2222" in out
    assert "111-11-1111" not in out
249+
250+
229251
def test_inspect_string_with_custom_info_types(capsys: pytest.CaptureFixture) -> None:
230252
test_string = "My name is Gary Smith and my email is [email protected]"
231253
dictionaries = ["Gary Smith"]
@@ -330,6 +352,16 @@ def test_inspect_image_file_all_infotypes(capsys: pytest.CaptureFixture) -> None
330352
assert "Info type: EMAIL_ADDRESS" in out
331353

332354

355+
def test_inspect_image_file_default_infotypes(capsys: pytest.CaptureFixture) -> None:
    """Checks that the sample reports phone and email findings for test.png."""
    image_path = os.path.join(RESOURCE_DIRECTORY, "test.png")
    inspect_content.inspect_image_file(GCLOUD_PROJECT, image_path)

    # Only stdout matters here; stderr is discarded.
    captured_stdout = capsys.readouterr()[0]
    assert "Info type: PHONE_NUMBER" in captured_stdout
    assert "Info type: EMAIL_ADDRESS" in captured_stdout
363+
364+
333365
def test_inspect_image_file_listed_infotypes(capsys: pytest.CaptureFixture) -> None:
334366
test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png")
335367

0 commit comments

Comments
 (0)