#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys
import subprocess

from sparktestsupport import modules
from sparktestsupport.shellutils import run_cmd
from sparktestsupport.toposort import toposort_flatten


# -------------------------------------------------------------------------------------------------
# Functions for traversing module dependency graph
# -------------------------------------------------------------------------------------------------


def determine_modules_for_files(filenames):
    """
    Given a list of filenames, return the set of modules that contain those files.
    If a file is not associated with a more specific submodule, this method considers the
    file to belong to the 'root' module. Files under the `.github` directory are counted
    only in GitHub Actions builds; `appveyor.yml` is always ignored because it is dedicated
    to AppVeyor builds, and `README.md` files are always ignored as well.

    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
    ['pyspark-core', 'sql']
    >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
    ['root']
    >>> [x.name for x in determine_modules_for_files(["appveyor.yml", "sql/README.md"])]
    []
    """
    changed_modules = set()
    for filename in filenames:
        if filename in ("appveyor.yml",):
            continue
        if filename.endswith("README.md"):
            continue
        if ("GITHUB_ACTIONS" not in os.environ) and filename.startswith(".github"):
            continue
        matched_at_least_one_module = False
        for module in modules.all_modules:
            if module.contains_file(filename):
                changed_modules.add(module)
                matched_at_least_one_module = True
        if not matched_at_least_one_module:
            changed_modules.add(modules.root)
    return changed_modules
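

# A minimal usage sketch (illustrative only; the paths below are made-up inputs,
# not files this repository is known to contain):
#
#   changed = determine_modules_for_files(["python/pyspark/rdd.py", "core/pom.xml"])
#   print(sorted(m.name for m in changed))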


def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None):
    """
    Given a git commit and target ref, use the set of files changed in the diff to determine
    which modules' tests should be run.

    >>> [x.name for x in determine_modules_for_files(
    ...     identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))]
    ['graphx']
    >>> 'root' in [x.name for x in determine_modules_for_files(
    ...     identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))]
    True
    """
    if target_branch is None and target_ref is None:
        raise AttributeError("must specify either target_branch or target_ref")
    elif target_branch is not None and target_ref is not None:
        raise AttributeError("must specify either target_branch or target_ref, not both")
    if target_branch is not None:
        diff_target = target_branch
        run_cmd(["git", "fetch", "origin", target_branch + ":" + target_branch])
    else:
        diff_target = target_ref
    raw_output = subprocess.check_output(
        ["git", "diff", "--name-only", patch_sha, diff_target], universal_newlines=True
    )
    # Remove any empty strings
    return [f for f in raw_output.split("\n") if f]
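

# A minimal sketch of chaining the two helpers above (the ref names are
# placeholders). Note that passing target_branch triggers a `git fetch` of that
# branch, while target_ref diffs against an already-available ref:
#
#   changed_files = identify_changed_files_from_git_commits("HEAD", target_ref="master")
#   changed_modules = determine_modules_for_files(changed_files)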


def determine_modules_to_test(changed_modules, deduplicated=True):
    """
    Given a set of modules that have changed, compute the transitive closure of those modules'
    dependent modules in order to determine the set of modules that should be tested.

    Returns a topologically sorted list of modules (ties are broken by sorting on module names).
    If ``deduplicated`` is disabled, the modules are returned without taking deduplication by
    dependencies into account.

    >>> [x.name for x in determine_modules_to_test([modules.root])]
    ['root']
    >>> [x.name for x in determine_modules_to_test([modules.build])]
    ['root']
    >>> [x.name for x in determine_modules_to_test([modules.core])]
    ['root']
    >>> [x.name for x in determine_modules_to_test([modules.launcher])]
    ['root']
    >>> [x.name for x in determine_modules_to_test([modules.graphx])]
    ['graphx', 'examples']
    >>> sorted([x.name for x in determine_modules_to_test([modules.sql])])
    ... # doctest: +NORMALIZE_WHITESPACE
    ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
     'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
     'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql',
     'sql-kafka-0-10']
    >>> sorted([x.name for x in determine_modules_to_test(
    ...     [modules.sparkr, modules.sql], deduplicated=False)])
    ... # doctest: +NORMALIZE_WHITESPACE
    ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
     'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas',
     'pyspark-pandas-connect', 'pyspark-pandas-slow', 'pyspark-sql', 'repl', 'sparkr', 'sql',
     'sql-kafka-0-10']
    >>> sorted([x.name for x in determine_modules_to_test(
    ...     [modules.sql, modules.core], deduplicated=False)])
    ... # doctest: +NORMALIZE_WHITESPACE
    ['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests', 'examples', 'graphx',
     'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 'pyspark-connect',
     'pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-pandas', 'pyspark-pandas-connect',
     'pyspark-pandas-slow', 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root',
     'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10',
     'streaming-kinesis-asl']
    """
    modules_to_test = set()
    for module in changed_modules:
        modules_to_test = modules_to_test.union(
            determine_modules_to_test(module.dependent_modules, deduplicated)
        )
    modules_to_test = modules_to_test.union(set(changed_modules))
    if not deduplicated:
        return modules_to_test
    # If we need to run all of the tests, then we should short-circuit and return 'root'
    if modules.root in modules_to_test:
        return [modules.root]
    return toposort_flatten(
        {m: set(m.dependencies).intersection(modules_to_test) for m in modules_to_test}, sort=True
    )
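

# A minimal sketch of the selection flow end to end (hypothetical wiring; the
# project's own test runner composes these helpers in a similar way):
#
#   changed = determine_modules_for_files(changed_files)
#   to_test = determine_modules_to_test(changed)  # topologically sorted list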


def determine_tags_to_exclude(changed_modules):
    """
    Return the test tags of every module that is not in ``changed_modules``, so that the
    tests carrying those tags can be excluded from the run.
    """
    tags = []
    for m in modules.all_modules:
        if m not in changed_modules:
            tags += m.test_tags
    return tags
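

# A small usage sketch (illustrative; how the tags reach the build tool depends
# on the test runner):
#
#   excluded_tags = determine_tags_to_exclude(modules_to_test)
#   # If every module changed, nothing is excluded:
#   assert determine_tags_to_exclude(modules.all_modules) == []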


def _test():
    import doctest

    failure_count = doctest.testmod()[0]
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()