|
13 | 13 | from transformers import AutoTokenizer |
14 | 14 |
|
15 | 15 | from commit0.harness.constants import SPLIT |
| 16 | +from commit0.harness.get_pytest_ids import main as get_tests |
16 | 17 | from commit0.harness.utils import clone_repo |
17 | 18 | from commit0.cli import write_commit0_config_file |
18 | 19 |
|
@@ -164,7 +165,7 @@ def get_blank_repo_metrics( |
164 | 165 |
|
165 | 166 |
|
166 | 167 | leaderboard_header = """\n\n## Leaderboard ({split}) |
167 | | -| Name | Repos Resolved (/{num_repos}) | Total Tests Passed (/{total_num_tests}) | Test Duration (s) | Date | Analysis | Github | |
| 168 | +| Name | Repos Resolved (/{num_repos}) | Tests Passed (Total: {total_num_tests}) | Test Duration (s) | Date | Analysis | Github | |
168 | 169 | |------|:-------------------------:|:--------------------:|:--------------------:|:----------:|----|----| """ |
169 | 170 |
|
170 | 171 | submission_table_header = """# Submission Name: **{display_name}** (split: {split}) |
@@ -203,10 +204,12 @@ def render_mds(overwrite_previous, subfolder="docs"): |
203 | 204 | if org_name in {"blank", "repos", "submission_repos"}: |
204 | 205 | continue |
205 | 206 | for branch_path in glob.glob(os.path.join(org_path, "*.json")): |
206 | | - cum_tests_passed = 0 |
| 207 | + evaluate_numbers = [] |
| 208 | + lite_evaluate_numbers = [] |
| 209 | + # cum_tests_passed = 0 |
207 | 210 | repos_resolved = 0 |
208 | 211 | total_duration = 0.0 |
209 | | - lite_cum_tests_passed = 0 |
| 212 | + # lite_cum_tests_passed = 0 |
210 | 213 | lite_repos_resolved = 0 |
211 | 214 | lite_total_duration = 0.0 |
212 | 215 | branch_metrics = json.load(open(branch_path)) |
@@ -299,11 +302,14 @@ def render_mds(overwrite_previous, subfolder="docs"): |
299 | 302 | f"### {shortened_testname}\n\n<details><summary> <pre>{shortened_testname}" |
300 | 303 | f"</pre></summary><pre>\n{failure['failure_string']}\n</pre>\n</details>\n" |
301 | 304 | ) |
302 | | - cum_tests_passed += pytest_info["summary"]["passed"] |
| 305 | + # cum_tests_passed += pytest_info["summary"]["passed"] |
| 306 | + num_tests = len(get_tests(repo_name, verbose=0)) |
| 307 | + evaluate_numbers.append(pytest_info["summary"]["passed"] / num_tests) |
303 | 308 | total_duration += pytest_info["duration"] |
304 | 309 | repos_resolved += int(resolved) |
305 | | - if split == "all": |
306 | | - lite_cum_tests_passed += pytest_info["summary"]["passed"] |
| 310 | + if split == "all" and repo_name in SPLIT['lite']: |
| 311 | + lite_evaluate_numbers.append(pytest_info["summary"]["passed"] / num_tests) |
| 312 | + # lite_cum_tests_passed += pytest_info["summary"]["passed"] |
307 | 313 | lite_total_duration += pytest_info["duration"] |
308 | 314 | lite_repos_resolved += int(resolved) |
309 | 315 |
|
@@ -331,20 +337,22 @@ def render_mds(overwrite_previous, subfolder="docs"): |
331 | 337 | wf.write(back_button + "\n" + submission_page) |
332 | 338 | analysis_link = f"[Analysis](/{f'analysis_{org_name}_{branch_name}'})" |
333 | 339 | github_link = f"[Github]({project_page_link})" |
334 | | - leaderboard[split].append((cum_tests_passed, |
| 340 | + avg_pass_rate = sum(evaluate_numbers) / len(evaluate_numbers) |
| 341 | + leaderboard[split].append((avg_pass_rate * 100, |
335 | 342 | f"\n|{display_name}|" |
336 | 343 | f"{repos_resolved}|" |
337 | | - f"{cum_tests_passed}|" |
| 344 | + f"{avg_pass_rate*100:.2f}%|" |
338 | 345 | f"{total_duration:.2f}|" |
339 | 346 | f"{submission_date}|" |
340 | 347 | f"{analysis_link}|" |
341 | 348 | f"{github_link}|" |
342 | 349 | )) |
343 | 350 | if ((split == "all") and ("Reference (Gold)" not in display_name)): |
344 | | - leaderboard["lite"].append((lite_cum_tests_passed, |
| 351 | + avg_lite_pass_rate = sum(lite_evaluate_numbers) / len(lite_evaluate_numbers) |
| 352 | + leaderboard["lite"].append((avg_lite_pass_rate * 100, |
345 | 353 | f"\n|{display_name} (subset of `all`)|" |
346 | 354 | f"{lite_repos_resolved}|" |
347 | | - f"{lite_cum_tests_passed}|" |
| 355 | + f"{avg_lite_pass_rate*100:.2f}%|" |
348 | 356 | f"{lite_total_duration:.2f}|" |
349 | 357 | f"{submission_date}|" |
350 | 358 | f"{analysis_link}|" |
|
0 commit comments