From 64f64f63a609fe5404192b95098e29316ce8f51f Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 13:57:43 +0100 Subject: [PATCH 01/53] def 425 --- sdgym/_run_benchmark.py | 17 +++++++++++++++++ sdgym/benchmark.py | 2 +- tasks.py | 5 +++++ 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 sdgym/_run_benchmark.py diff --git a/sdgym/_run_benchmark.py b/sdgym/_run_benchmark.py new file mode 100644 index 00000000..c9a05371 --- /dev/null +++ b/sdgym/_run_benchmark.py @@ -0,0 +1,17 @@ +import os +from sdgym.benchmark import benchmark_single_table_aws, SDV_SINGLE_TABLE_SYNTHESIZERS +aws_key = os.getenv('AWS_ACCESS_KEY_ID') +aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') +OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425' +synthesizer = SDV_SINGLE_TABLE_SYNTHESIZERS +datasets = ['expedia_hotel_logs', 'child'] + + +if __name__ == '__main__': + for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: + benchmark_single_table_aws( + output_destination=OUTPUT_DESTINATION_AWS, + aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, + synthesizers=[synthesizer], + sdv_datasets=datasets, + ) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 6eca2c91..d33129bb 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1287,7 +1287,7 @@ def _get_user_data_script(access_key, secret_key, region_name, script_content): echo "======== Install Dependencies in venv ============" pip install --upgrade pip - pip install "sdgym[all]" + pip install "sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@issue-425-workflow-sdgym#egg=sdgym" pip install s3fs echo "======== Write Script ===========" diff --git a/tasks.py b/tasks.py index 76eb01a0..64e5c84b 100644 --- a/tasks.py +++ b/tasks.py @@ -202,3 +202,8 @@ def rmdir(c, path): shutil.rmtree(path, onerror=remove_readonly) except PermissionError: pass + +@task +def sdgym_benchmark(c): + """Run the SDGym benchmark.""" + c.run('python sdgym/_run_benchmark.py') \ No newline at end of file From db0ee3a4ef2679a3a3cc97ee73e10caad5e9e093 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 15:21:09 +0100 Subject: [PATCH 02/53] define run_benchmark.ym file --- .github/workflows/run_benchmark.yml | 31 +++++++++++++++++++++++++++++ pyproject.toml | 1 + sdgym/_run_benchmark.py | 2 +- 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/run_benchmark.yml diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml new file mode 100644 index 00000000..03028e8f --- /dev/null +++ b/.github/workflows/run_benchmark.yml @@ -0,0 +1,31 @@ +name: Run SDGym Benchmark + +on: + workflow_dispatch: + schedule: + - cron: '0 5 1 * *' + +jobs: + sdgym-benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up latest Python + uses: actions/setup-python@v5 + with: + python-version-file: 'pyproject.toml' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install invoke + python -m pip install -e .[dev] + + - name: SDGym Benchmark + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} + + run: invoke sdgym-benchmark diff --git a/pyproject.toml b/pyproject.toml index 0553c69f..7946e0ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -196,6 +196,7 @@ exclude = [ ".ipynb_checkpoints", "tasks.py", "static_code_analysis.txt", + "*.ipynb" ] 
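(A note on the two patches above: the run_benchmark.yml workflow triggers on workflow_dispatch for manual runs and on cron '0 5 1 * *', i.e. 05:00 UTC on the first day of each month. PATCH 01's user-data change likewise points the launched EC2 instance at pip install of SDGym from the issue-425-workflow-sdgym branch, so benchmark instances exercise the in-development code rather than the released PyPI package.)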
[tool.ruff.lint] diff --git a/sdgym/_run_benchmark.py b/sdgym/_run_benchmark.py index c9a05371..4f1be9b2 100644 --- a/sdgym/_run_benchmark.py +++ b/sdgym/_run_benchmark.py @@ -8,7 +8,7 @@ if __name__ == '__main__': - for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: + for synthesizer in ['GaussianCopulaSynthesizer']: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, From 6d63bc6ffdaf180c0f31a5d7522e931612456db6 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 16:02:57 +0100 Subject: [PATCH 03/53] def 3 --- sdgym/_run_benchmark.py | 12 +++-- sdgym/_upload_benchmark_results.py | 51 +++++++++++++++++++ sdgym/benchmark.py | 2 + .../sdgym_result_explorer/result_explorer.py | 4 ++ sdgym/sdgym_result_explorer/result_handler.py | 13 +++++ 5 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 sdgym/_upload_benchmark_results.py diff --git a/sdgym/_run_benchmark.py b/sdgym/_run_benchmark.py index 4f1be9b2..f98e9b2f 100644 --- a/sdgym/_run_benchmark.py +++ b/sdgym/_run_benchmark.py @@ -1,14 +1,16 @@ import os -from sdgym.benchmark import benchmark_single_table_aws, SDV_SINGLE_TABLE_SYNTHESIZERS + +from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS, benchmark_single_table_aws + aws_key = os.getenv('AWS_ACCESS_KEY_ID') aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') -OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425' +OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' synthesizer = SDV_SINGLE_TABLE_SYNTHESIZERS -datasets = ['expedia_hotel_logs', 'child'] - +datasets = ['expedia_hotel_logs', 'fake_companies'] + if __name__ == '__main__': - for synthesizer in ['GaussianCopulaSynthesizer']: + for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, diff --git a/sdgym/_upload_benchmark_results.py b/sdgym/_upload_benchmark_results.py new file mode 100644 index 00000000..fc968327 --- /dev/null +++ b/sdgym/_upload_benchmark_results.py @@ -0,0 +1,51 @@ +import argparse +import os +import sys +from datetime import datetime + +from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--date', type=str, help='Benchmark date (YYYY-MM-DD)') + return parser.parse_args() + + +def get_run_name(date_str): + try: + date = datetime.strptime(date_str, '%Y-%m-%d') + except ValueError: + raise ValueError(f"Invalid date format: {date_str}. Expected YYYY-MM-DD.") + return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' + + +def main(): + args = parse_args() + + if args.date: + date_str = args.date + else: + date_str = datetime.utcnow().replace(day=1).strftime('%Y-%m-%d') + + run_name = get_run_name(date_str) + print(f"Checking benchmark results for run: {run_name}") + + aws_key = os.getenv('AWS_ACCESS_KEY_ID') + aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') + bucket_path = 's3://sdgym-benchmark/Debug/Issue_425' + result_explorer = SDGymResultsExplorer( + bucket_path, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret + ) + + if not result_explorer.all_runs_complete(run_name): + print(f"Run {run_name} is not complete yet. Exiting.") + sys.exit(0) + + print(f"Run {run_name} is complete! 
Proceeding with summarization...") + # Call summarization/upload here + result_explorer.summarize_and_publish(run_name) + + +if __name__ == '__main__': + main() diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index d33129bb..a2056026 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1218,8 +1218,10 @@ def _store_job_args_in_s3(output_destination, job_args_list, s3_client): bucket_name = parsed_url.netloc path = parsed_url.path.lstrip('/') if parsed_url.path else '' filename = os.path.basename(job_args_list[0][-1]['run_id']) + print(filename) run_id = os.path.splitext(filename)[0] job_args_key = f'job_args_list_{run_id}.pkl' + print(f'Storing job args in S3: {bucket_name}/{path}{job_args_key}') job_args_key = f'{path}{job_args_key}' if path else job_args_key serialized_data = pickle.dumps(job_args_list) diff --git a/sdgym/sdgym_result_explorer/result_explorer.py b/sdgym/sdgym_result_explorer/result_explorer.py index eb04b576..068d588e 100644 --- a/sdgym/sdgym_result_explorer/result_explorer.py +++ b/sdgym/sdgym_result_explorer/result_explorer.py @@ -95,3 +95,7 @@ def summarize(self, folder_name): - A DataFrame with the results of the benchmark for the specified folder. """ return self._handler.summarize(folder_name) + + def all_runs_complete(self, folder_name): + """Check if all runs in the specified folder are complete.""" + return self._handler.all_runs_complete(folder_name) diff --git a/sdgym/sdgym_result_explorer/result_handler.py b/sdgym/sdgym_result_explorer/result_handler.py index 3de27197..52808319 100644 --- a/sdgym/sdgym_result_explorer/result_handler.py +++ b/sdgym/sdgym_result_explorer/result_handler.py @@ -155,6 +155,19 @@ def summarize(self, folder_name): return summarized_table, folder_to_results[folder_name] + def all_runs_complete(self, folder_name): + """Check if all runs in the specified folder are complete.""" + yaml_files = self._get_results_files(folder_name, prefix=RUN_ID_PREFIX, suffix='.yaml') + if not yaml_files: + return False + + for yaml_file in yaml_files: + run_id_info = self._load_yaml_file(folder_name, yaml_file) + if run_id_info.get('completed_date') is None: + return False + + return True + class LocalResultsHandler(ResultsHandler): """Results handler for local filesystem.""" From 92a4f1b29293e6dd7a243458436d7d301c48798c Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 17:32:39 +0100 Subject: [PATCH 04/53] define upload workflow --- .../workflows/upload_benchmark_results.yml | 34 +++++++++++++++++++ sdgym/_run_benchmark.py/__init__.py | 5 +++ .../run_benchmark.py} | 6 ++-- .../upload_benchmark_results.py} | 30 ++++++++++------ sdgym/benchmark.py | 7 ++-- sdgym/result_writer.py | 4 +-- 6 files changed, 70 insertions(+), 16 deletions(-) create mode 100644 .github/workflows/upload_benchmark_results.yml create mode 100644 sdgym/_run_benchmark.py/__init__.py rename sdgym/{_run_benchmark.py => _run_benchmark.py/run_benchmark.py} (75%) rename sdgym/{_upload_benchmark_results.py => _run_benchmark.py/upload_benchmark_results.py} (50%) diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml new file mode 100644 index 00000000..bed671ea --- /dev/null +++ b/.github/workflows/upload_benchmark_results.yml @@ -0,0 +1,34 @@ +name: Upload SDGym Benchmark results + +on: + workflow_dispatch: + schedule: + - cron: '0 6 * * *' + +jobs: + sdgym-benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up latest Python + uses: 
actions/setup-python@v5 + with: + python-version-file: 'pyproject.toml' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install invoke + python -m pip install -e .[dev] + + - name: SDGym Benchmark + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} + + run: | + BENCHMARK_DATE=$(date -u "+%Y-%m-01") # First of current month, in UTC + echo "Benchmark date: $BENCHMARK_DATE" + invoke upload_benchmark_results --date "$BENCHMARK_DATE" \ No newline at end of file diff --git a/sdgym/_run_benchmark.py/__init__.py b/sdgym/_run_benchmark.py/__init__.py new file mode 100644 index 00000000..d916e31b --- /dev/null +++ b/sdgym/_run_benchmark.py/__init__.py @@ -0,0 +1,5 @@ +"""Folder for the SDGym benchmark module.""" + +OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' +UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' +RESULTS_UPLOADED = False diff --git a/sdgym/_run_benchmark.py b/sdgym/_run_benchmark.py/run_benchmark.py similarity index 75% rename from sdgym/_run_benchmark.py rename to sdgym/_run_benchmark.py/run_benchmark.py index f98e9b2f..6ad8aba0 100644 --- a/sdgym/_run_benchmark.py +++ b/sdgym/_run_benchmark.py/run_benchmark.py @@ -1,19 +1,21 @@ import os from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS, benchmark_single_table_aws +from sdgym.run_benchmark import OUTPUT_DESTINATION_AWS, RESULT_UPLOADED aws_key = os.getenv('AWS_ACCESS_KEY_ID') aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') -OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' synthesizer = SDV_SINGLE_TABLE_SYNTHESIZERS datasets = ['expedia_hotel_logs', 'fake_companies'] if __name__ == '__main__': + RESULT_UPLOADED = False for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, - aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, + aws_access_key_id=aws_key, + aws_secret_access_key=aws_secret, synthesizers=[synthesizer], sdv_datasets=datasets, ) diff --git a/sdgym/_upload_benchmark_results.py b/sdgym/_run_benchmark.py/upload_benchmark_results.py similarity index 50% rename from sdgym/_upload_benchmark_results.py rename to sdgym/_run_benchmark.py/upload_benchmark_results.py index fc968327..bd27f2ae 100644 --- a/sdgym/_upload_benchmark_results.py +++ b/sdgym/_run_benchmark.py/upload_benchmark_results.py @@ -3,6 +3,10 @@ import sys from datetime import datetime +import boto3 + +from sdgym.result_writer import S3ResultsWriter +from sdgym.run_benchmark import OUTPUT_DESTINATION_AWS, RESULT_UPLOADED from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer @@ -16,36 +20,42 @@ def get_run_name(date_str): try: date = datetime.strptime(date_str, '%Y-%m-%d') except ValueError: - raise ValueError(f"Invalid date format: {date_str}. Expected YYYY-MM-DD.") + raise ValueError(f'Invalid date format: {date_str}. Expected YYYY-MM-DD.') return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' def main(): - args = parse_args() + if RESULT_UPLOADED: + print('Benchmark results have already been uploaded. 
Exiting.') + sys.exit(0) + args = parse_args() if args.date: date_str = args.date else: date_str = datetime.utcnow().replace(day=1).strftime('%Y-%m-%d') run_name = get_run_name(date_str) - print(f"Checking benchmark results for run: {run_name}") + print(f'Checking benchmark results for run: {run_name}') # noqa: T201 aws_key = os.getenv('AWS_ACCESS_KEY_ID') aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') - bucket_path = 's3://sdgym-benchmark/Debug/Issue_425' + bucket_path = OUTPUT_DESTINATION_AWS + summary_filepath = f'{bucket_path}/{run_name}_summary.csv' result_explorer = SDGymResultsExplorer( bucket_path, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret ) - + summary = result_explorer. + s3_client = boto3.client('s3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret) + result_writer = S3ResultsWriter(s3_client) if not result_explorer.all_runs_complete(run_name): - print(f"Run {run_name} is not complete yet. Exiting.") + print(f'Run {run_name} is not complete yet. Exiting.') # noqa: T201 sys.exit(0) - print(f"Run {run_name} is complete! Proceeding with summarization...") - # Call summarization/upload here - result_explorer.summarize_and_publish(run_name) - + print(f'Run {run_name} is complete! Proceeding with summarization...') # noqa: T201 + summary, _ = result_explorer.summarize(run_name) + result_writer.write_dataframe(summary, f'{bucket_path}/{run_name}_summary.csv', index=True) + RESULT_UPLOADED = True if __name__ == '__main__': main() diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index a2056026..885d3ff6 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -168,6 +168,11 @@ def _setup_output_destination_aws(output_destination, synthesizers, datasets, s3 'run_id': f's3://{bucket_name}/{top_folder}/run_{today}_{increment}.yaml', } + s3_client.put_object( + Bucket=bucket_name, + Key=f'{top_folder}/run_{today}_{increment}.yaml', + Body='completed_date: null\n'.encode('utf-8'), + ) return paths @@ -1218,10 +1223,8 @@ def _store_job_args_in_s3(output_destination, job_args_list, s3_client): bucket_name = parsed_url.netloc path = parsed_url.path.lstrip('/') if parsed_url.path else '' filename = os.path.basename(job_args_list[0][-1]['run_id']) - print(filename) run_id = os.path.splitext(filename)[0] job_args_key = f'job_args_list_{run_id}.pkl' - print(f'Storing job args in S3: {bucket_name}/{path}{job_args_key}') job_args_key = f'{path}{job_args_key}' if path else job_args_key serialized_data = pickle.dumps(job_args_list) diff --git a/sdgym/result_writer.py b/sdgym/result_writer.py index 33a280fb..067ee93c 100644 --- a/sdgym/result_writer.py +++ b/sdgym/result_writer.py @@ -68,7 +68,7 @@ class S3ResultsWriter(ResultsWriter): def __init__(self, s3_client): self.s3_client = s3_client - def write_dataframe(self, data, file_path, append=False): + def write_dataframe(self, data, file_path, append=False, index=False): """Write a DataFrame to S3 as a CSV file.""" bucket, key = parse_s3_path(file_path) if append: @@ -81,7 +81,7 @@ def write_dataframe(self, data, file_path, append=False): except Exception: pass # If the file does not exist, we will create it - csv_buffer = data.to_csv(index=False).encode() + csv_buffer = data.to_csv(index=index).encode() self.s3_client.put_object(Body=csv_buffer, Bucket=bucket, Key=key) def write_pickle(self, obj, file_path): From f6a3d1c6a2955cdef9f20625b5894f8d10b49339 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 17:39:24 +0100 Subject: [PATCH 05/53] restructure files --- 
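(Two notes on the restructure that follows. PATCH 04 had created the scripts inside a directory literally named sdgym/_run_benchmark.py/ — a folder whose name contains a dot cannot be imported as a package, and the scripts imported a non-existent sdgym.run_benchmark module — so this patch renames it to sdgym/_run_benchmark. Separately, the module-level RESULTS_UPLOADED flag only remembers state for the lifetime of one process, so it cannot prevent a re-upload across separate scheduled workflow runs; PATCH 10 below drops it in favor of a marker object stored in S3 next to the results. Condensed from the implementation PATCH 10 introduces, the marker check amounts to:

import boto3
from botocore.exceptions import ClientError

def upload_already_done(s3_client, bucket, prefix, run_name):
    """Return True if the upload-complete marker for this run already exists in S3."""
    try:
        s3_client.head_object(Bucket=bucket, Key=f'{prefix}{run_name}/upload_complete.marker')
        return True
    except ClientError as error:
        # head_object raises ClientError with error code '404' when the key is absent
        if error.response['Error']['Code'] == '404':
            return False
        raise  # surface anything other than "not found" (permissions, throttling, ...)

Because the marker lives in the bucket rather than in memory, a rerun of the upload job can detect a finished upload and exit early.)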
.github/workflows/upload_benchmark_results.yml | 11 ++++++++++- .../{_run_benchmark.py => _run_benchmark}/__init__.py | 0 .../run_benchmark.py | 4 ++-- .../upload_benchmark_results.py | 4 ++-- tasks.py | 8 +++++++- 5 files changed, 21 insertions(+), 6 deletions(-) rename sdgym/{_run_benchmark.py => _run_benchmark}/__init__.py (100%) rename sdgym/{_run_benchmark.py => _run_benchmark}/run_benchmark.py (86%) rename sdgym/{_run_benchmark.py => _run_benchmark}/upload_benchmark_results.py (95%) diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index bed671ea..d6039f13 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -2,6 +2,10 @@ name: Upload SDGym Benchmark results on: workflow_dispatch: + inputs: + date: + description: 'Benchmark date (YYYY-MM-DD), defaults to the first of the current month' + required: false schedule: - cron: '0 6 * * *' @@ -29,6 +33,11 @@ jobs: AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} run: | - BENCHMARK_DATE=$(date -u "+%Y-%m-01") # First of current month, in UTC + if [ -z "${{ github.event.inputs.date }}" ]; then + BENCHMARK_DATE=$(date -u "+%Y-%m-01") + else + BENCHMARK_DATE="${{ github.event.inputs.date }}" + fi + echo "Benchmark date: $BENCHMARK_DATE" invoke upload_benchmark_results --date "$BENCHMARK_DATE" \ No newline at end of file diff --git a/sdgym/_run_benchmark.py/__init__.py b/sdgym/_run_benchmark/__init__.py similarity index 100% rename from sdgym/_run_benchmark.py/__init__.py rename to sdgym/_run_benchmark/__init__.py diff --git a/sdgym/_run_benchmark.py/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py similarity index 86% rename from sdgym/_run_benchmark.py/run_benchmark.py rename to sdgym/_run_benchmark/run_benchmark.py index 6ad8aba0..f248c2da 100644 --- a/sdgym/_run_benchmark.py/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -1,7 +1,7 @@ import os from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS, benchmark_single_table_aws -from sdgym.run_benchmark import OUTPUT_DESTINATION_AWS, RESULT_UPLOADED +from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, RESULTS_UPLOADED aws_key = os.getenv('AWS_ACCESS_KEY_ID') aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') @@ -10,7 +10,7 @@ if __name__ == '__main__': - RESULT_UPLOADED = False + RESULTS_UPLOADED = False for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, diff --git a/sdgym/_run_benchmark.py/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py similarity index 95% rename from sdgym/_run_benchmark.py/upload_benchmark_results.py rename to sdgym/_run_benchmark/upload_benchmark_results.py index bd27f2ae..58584395 100644 --- a/sdgym/_run_benchmark.py/upload_benchmark_results.py +++ b/sdgym/_run_benchmark/upload_benchmark_results.py @@ -6,7 +6,7 @@ import boto3 from sdgym.result_writer import S3ResultsWriter -from sdgym.run_benchmark import OUTPUT_DESTINATION_AWS, RESULT_UPLOADED +from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, RESULTS_UPLOADED from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer @@ -25,7 +25,7 @@ def get_run_name(date_str): def main(): - if RESULT_UPLOADED: + if RESULTS_UPLOADED: print('Benchmark results have already been uploaded. 
Exiting.') sys.exit(0) diff --git a/tasks.py b/tasks.py index 64e5c84b..fb296a87 100644 --- a/tasks.py +++ b/tasks.py @@ -206,4 +206,10 @@ def rmdir(c, path): @task def sdgym_benchmark(c): """Run the SDGym benchmark.""" - c.run('python sdgym/_run_benchmark.py') \ No newline at end of file + c.run('python sdgym/_run_benchmark/run_benchmark.py') + +@task(help={"date": "Benchmark date in YYYY-MM-DD format (default: today with day=01)"}) +def upload_benchmark_results(c, date=None): + """Upload the benchmark results to S3.""" + date_arg = f"--date {date}" if date else "" + c.run(f'python sdgym/_run_benchmark/upload_benchmark_results.py {date_arg}') \ No newline at end of file From 266cbf23b95122dfed0ded4928cc1ac01030e873 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 17:44:16 +0100 Subject: [PATCH 06/53] trigger on pushes --- .github/workflows/run_benchmark.yml | 3 +++ .github/workflows/upload_benchmark_results.yml | 3 +++ sdgym/_run_benchmark/__init__.py | 4 ++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index 03028e8f..40e967e7 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -1,6 +1,9 @@ name: Run SDGym Benchmark on: + push: + branches: + - issue-425-workflow-sdgym workflow_dispatch: schedule: - cron: '0 5 1 * *' diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index d6039f13..6c2bcdd0 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -1,6 +1,9 @@ name: Upload SDGym Benchmark results on: + push: + branches: + - issue-425-workflow-sdgym workflow_dispatch: inputs: date: diff --git a/sdgym/_run_benchmark/__init__.py b/sdgym/_run_benchmark/__init__.py index d916e31b..82d695ae 100644 --- a/sdgym/_run_benchmark/__init__.py +++ b/sdgym/_run_benchmark/__init__.py @@ -1,5 +1,5 @@ """Folder for the SDGym benchmark module.""" -OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' -UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' +OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425_2/' +UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425_2/' RESULTS_UPLOADED = False From 234957ffe99257cfd6e52d8aa0238fdaa6bf624f Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 17:58:54 +0100 Subject: [PATCH 07/53] fix upload benchmark workflow --- .github/workflows/upload_benchmark_results.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index 6c2bcdd0..7229876d 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -43,4 +43,4 @@ jobs: fi echo "Benchmark date: $BENCHMARK_DATE" - invoke upload_benchmark_results --date "$BENCHMARK_DATE" \ No newline at end of file + invoke upload-benchmark-results --date "$BENCHMARK_DATE" \ No newline at end of file From 1c25b7f66857b1d5bc3a594c539c3b502a641119 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 18:15:52 +0100 Subject: [PATCH 08/53] fix workflow --- sdgym/_run_benchmark/run_benchmark.py | 2 +- sdgym/_run_benchmark/upload_benchmark_results.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py index f248c2da..505f389f 100644 --- 
a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -1,7 +1,7 @@ import os -from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS, benchmark_single_table_aws from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, RESULTS_UPLOADED +from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS, benchmark_single_table_aws aws_key = os.getenv('AWS_ACCESS_KEY_ID') aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') diff --git a/sdgym/_run_benchmark/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py index 58584395..a051f7ce 100644 --- a/sdgym/_run_benchmark/upload_benchmark_results.py +++ b/sdgym/_run_benchmark/upload_benchmark_results.py @@ -5,8 +5,8 @@ import boto3 -from sdgym.result_writer import S3ResultsWriter from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, RESULTS_UPLOADED +from sdgym.result_writer import S3ResultsWriter from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer @@ -24,9 +24,9 @@ def get_run_name(date_str): return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' -def main(): - if RESULTS_UPLOADED: - print('Benchmark results have already been uploaded. Exiting.') +def main(results_uploaded=RESULTS_UPLOADED): + if results_uploaded: + print('Benchmark results have already been uploaded. Exiting.') # noqa: T201 sys.exit(0) args = parse_args() @@ -41,11 +41,9 @@ def main(): aws_key = os.getenv('AWS_ACCESS_KEY_ID') aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') bucket_path = OUTPUT_DESTINATION_AWS - summary_filepath = f'{bucket_path}/{run_name}_summary.csv' result_explorer = SDGymResultsExplorer( bucket_path, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret ) - summary = result_explorer. s3_client = boto3.client('s3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret) result_writer = S3ResultsWriter(s3_client) if not result_explorer.all_runs_complete(run_name): @@ -55,7 +53,8 @@ def main(): print(f'Run {run_name} is complete! 
Proceeding with summarization...') # noqa: T201 summary, _ = result_explorer.summarize(run_name) result_writer.write_dataframe(summary, f'{bucket_path}/{run_name}_summary.csv', index=True) - RESULT_UPLOADED = True + results_uploaded = True + if __name__ == '__main__': - main() + main(RESULTS_UPLOADED) From cfc31fde6e9d411aba635378172c6089565f209f Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 21 Jul 2025 18:20:41 +0100 Subject: [PATCH 09/53] fix run workflow --- sdgym/_run_benchmark/run_benchmark.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py index 505f389f..f6416525 100644 --- a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -1,19 +1,17 @@ import os -from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, RESULTS_UPLOADED -from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS, benchmark_single_table_aws +import sdgym._run_benchmark as run_benchmark +from sdgym.benchmark import benchmark_single_table_aws aws_key = os.getenv('AWS_ACCESS_KEY_ID') aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') -synthesizer = SDV_SINGLE_TABLE_SYNTHESIZERS datasets = ['expedia_hotel_logs', 'fake_companies'] - if __name__ == '__main__': - RESULTS_UPLOADED = False + run_benchmark.RESULTS_UPLOADED = False for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( - output_destination=OUTPUT_DESTINATION_AWS, + output_destination=run_benchmark.OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret, synthesizers=[synthesizer], From 86c2eaf188fea2090668493981158c88c4f412f6 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 12:59:05 +0100 Subject: [PATCH 10/53] add unit test upload_benchmark --- .../workflows/upload_benchmark_results.yml | 4 + sdgym/_run_benchmark/__init__.py | 5 +- sdgym/_run_benchmark/run_benchmark.py | 1 - .../upload_benchmark_results.py | 70 +++-- .../unit/_run_benchmark/test_run_benchmark.py | 0 .../test_upload_benchmark_result.py | 263 ++++++++++++++++++ 6 files changed, 323 insertions(+), 20 deletions(-) create mode 100644 tests/unit/_run_benchmark/test_run_benchmark.py create mode 100644 tests/unit/_run_benchmark/test_upload_benchmark_result.py diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index 7229876d..eef9cfa6 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -4,6 +4,10 @@ on: push: branches: - issue-425-workflow-sdgym + workflow_run: + workflows: ["Run SDGym Benchmark"] + types: + - completed workflow_dispatch: inputs: date: diff --git a/sdgym/_run_benchmark/__init__.py b/sdgym/_run_benchmark/__init__.py index 82d695ae..f447d07c 100644 --- a/sdgym/_run_benchmark/__init__.py +++ b/sdgym/_run_benchmark/__init__.py @@ -1,5 +1,4 @@ """Folder for the SDGym benchmark module.""" -OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425_2/' -UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425_2/' -RESULTS_UPLOADED = False +OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' +UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py index f6416525..10e6b827 100644 --- a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -8,7 +8,6 @@ datasets = ['expedia_hotel_logs', 
'fake_companies'] if __name__ == '__main__': - run_benchmark.RESULTS_UPLOADED = False for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( output_destination=run_benchmark.OUTPUT_DESTINATION_AWS, diff --git a/sdgym/_run_benchmark/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py index a051f7ce..41390448 100644 --- a/sdgym/_run_benchmark/upload_benchmark_results.py +++ b/sdgym/_run_benchmark/upload_benchmark_results.py @@ -1,14 +1,19 @@ import argparse +import logging import os import sys from datetime import datetime import boto3 +from botocore.exceptions import ClientError -from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, RESULTS_UPLOADED +from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS from sdgym.result_writer import S3ResultsWriter +from sdgym.s3 import parse_s3_path from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer +LOGGER = logging.getLogger(__name__) + def parse_args(): parser = argparse.ArgumentParser() @@ -21,14 +26,28 @@ def get_run_name(date_str): date = datetime.strptime(date_str, '%Y-%m-%d') except ValueError: raise ValueError(f'Invalid date format: {date_str}. Expected YYYY-MM-DD.') + return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' -def main(results_uploaded=RESULTS_UPLOADED): - if results_uploaded: - print('Benchmark results have already been uploaded. Exiting.') # noqa: T201 - sys.exit(0) +def write_uploaded_marker(s3_client, bucket, prefix, run_name): + s3_client.put_object( + Bucket=bucket, Key=f'{prefix}{run_name}/upload_complete.marker', Body=b'Upload complete' + ) + +def upload_already_done(s3_client, bucket, prefix, run_name): + try: + s3_client.head_object(Bucket=bucket, Key=f'{prefix}{run_name}/upload_complete.marker') + return True + except ClientError as e: + if e.response['Error']['Code'] == '404': + return False + + raise + + +def get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key): args = parse_args() if args.date: date_str = args.date @@ -36,25 +55,44 @@ def main(results_uploaded=RESULTS_UPLOADED): date_str = datetime.utcnow().replace(day=1).strftime('%Y-%m-%d') run_name = get_run_name(date_str) - print(f'Checking benchmark results for run: {run_name}') # noqa: T201 + bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) + s3_client = boto3.client( + 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + ) - aws_key = os.getenv('AWS_ACCESS_KEY_ID') - aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') - bucket_path = OUTPUT_DESTINATION_AWS + return run_name, s3_client, bucket, prefix + + +def upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix): result_explorer = SDGymResultsExplorer( - bucket_path, aws_access_key_id=aws_key, aws_secret_access_key=aws_secret + OUTPUT_DESTINATION_AWS, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, ) - s3_client = boto3.client('s3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret) result_writer = S3ResultsWriter(s3_client) + if not result_explorer.all_runs_complete(run_name): - print(f'Run {run_name} is not complete yet. Exiting.') # noqa: T201 + LOGGER.info(f'Run {run_name} is not complete yet. Exiting.') sys.exit(0) - print(f'Run {run_name} is complete! Proceeding with summarization...') # noqa: T201 + LOGGER.info(f'Run {run_name} is complete! 
Proceeding with summarization...') summary, _ = result_explorer.summarize(run_name) - result_writer.write_dataframe(summary, f'{bucket_path}/{run_name}_summary.csv', index=True) - results_uploaded = True + result_writer.write_dataframe( + summary, f'{OUTPUT_DESTINATION_AWS}{run_name}/{run_name}_summary.csv', index=True + ) + write_uploaded_marker(s3_client, bucket, prefix, run_name) + + +def main(): + aws_key = os.getenv('AWS_ACCESS_KEY_ID') + aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') + run_name, s3_client, bucket, prefix = get_run_name_and_s3_vars(aws_key, aws_secret) + if upload_already_done(s3_client, bucket, prefix, run_name): + LOGGER.info('Benchmark results have already been uploaded. Exiting.') + sys.exit(0) + + upload_results(aws_key, aws_secret, run_name, s3_client, bucket, prefix) if __name__ == '__main__': - main(RESULTS_UPLOADED) + main() diff --git a/tests/unit/_run_benchmark/test_run_benchmark.py b/tests/unit/_run_benchmark/test_run_benchmark.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/_run_benchmark/test_upload_benchmark_result.py b/tests/unit/_run_benchmark/test_upload_benchmark_result.py new file mode 100644 index 00000000..50f15e49 --- /dev/null +++ b/tests/unit/_run_benchmark/test_upload_benchmark_result.py @@ -0,0 +1,263 @@ +from unittest.mock import Mock, patch + +import pytest +from botocore.exceptions import ClientError + +from sdgym._run_benchmark.upload_benchmark_results import ( + get_run_name, + get_run_name_and_s3_vars, + main, + parse_args, + upload_already_done, + upload_results, + write_uploaded_marker, +) + + +@patch('sdgym._run_benchmark.upload_benchmark_results.argparse.ArgumentParser') +def test_parse_args(mock_argparse): + """Test the `parse_args` method.""" + # Setup + parser = mock_argparse.return_value + parser.parse_args.return_value = Mock(date='01-07-2025') + mock_argparse.return_value.add_argument = Mock() + + # Run + args = parse_args() + + # Assert + assert args.date == '01-07-2025' + parser.add_argument.assert_called_once_with( + '--date', type=str, help='Benchmark date (YYYY-MM-DD)' + ) + parser.parse_args.assert_called_once() + + +def test_get_run_name(): + """Test the `get_run_name` method.""" + # Setup + expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.' 
+ + # Run and Assert + assert get_run_name('2023-10-01') == 'SDGym_results_10_01_2023' + with pytest.raises(ValueError, match=expected_error_message): + get_run_name('invalid-date') + + +def test_write_uploaded_marker(): + """Test the `write_uploaded_marker` method.""" + # Setup + s3_client = Mock() + bucket = 'test-bucket' + prefix = 'test-prefix/' + run_name = 'test_run' + + # Run + write_uploaded_marker(s3_client, bucket, prefix, run_name) + + # Assert + s3_client.put_object.assert_called_once_with( + Bucket=bucket, Key=f'{prefix}{run_name}/upload_complete.marker', Body=b'Upload complete' + ) + + +def test_upload_already_done(): + """Test the `upload_already_done` method.""" + # Setup + s3_client = Mock() + bucket = 'test-bucket' + prefix = 'test-prefix/' + run_name = 'test_run' + s3_client.head_object.side_effect = [ + '', + ClientError( + error_response={'Error': {'Code': '404', 'Message': 'Not Found'}}, + operation_name='HeadObject', + ), + ClientError( + error_response={'Error': {'Code': '405', 'Message': 'Other Error'}}, + operation_name='HeadObject', + ), + ] + + # Run + result = upload_already_done(s3_client, bucket, prefix, run_name) + result_false = upload_already_done(s3_client, bucket, prefix, run_name) + with pytest.raises(ClientError): + upload_already_done(s3_client, bucket, prefix, run_name) + + # Assert + assert result is True + assert result_false is False + + +@patch('sdgym._run_benchmark.upload_benchmark_results.get_run_name') +@patch('sdgym._run_benchmark.upload_benchmark_results.boto3.client') +@patch('sdgym._run_benchmark.upload_benchmark_results.parse_s3_path') +@patch('sdgym._run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') +@patch('sdgym._run_benchmark.upload_benchmark_results.parse_args') +def test_get_run_name_and_s3_vars( + mock_parse_args, + mock_output_destination_aws, + mock_parse_s3_path, + mock_boto_client, + mock_get_run_name, +): + """Test the `get_run_name_and_s3_vars` method.""" + # Setup + mock_parse_args.return_value.date = '2023-10-01' + aws_access_key_id = 'my_access_key' + aws_secret_access_key = 'my_secret_key' + expected_result = ('SDGym_results_10_01_2023', 's3_client', 'bucket', 'prefix') + mock_get_run_name.return_value = 'SDGym_results_10_01_2023' + mock_boto_client.return_value = 's3_client' + mock_parse_s3_path.return_value = ('bucket', 'prefix') + + # Run + result = get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key) + + # Assert + assert result == expected_result + mock_get_run_name.assert_called_once_with('2023-10-01') + mock_boto_client.assert_called_once_with( + 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + ) + mock_parse_s3_path.assert_called_once_with(mock_output_destination_aws) + + +@patch('sdgym._run_benchmark.upload_benchmark_results.SDGymResultsExplorer') +@patch('sdgym._run_benchmark.upload_benchmark_results.S3ResultsWriter') +@patch('sdgym._run_benchmark.upload_benchmark_results.write_uploaded_marker') +@patch('sdgym._run_benchmark.upload_benchmark_results.LOGGER') +@patch('sdgym._run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') +def test_upload_results( + mock_output_destination_aws, + mock_logger, + mock_write_uploaded_marker, + mock_s3_results_writer, + mock_sdgym_results_explorer, +): + """Test the `upload_results` method.""" + # Setup + aws_access_key_id = 'my_access_key' + aws_secret_access_key = 'my_secret_key' + run_name = 'SDGym_results_10_01_2023' + s3_client = 's3_client' + bucket = 'bucket' + prefix = 'prefix' + 
result_explorer_instance = mock_sdgym_results_explorer.return_value + result_explorer_instance.all_runs_complete.return_value = True + result_explorer_instance.summarize.return_value = ('summary', 'results') + + # Run + upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix) + + # Assert + mock_logger.info.assert_called_once_with( + f'Run {run_name} is complete! Proceeding with summarization...' + ) + mock_sdgym_results_explorer.assert_called_once_with( + mock_output_destination_aws, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + result_explorer_instance.all_runs_complete.assert_called_once_with(run_name) + result_explorer_instance.summarize.assert_called_once_with(run_name) + mock_s3_results_writer.return_value.write_dataframe.assert_called_once() + mock_write_uploaded_marker.assert_called_once_with(s3_client, bucket, prefix, run_name) + + +@patch('sdgym._run_benchmark.upload_benchmark_results.SDGymResultsExplorer') +@patch('sdgym._run_benchmark.upload_benchmark_results.S3ResultsWriter') +@patch('sdgym._run_benchmark.upload_benchmark_results.write_uploaded_marker') +@patch('sdgym._run_benchmark.upload_benchmark_results.LOGGER') +@patch('sdgym._run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') +def test_upload_results_not_all_runs_complete( + mock_output_destination_aws, + mock_logger, + mock_write_uploaded_marker, + mock_s3_results_writer, + mock_sdgym_results_explorer, +): + """Test the `upload_results` when not all runs are complete.""" + # Setup + aws_access_key_id = 'my_access_key' + aws_secret_access_key = 'my_secret_key' + run_name = 'SDGym_results_10_01_2023' + s3_client = 's3_client' + bucket = 'bucket' + prefix = 'prefix' + result_explorer_instance = mock_sdgym_results_explorer.return_value + result_explorer_instance.all_runs_complete.return_value = False + result_explorer_instance.summarize.return_value = ('summary', 'results') + + # Run + with pytest.raises(SystemExit, match='0'): + upload_results( + aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix + ) + + # Assert + mock_logger.info.assert_called_once_with(f'Run {run_name} is not complete yet. Exiting.') + mock_sdgym_results_explorer.assert_called_once_with( + mock_output_destination_aws, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + result_explorer_instance.all_runs_complete.assert_called_once_with(run_name) + result_explorer_instance.summarize.assert_not_called() + mock_s3_results_writer.return_value.write_dataframe.assert_not_called() + mock_write_uploaded_marker.assert_not_called() + + +@patch('sdgym._run_benchmark.upload_benchmark_results.get_run_name_and_s3_vars') +@patch('sdgym._run_benchmark.upload_benchmark_results.upload_results') +@patch('sdgym._run_benchmark.upload_benchmark_results.upload_already_done') +@patch('sdgym._run_benchmark.upload_benchmark_results.LOGGER') +@patch('sdgym._run_benchmark.upload_benchmark_results.os.getenv') +def test_main_already_upload( + mock_getenv, + mock_logger, + mock_upload_already_done, + mock_upload_results, + mock_get_run_name_and_s3_vars, +): + """Test the `method` when results are already uploaded.""" + # Setup + mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] + mock_get_run_name_and_s3_vars.return_value = ('run_name', 's3_client', 'bucket', 'prefix') + mock_upload_already_done.return_value = True + expected_log_message = 'Benchmark results have already been uploaded. Exiting.' 
+ + # Run + with pytest.raises(SystemExit, match='0'): + main() + + # Assert + mock_get_run_name_and_s3_vars.assert_called_once_with('my_access_key', 'my_secret_key') + mock_logger.info.assert_called_once_with(expected_log_message) + mock_upload_results.assert_not_called() + + +@patch('sdgym._run_benchmark.upload_benchmark_results.get_run_name_and_s3_vars') +@patch('sdgym._run_benchmark.upload_benchmark_results.upload_results') +@patch('sdgym._run_benchmark.upload_benchmark_results.upload_already_done') +@patch('sdgym._run_benchmark.upload_benchmark_results.os.getenv') +def test_main( + mock_getenv, mock_upload_already_done, mock_upload_results, mock_get_run_name_and_s3_vars +): + """Test the `main` method.""" + # Setup + mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] + mock_get_run_name_and_s3_vars.return_value = ('run_name', 's3_client', 'bucket', 'prefix') + mock_upload_already_done.return_value = False + + # Run + main() + + # Assert + mock_get_run_name_and_s3_vars.assert_called_once_with('my_access_key', 'my_secret_key') + mock_upload_already_done.assert_called_once_with('s3_client', 'bucket', 'prefix', 'run_name') + mock_upload_results.assert_called_once_with( + 'my_access_key', 'my_secret_key', 'run_name', 's3_client', 'bucket', 'prefix' + ) From f7e9733a99fd71584a7979e8c9176751f5b44153 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 13:25:33 +0100 Subject: [PATCH 11/53] update write_run_id --- sdgym/benchmark.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 885d3ff6..981b1cb8 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1003,9 +1003,10 @@ def _write_run_id_file(synthesizers, job_args_list, result_writer=None): } for synthesizer in synthesizers: if synthesizer not in SDV_SINGLE_TABLE_SYNTHESIZERS: - ext_lib = EXTERNAL_SYNTHESIZER_TO_LIBRARY[synthesizer] - library_version = version(ext_lib) - metadata[f'{ext_lib}_version'] = library_version + ext_lib = EXTERNAL_SYNTHESIZER_TO_LIBRARY.get(synthesizer) + if ext_lib: + library_version = version(ext_lib) + metadata[f'{ext_lib}_version'] = library_version elif 'sdv' not in metadata.keys(): metadata['sdv_version'] = version('sdv') From d15900c0295db0d829f97fe31f0755d3ffd16e22 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 13:47:04 +0100 Subject: [PATCH 12/53] fix sving big pickles --- sdgym/benchmark.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 981b1cb8..61c132d2 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1229,8 +1229,7 @@ def _store_job_args_in_s3(output_destination, job_args_list, s3_client): job_args_key = f'{path}{job_args_key}' if path else job_args_key serialized_data = pickle.dumps(job_args_list) - encoded_data = base64.b64encode(serialized_data).decode('utf-8') - s3_client.put_object(Bucket=bucket_name, Key=job_args_key, Body=encoded_data) + s3_client.put_object(Bucket=bucket_name, Key=job_args_key, Body=serialized_data) return bucket_name, job_args_key From 755ca33027ac443a1f738830a0b3cb1f7fb32284 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 14:36:19 +0100 Subject: [PATCH 13/53] increase timeout for large data --- .github/workflows/upload_benchmark_results.yml | 4 +++- sdgym/benchmark.py | 8 ++++---- tasks.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index 
eef9cfa6..3e4893fc 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -35,6 +35,8 @@ jobs: - name: SDGym Benchmark env: + PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} + SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} @@ -47,4 +49,4 @@ jobs: fi echo "Benchmark date: $BENCHMARK_DATE" - invoke upload-benchmark-results --date "$BENCHMARK_DATE" \ No newline at end of file + invoke upload-benchmark-results --date "$BENCHMARK_DATE" diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 61c132d2..3019dda1 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1,6 +1,5 @@ """Main SDGym benchmarking module.""" -import base64 import concurrent import logging import math @@ -24,6 +23,7 @@ import numpy as np import pandas as pd import tqdm +from botocore.config import Config from sdmetrics.reports.multi_table import ( DiagnosticReport as MultiTableDiagnosticReport, ) @@ -1260,9 +1260,7 @@ def _get_s3_script_content( region_name='{region_name}' ) response = s3_client.get_object(Bucket='{bucket_name}', Key='{job_args_key}') -encoded_data = response['Body'].read().decode('utf-8') -serialized_data = base64.b64decode(encoded_data.encode('utf-8')) -job_args_list = pickle.loads(serialized_data) +job_args_list = pickle.loads(response['Body'].read()) result_writer = S3ResultsWriter(s3_client=s3_client) _write_run_id_file({synthesizers}, job_args_list, result_writer) scores = _run_jobs(None, job_args_list, False, result_writer=result_writer) @@ -1427,12 +1425,14 @@ def benchmark_single_table_aws( pandas.DataFrame: A table containing one row per synthesizer + dataset + metric. 
""" + config = Config(connect_timeout=30, read_timeout=300) s3_client = _validate_output_destination( output_destination, aws_keys={ 'aws_access_key_id': aws_access_key_id, 'aws_secret_access_key': aws_secret_access_key, }, + config=config, ) job_args_list = _generate_job_args_list( limit_dataset_size=limit_dataset_size, diff --git a/tasks.py b/tasks.py index fb296a87..61aba630 100644 --- a/tasks.py +++ b/tasks.py @@ -212,4 +212,4 @@ def sdgym_benchmark(c): def upload_benchmark_results(c, date=None): """Upload the benchmark results to S3.""" date_arg = f"--date {date}" if date else "" - c.run(f'python sdgym/_run_benchmark/upload_benchmark_results.py {date_arg}') \ No newline at end of file + c.run(f'python sdgym/_run_benchmark/upload_benchmark_results.py {date_arg}') From fc1f7a12c87bef6c8be7138b703fe119b7dbefcb Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 14:40:04 +0100 Subject: [PATCH 14/53] fix benchmark --- sdgym/benchmark.py | 6 +++--- tests/unit/test_benchmark.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 3019dda1..58d1df70 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1191,15 +1191,17 @@ def _validate_aws_inputs(output_destination, aws_access_key_id, aws_secret_acces if not bucket_name: raise ValueError(f'Invalid S3 URL: {output_destination}') + config = Config(connect_timeout=30, read_timeout=300) if aws_access_key_id and aws_secret_access_key: s3_client = boto3.client( 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, + config=config, ) else: # No credentials provided β€” rely on default session - s3_client = boto3.client('s3') + s3_client = boto3.client('s3', config=config) s3_client.head_bucket(Bucket=bucket_name) if not _check_write_permissions(s3_client, bucket_name): @@ -1425,14 +1427,12 @@ def benchmark_single_table_aws( pandas.DataFrame: A table containing one row per synthesizer + dataset + metric. 
""" - config = Config(connect_timeout=30, read_timeout=300) s3_client = _validate_output_destination( output_destination, aws_keys={ 'aws_access_key_id': aws_access_key_id, 'aws_secret_access_key': aws_secret_access_key, }, - config=config, ) job_args_list = _generate_job_args_list( limit_dataset_size=limit_dataset_size, diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 33a846d2..9faf1a5a 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -542,9 +542,12 @@ def test_setup_output_destination_aws(mock_get_run_id_increment): @patch('sdgym.benchmark.boto3.client') @patch('sdgym.benchmark._check_write_permissions') -def test_validate_aws_inputs_valid(mock_check_write_permissions, mock_boto3_client): +@patch('sdgym.benchmark.Config') +def test_validate_aws_inputs_valid(mock_config, mock_check_write_permissions, mock_boto3_client): """Test `_validate_aws_inputs` with valid inputs and credentials.""" # Setup + config_mock = Mock() + mock_config.return_value = config_mock valid_url = 's3://my-bucket/some/path' s3_client_mock = Mock() mock_boto3_client.return_value = s3_client_mock @@ -557,7 +560,7 @@ def test_validate_aws_inputs_valid(mock_check_write_permissions, mock_boto3_clie # Assert mock_boto3_client.assert_called_once_with( - 's3', aws_access_key_id='AKIA...', aws_secret_access_key='SECRET' + 's3', aws_access_key_id='AKIA...', aws_secret_access_key='SECRET', config=config_mock ) s3_client_mock.head_bucket.assert_called_once_with(Bucket='my-bucket') mock_check_write_permissions.assert_called_once_with(s3_client_mock, 'my-bucket') From ce661eaeae461a44e40002649b4e03500d8eb646 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 15:52:31 +0100 Subject: [PATCH 15/53] debug --- sdgym/_run_benchmark/__init__.py | 2 ++ sdgym/benchmark.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sdgym/_run_benchmark/__init__.py b/sdgym/_run_benchmark/__init__.py index f447d07c..150efadc 100644 --- a/sdgym/_run_benchmark/__init__.py +++ b/sdgym/_run_benchmark/__init__.py @@ -2,3 +2,5 @@ OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' +DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug' +SLACK_CHANNEL = 'sdv-alerts' diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 58d1df70..e47f49f1 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -693,6 +693,7 @@ def _run_job(args): except Exception as error: output['exception'] = error + print('LA 1') scores = _format_output( output, name, @@ -702,10 +703,12 @@ def _run_job(args): compute_privacy_score, cache_dir, ) - + print('LA 2') if synthesizer_path and result_writer: + print(synthesizer_path['benchmark_result']) result_writer.write_dataframe(scores, synthesizer_path['benchmark_result']) + print('LA 3') return scores From 54b9f31d802010508ec8b7c9f99ebd211e21afdf Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 16:10:04 +0100 Subject: [PATCH 16/53] use logger info --- sdgym/benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index e47f49f1..e3f12168 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -64,6 +64,7 @@ ) LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) DEFAULT_SYNTHESIZERS = [GaussianCopulaSynthesizer, CTGANSynthesizer] DEFAULT_DATASETS = [ 'adult', From 37ea8cf5105f526191b1affa30394af43c64735d Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 16:23:43 +0100 Subject: [PATCH 
17/53] set level to info --- sdgym/benchmark.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index e3f12168..bba1d20f 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1249,6 +1249,7 @@ def _get_s3_script_content( import base64 import pandas as pd import sdgym +import logging from sdgym.synthesizers.sdv import ( CopulaGANSynthesizer, CTGANSynthesizer, GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, @@ -1258,6 +1259,8 @@ def _get_s3_script_content( from sdgym.benchmark import _run_jobs, _write_run_id_file, _update_run_id_file from io import StringIO from sdgym.result_writer import S3ResultsWriter +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) s3_client = boto3.client( 's3', From 3d082cca2642579c37035003c7627a0b7b2a08e5 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 16:58:53 +0100 Subject: [PATCH 18/53] add logging handler --- sdgym/benchmark.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index bba1d20f..6b17531d 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1261,6 +1261,13 @@ def _get_s3_script_content( from sdgym.result_writer import S3ResultsWriter LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.INFO) +if not LOGGER.handlers: + handler = logging.StreamHandler() + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + LOGGER.addHandler(handler) + s3_client = boto3.client( 's3', From bff2d225e8dfacdfc8e76ac740c017f4ce80a16c Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 17:15:11 +0100 Subject: [PATCH 19/53] fix logs --- sdgym/benchmark.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 6b17531d..257664bf 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1259,14 +1259,17 @@ def _get_s3_script_content( from sdgym.benchmark import _run_jobs, _write_run_id_file, _update_run_id_file from io import StringIO from sdgym.result_writer import S3ResultsWriter +import sys + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + stream=sys.stdout +) + LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.INFO) -if not LOGGER.handlers: - handler = logging.StreamHandler() - handler.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - LOGGER.addHandler(handler) +LOGGER.info("This should show up on CloudWatch / logs") + s3_client = boto3.client( From 9a07f7aebace4582faf6edd0b5a3f19031ffef4c Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 18:01:44 +0100 Subject: [PATCH 20/53] clean + fix region name --- sdgym/benchmark.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 257664bf..10789875 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -694,7 +694,6 @@ def _run_job(args): except Exception as error: output['exception'] = error - print('LA 1') scores = _format_output( output, name, @@ -704,12 +703,9 @@ def _run_job(args): compute_privacy_score, cache_dir, ) - print('LA 2') if synthesizer_path and result_writer: - print(synthesizer_path['benchmark_result']) result_writer.write_dataframe(scores, synthesizer_path['benchmark_result']) - print('LA 3') return scores @@ -1201,6 +1197,7 @@ def 
_validate_aws_inputs(output_destination, aws_access_key_id, aws_secret_acces 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, + region_name='us-east-1', config=config, ) else: @@ -1393,7 +1390,7 @@ def benchmark_single_table_aws( limit_dataset_size=False, compute_quality_score=True, compute_diagnostic_score=True, - compute_privacy_score=True, + compute_privacy_score=False, sdmetrics=None, timeout=None, ): From af7afbbf85af1f6e2b4434141eb2aeba5db9cc8f Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 22 Jul 2025 18:11:58 +0100 Subject: [PATCH 21/53] update aws validation --- sdgym/benchmark.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 10789875..29b2130d 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1186,8 +1186,7 @@ def _validate_aws_inputs(output_destination, aws_access_key_id, aws_secret_acces if not output_destination.startswith('s3://'): raise ValueError("'output_destination' must be an S3 URL starting with 's3://'. ") - parsed_url = urlparse(output_destination) - bucket_name = parsed_url.netloc + bucket_name, _ = parse_s3_path(output_destination) if not bucket_name: raise ValueError(f'Invalid S3 URL: {output_destination}') From 620c8b0bece25368ca77fe7615e37c61ea5c3190 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 23 Jul 2025 10:47:18 +0100 Subject: [PATCH 22/53] debug _score --- sdgym/benchmark.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 29b2130d..81150263 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -331,6 +331,7 @@ def _compute_scores( modality, dataset_name, ): + LOGGER.info('ROM Computing scores for dataset %s', dataset_name) metrics = metrics or [] if len(metrics) > 0: metrics, metric_kwargs = get_metrics(metrics, modality='single-table') @@ -368,6 +369,7 @@ def _compute_scores( # re-inject list to multiprocessing output output['scores'] = scores + LOGGER.info('ROM before diagnostic score') if compute_diagnostic_score: start = get_utc_now() if modality == 'single_table': @@ -379,6 +381,7 @@ def _compute_scores( output['diagnostic_score_time'] = calculate_score_time(start) output['diagnostic_score'] = diagnostic_report.get_score() + LOGGER.info('ROM before quality score') if compute_quality_score: start = get_utc_now() if modality == 'single_table': @@ -387,9 +390,12 @@ def _compute_scores( quality_report = MultiTableQualityReport() quality_report.generate(real_data, synthetic_data, metadata, verbose=False) + LOGGER.info('ROM Quality report generated') output['quality_score_time'] = calculate_score_time(start) + LOGGER.info('ROM before quality score get_score') output['quality_score'] = quality_report.get_score() + LOGGER.info('ROM before privacy score') if compute_privacy_score: start = get_utc_now() num_rows = len(synthetic_data) From 41048352e6ab3312e572371ae653d1adf87f1896 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 23 Jul 2025 13:51:42 +0100 Subject: [PATCH 23/53] cleaning --- sdgym/benchmark.py | 38 +++++------------------------------- sdgym/s3.py | 1 + tests/unit/test_benchmark.py | 9 +++++++-- 3 files changed, 13 insertions(+), 35 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 81150263..66946dd5 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -45,6 +45,7 @@ from sdgym.result_writer import LocalResultsWriter from sdgym.s3 import ( S3_PREFIX, + S3_REGION, is_s3_path, parse_s3_path, write_csv, @@ -331,7 +332,6 @@ def 
_compute_scores( modality, dataset_name, ): - LOGGER.info('ROM Computing scores for dataset %s', dataset_name) metrics = metrics or [] if len(metrics) > 0: metrics, metric_kwargs = get_metrics(metrics, modality='single-table') @@ -369,7 +369,6 @@ def _compute_scores( # re-inject list to multiprocessing output output['scores'] = scores - LOGGER.info('ROM before diagnostic score') if compute_diagnostic_score: start = get_utc_now() if modality == 'single_table': @@ -381,7 +380,6 @@ def _compute_scores( output['diagnostic_score_time'] = calculate_score_time(start) output['diagnostic_score'] = diagnostic_report.get_score() - LOGGER.info('ROM before quality score') if compute_quality_score: start = get_utc_now() if modality == 'single_table': @@ -390,12 +388,9 @@ def _compute_scores( quality_report = MultiTableQualityReport() quality_report.generate(real_data, synthetic_data, metadata, verbose=False) - LOGGER.info('ROM Quality report generated') output['quality_score_time'] = calculate_score_time(start) - LOGGER.info('ROM before quality score get_score') output['quality_score'] = quality_report.get_score() - LOGGER.info('ROM before privacy score') if compute_privacy_score: start = get_utc_now() num_rows = len(synthetic_data) @@ -1202,7 +1197,7 @@ def _validate_aws_inputs(output_destination, aws_access_key_id, aws_secret_acces 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - region_name='us-east-1', + region_name=S3_REGION, config=config, ) else: @@ -1248,31 +1243,9 @@ def _get_s3_script_content( return f""" import boto3 import pickle -import base64 -import pandas as pd -import sdgym -import logging -from sdgym.synthesizers.sdv import ( - CopulaGANSynthesizer, CTGANSynthesizer, - GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, - SDVRelationalSynthesizer, SDVTabularSynthesizer, TVAESynthesizer -) -from sdgym.synthesizers import RealTabFormerSynthesizer from sdgym.benchmark import _run_jobs, _write_run_id_file, _update_run_id_file from io import StringIO from sdgym.result_writer import S3ResultsWriter -import sys - -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - stream=sys.stdout -) - -LOGGER = logging.getLogger(__name__) -LOGGER.info("This should show up on CloudWatch / logs") - - s3_client = boto3.client( 's3', @@ -1337,11 +1310,10 @@ def _run_on_aws( aws_secret_access_key, ): bucket_name, job_args_key = _store_job_args_in_s3(output_destination, job_args_list, s3_client) - region_name = 'us-east-1' script_content = _get_s3_script_content( aws_access_key_id, aws_secret_access_key, - region_name, + S3_REGION, bucket_name, job_args_key, synthesizers, @@ -1351,12 +1323,12 @@ def _run_on_aws( session = boto3.session.Session( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - region_name=region_name, + region_name=S3_REGION, ) ec2_client = session.client('ec2') print(f'This instance is being created in region: {session.region_name}') # noqa user_data_script = _get_user_data_script( - aws_access_key_id, aws_secret_access_key, region_name, script_content + aws_access_key_id, aws_secret_access_key, S3_REGION, script_content ) response = ec2_client.run_instances( ImageId='ami-080e1f13689e07408', diff --git a/sdgym/s3.py b/sdgym/s3.py index bfc22be9..e761e92a 100644 --- a/sdgym/s3.py +++ b/sdgym/s3.py @@ -10,6 +10,7 @@ import pandas as pd S3_PREFIX = 's3://' +S3_REGION = 'us-east-1' LOGGER = logging.getLogger(__name__) diff --git a/tests/unit/test_benchmark.py 
b/tests/unit/test_benchmark.py index 9faf1a5a..1c204a0a 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -25,6 +25,7 @@ benchmark_single_table_aws, ) from sdgym.result_writer import LocalResultsWriter +from sdgym.s3 import S3_REGION from sdgym.synthesizers import GaussianCopulaSynthesizer @@ -560,7 +561,11 @@ def test_validate_aws_inputs_valid(mock_config, mock_check_write_permissions, mo # Assert mock_boto3_client.assert_called_once_with( - 's3', aws_access_key_id='AKIA...', aws_secret_access_key='SECRET', config=config_mock + 's3', + aws_access_key_id='AKIA...', + aws_secret_access_key='SECRET', + region_name=S3_REGION, + config=config_mock, ) s3_client_mock.head_bucket.assert_called_once_with(Bucket='my-bucket') mock_check_write_permissions.assert_called_once_with(s3_client_mock, 'my-bucket') @@ -654,7 +659,7 @@ def test_benchmark_single_table_aws( output_destination=output_destination, compute_quality_score=True, compute_diagnostic_score=True, - compute_privacy_score=True, + compute_privacy_score=False, synthesizers=synthesizers, detailed_results_folder=None, custom_synthesizers=None, From 878005d521ff545c10954f4781ea8bc4eea75205 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 23 Jul 2025 14:35:42 +0100 Subject: [PATCH 24/53] add unit test --- sdgym/_run_benchmark/run_benchmark.py | 14 +++++--- .../upload_benchmark_results.py | 7 ++-- sdgym/benchmark.py | 3 +- .../unit/_run_benchmark/test_run_benchmark.py | 33 +++++++++++++++++++ .../test_upload_benchmark_result.py | 6 +++- tests/unit/test_benchmark.py | 2 +- 6 files changed, 55 insertions(+), 10 deletions(-) diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py index 10e6b827..51433f6e 100644 --- a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -3,11 +3,12 @@ import sdgym._run_benchmark as run_benchmark from sdgym.benchmark import benchmark_single_table_aws -aws_key = os.getenv('AWS_ACCESS_KEY_ID') -aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') -datasets = ['expedia_hotel_logs', 'fake_companies'] +datasets = ['expedia_hotel_logs', 'fake_companies'] # DEFAULT_DATASETS -if __name__ == '__main__': + +def main(): + aws_key = os.getenv('AWS_ACCESS_KEY_ID') + aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( output_destination=run_benchmark.OUTPUT_DESTINATION_AWS, @@ -15,4 +16,9 @@ aws_secret_access_key=aws_secret, synthesizers=[synthesizer], sdv_datasets=datasets, + compute_privacy_score=False, ) + + +if __name__ == '__main__': + main() diff --git a/sdgym/_run_benchmark/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py index 41390448..fb8d50a2 100644 --- a/sdgym/_run_benchmark/upload_benchmark_results.py +++ b/sdgym/_run_benchmark/upload_benchmark_results.py @@ -9,7 +9,7 @@ from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS from sdgym.result_writer import S3ResultsWriter -from sdgym.s3 import parse_s3_path +from sdgym.s3 import S3_REGION, parse_s3_path from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer LOGGER = logging.getLogger(__name__) @@ -57,7 +57,10 @@ def get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key): run_name = get_run_name(date_str) bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) s3_client = boto3.client( - 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + 's3', + aws_access_key_id=aws_access_key_id, + 
aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, ) return run_name, s3_client, bucket, prefix diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 66946dd5..66832869 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -65,7 +65,6 @@ ) LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.INFO) DEFAULT_SYNTHESIZERS = [GaussianCopulaSynthesizer, CTGANSynthesizer] DEFAULT_DATASETS = [ 'adult', @@ -1367,7 +1366,7 @@ def benchmark_single_table_aws( limit_dataset_size=False, compute_quality_score=True, compute_diagnostic_score=True, - compute_privacy_score=False, + compute_privacy_score=True, sdmetrics=None, timeout=None, ): diff --git a/tests/unit/_run_benchmark/test_run_benchmark.py b/tests/unit/_run_benchmark/test_run_benchmark.py index e69de29b..b22c1491 100644 --- a/tests/unit/_run_benchmark/test_run_benchmark.py +++ b/tests/unit/_run_benchmark/test_run_benchmark.py @@ -0,0 +1,33 @@ +from unittest.mock import call, patch + +from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS +from sdgym._run_benchmark.run_benchmark import main + + +@patch('sdgym._run_benchmark.run_benchmark.benchmark_single_table_aws') +@patch('sdgym._run_benchmark.run_benchmark.os.getenv') +def test_main(mock_getenv, mock_benchmark_single_table_aws): + """Test the `main` method.""" + # Setup + mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] + + # Run + main() + + # Assert + mock_getenv.assert_any_call('AWS_ACCESS_KEY_ID') + mock_getenv.assert_any_call('AWS_SECRET_ACCESS_KEY') + expected_calls = [] + for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: + expected_calls.append( + call( + output_destination=OUTPUT_DESTINATION_AWS, + aws_access_key_id='my_access_key', + aws_secret_access_key='my_secret_key', + synthesizers=[synthesizer], + sdv_datasets=['expedia_hotel_logs', 'fake_companies'], + compute_privacy_score=False, + ) + ) + + mock_benchmark_single_table_aws.assert_has_calls(expected_calls) diff --git a/tests/unit/_run_benchmark/test_upload_benchmark_result.py b/tests/unit/_run_benchmark/test_upload_benchmark_result.py index 50f15e49..3b137f21 100644 --- a/tests/unit/_run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/_run_benchmark/test_upload_benchmark_result.py @@ -12,6 +12,7 @@ upload_results, write_uploaded_marker, ) +from sdgym.s3 import S3_REGION @patch('sdgym._run_benchmark.upload_benchmark_results.argparse.ArgumentParser') @@ -120,7 +121,10 @@ def test_get_run_name_and_s3_vars( assert result == expected_result mock_get_run_name.assert_called_once_with('2023-10-01') mock_boto_client.assert_called_once_with( - 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, ) mock_parse_s3_path.assert_called_once_with(mock_output_destination_aws) diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 1c204a0a..6534cb9a 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -659,7 +659,7 @@ def test_benchmark_single_table_aws( output_destination=output_destination, compute_quality_score=True, compute_diagnostic_score=True, - compute_privacy_score=False, + compute_privacy_score=True, synthesizers=synthesizers, detailed_results_folder=None, custom_synthesizers=None, From 0d52be38f65d81a987ac77591b7f0281d148736e Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 23 Jul 2025 22:06:00 +0100 Subject: [PATCH 25/53] make variable name consistent 
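
Rename the ``aws_key``/``aws_secret`` arguments to ``aws_access_key_id``/
``aws_secret_access_key`` across the code base so they match the keyword
names that boto3 itself uses. A minimal sketch of the call style after the
rename (the dataset name and credential values below are placeholders, not
values taken from this patch):

    from sdgym.datasets import get_dataset_paths

    paths = get_dataset_paths(
        datasets=['adult'],
        aws_access_key_id='AKIA...',        # was: aws_key
        aws_secret_access_key='SECRET...',  # was: aws_secret
    )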
--- sdgym/_run_benchmark/run_benchmark.py | 8 +-- .../upload_benchmark_results.py | 10 ++-- sdgym/benchmark.py | 18 +++++- sdgym/cli/__main__.py | 18 ++++-- sdgym/cli/collect.py | 20 ++++--- sdgym/cli/summary.py | 10 +++- sdgym/cli/utils.py | 30 +++++----- sdgym/datasets.py | 57 ++++++++++++++----- sdgym/s3.py | 28 ++++----- .../sdgym_result_explorer/result_explorer.py | 8 +-- .../test_result_explorer.py | 7 ++- tests/unit/test_benchmark.py | 10 ++-- tests/unit/test_datasets.py | 10 ++-- tests/unit/test_s3.py | 22 +++---- tests/unit/test_summary.py | 4 +- 15 files changed, 163 insertions(+), 97 deletions(-) diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py index 51433f6e..f83a846d 100644 --- a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -7,13 +7,13 @@ def main(): - aws_key = os.getenv('AWS_ACCESS_KEY_ID') - aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( output_destination=run_benchmark.OUTPUT_DESTINATION_AWS, - aws_access_key_id=aws_key, - aws_secret_access_key=aws_secret, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, synthesizers=[synthesizer], sdv_datasets=datasets, compute_privacy_score=False, diff --git a/sdgym/_run_benchmark/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py index fb8d50a2..aab73641 100644 --- a/sdgym/_run_benchmark/upload_benchmark_results.py +++ b/sdgym/_run_benchmark/upload_benchmark_results.py @@ -87,14 +87,16 @@ def upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client def main(): - aws_key = os.getenv('AWS_ACCESS_KEY_ID') - aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY') - run_name, s3_client, bucket, prefix = get_run_name_and_s3_vars(aws_key, aws_secret) + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') + run_name, s3_client, bucket, prefix = get_run_name_and_s3_vars( + aws_access_key_id, aws_secret_access_key + ) if upload_already_done(s3_client, bucket, prefix, run_name): LOGGER.info('Benchmark results have already been uploaded. 
Exiting.')
         sys.exit(0)
 
-    upload_results(aws_key, aws_secret, run_name, s3_client, bucket, prefix)
+    upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix)
 
 
 if __name__ == '__main__':
diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py
index 66832869..1b9a94f9 100644
--- a/sdgym/benchmark.py
+++ b/sdgym/benchmark.py
@@ -242,11 +242,25 @@ def _generate_job_args_list(
     synthesizers = get_synthesizers(synthesizers + custom_synthesizers)
 
     # Get list of dataset paths
-    sdv_datasets = [] if sdv_datasets is None else get_dataset_paths(datasets=sdv_datasets)
+    aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
+    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
+    sdv_datasets = (
+        []
+        if sdv_datasets is None
+        else get_dataset_paths(
+            datasets=sdv_datasets,
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+        )
+    )
     additional_datasets = (
         []
         if additional_datasets_folder is None
-        else get_dataset_paths(bucket=additional_datasets_folder)
+        else get_dataset_paths(
+            bucket=additional_datasets_folder,
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+        )
     )
     datasets = sdv_datasets + additional_datasets
     synthesizer_names = [synthesizer['name'] for synthesizer in synthesizers]
diff --git a/sdgym/cli/__main__.py b/sdgym/cli/__main__.py
index 431531d4..59a812e8 100644
--- a/sdgym/cli/__main__.py
+++ b/sdgym/cli/__main__.py
@@ -98,12 +98,16 @@ def _download_datasets(args):
     datasets = args.datasets
     if not datasets:
         datasets = sdgym.datasets.get_available_datasets(
-            args.bucket, args.aws_key, args.aws_secret
+            args.bucket, args.aws_access_key_id, args.aws_secret_access_key
         )['name']
 
     for dataset in tqdm.tqdm(datasets):
         sdgym.datasets.load_dataset(
-            dataset, args.datasets_path, args.bucket, args.aws_key, args.aws_secret
+            dataset,
+            args.datasets_path,
+            args.bucket,
+            args.aws_access_key_id,
+            args.aws_secret_access_key,
         )
 
 
@@ -114,7 +118,9 @@ def _list_downloaded(args):
 
 
 def _list_available(args):
-    datasets = sdgym.datasets.get_available_datasets(args.bucket, args.aws_key, args.aws_secret)
+    datasets = sdgym.datasets.get_available_datasets(
+        args.bucket, args.aws_access_key_id, args.aws_secret_access_key
+    )
     _print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})
 
 
@@ -125,7 +131,7 @@ def _list_synthesizers(args):
 
 def _collect(args):
     sdgym.cli.collect.collect_results(
-        args.input_path, args.output_file, args.aws_key, args.aws_secret
+        args.input_path, args.output_file, args.aws_access_key_id, args.aws_secret_access_key
    )
 
 
@@ -133,8 +139,8 @@ def _summary(args):
     sdgym.cli.summary.make_summary_spreadsheet(
         args.input_path,
         output_path=args.output_file,
-        aws_key=args.aws_key,
-        aws_secret=args.aws_secret,
+        aws_access_key_id=args.aws_access_key_id,
+        aws_secret_access_key=args.aws_secret_access_key,
     )
diff --git a/sdgym/cli/collect.py b/sdgym/cli/collect.py
index 350fd291..8468e251 100644
--- a/sdgym/cli/collect.py
+++ b/sdgym/cli/collect.py
@@ -4,7 +4,9 @@
 from sdgym.s3 import write_csv
 
 
-def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None):
+def collect_results(
+    input_path, output_file=None, aws_access_key_id=None, aws_secret_access_key=None
+):
     """Collect the results in the given input directory.
 
     Write all the results into one csv file.
@@ -15,15 +17,15 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None) output_file (str): If ``output_file`` is provided, the consolidated results will be written there. Otherwise, they will be written to ``input_path``/results.csv. - aws_key (str): - If an ``aws_key`` is provided, the given access key id will be used to read from - and/or write to any s3 paths. - aws_secret (str): - If an ``aws_secret`` is provided, the given secret access key will be used to read - from and/or write to any s3 paths. + aws_access_key_id (str): + If an ``aws_access_key_id`` is provided, the given access key id will be used + to read from and/or write to any s3 paths. + aws_secret_access_key (str): + If an ``aws_secret_access_key`` is provided, the given secret access key will + be used to read from and/or write to any s3 paths. """ print(f'Reading results from {input_path}') # noqa: T201 - scores = read_csv_from_path(input_path, aws_key, aws_secret) + scores = read_csv_from_path(input_path, aws_access_key_id, aws_secret_access_key) scores = scores.drop_duplicates() if output_file: @@ -32,4 +34,4 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None) output = f'{input_path}/results.csv' print(f'Storing results at {output}') # noqa: T201 - write_csv(scores, output, aws_key, aws_secret) + write_csv(scores, output, aws_access_key_id, aws_secret_access_key) diff --git a/sdgym/cli/summary.py b/sdgym/cli/summary.py index 06d872a3..cbbb9a98 100644 --- a/sdgym/cli/summary.py +++ b/sdgym/cli/summary.py @@ -289,7 +289,11 @@ def _add_summary(data, modality, baselines, writer): def make_summary_spreadsheet( - results_csv_path, output_path=None, baselines=None, aws_key=None, aws_secret=None + results_csv_path, + output_path=None, + baselines=None, + aws_access_key_id=None, + aws_secret_access_key=None, ): """Create a spreadsheet document organizing information from results. @@ -307,7 +311,7 @@ def make_summary_spreadsheet( Optional dict mapping modalities to a list of baseline model names. If not provided, a default dict is used. """ - results = read_csv(results_csv_path, aws_key, aws_secret) + results = read_csv(results_csv_path, aws_access_key_id, aws_secret_access_key) data = preprocess(results) baselines = baselines or MODALITY_BASELINES output_path = output_path or re.sub('.csv$', '.xlsx', results_csv_path) @@ -319,4 +323,4 @@ def make_summary_spreadsheet( _add_summary(df, modality, modality_baselines, writer) writer.save() - write_file(output.getvalue(), output_path, aws_key, aws_secret) + write_file(output.getvalue(), output_path, aws_access_key_id, aws_secret_access_key) diff --git a/sdgym/cli/utils.py b/sdgym/cli/utils.py index 77346277..1d1425b4 100644 --- a/sdgym/cli/utils.py +++ b/sdgym/cli/utils.py @@ -11,7 +11,7 @@ from sdgym.s3 import get_s3_client, is_s3_path, parse_s3_path -def read_file(path, aws_key, aws_secret): +def read_file(path, aws_access_key_id, aws_secret_access_key): """Read file from path. The path can either be a local path or an s3 directory. @@ -19,9 +19,9 @@ def read_file(path, aws_key, aws_secret): Args: path (str): The path to the file. - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. Returns: @@ -29,7 +29,7 @@ def read_file(path, aws_key, aws_secret): The content of the file in bytes. 
""" if is_s3_path(path): - s3 = get_s3_client(aws_key, aws_secret) + s3 = get_s3_client(aws_access_key_id, aws_secret_access_key) bucket_name, key = parse_s3_path(path) obj = s3.get_object(Bucket=bucket_name, Key=key) contents = obj['Body'].read() @@ -40,7 +40,7 @@ def read_file(path, aws_key, aws_secret): return contents -def read_csv(path, aws_key, aws_secret): +def read_csv(path, aws_access_key_id, aws_secret_access_key): """Read csv file from path. The path can either be a local path or an s3 directory. @@ -48,20 +48,20 @@ def read_csv(path, aws_key, aws_secret): Args: path (str): The path to the csv file. - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. Returns: pandas.DataFrame: A DataFrame containing the contents of the csv file. """ - contents = read_file(path, aws_key, aws_secret) + contents = read_file(path, aws_access_key_id, aws_secret_access_key) return pd.read_csv(io.BytesIO(contents)) -def read_csv_from_path(path, aws_key, aws_secret): +def read_csv_from_path(path, aws_access_key_id, aws_secret_access_key): """Read all csv content within a path. All csv content within a path will be read and returned in a @@ -70,9 +70,9 @@ def read_csv_from_path(path, aws_key, aws_secret): Args: path (str): The path to read from, which can be either local or an s3 path. - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. Returns: @@ -81,13 +81,17 @@ def read_csv_from_path(path, aws_key, aws_secret): """ csv_contents = [] if is_s3_path(path): - s3 = get_s3_client(aws_key, aws_secret) + s3 = get_s3_client(aws_access_key_id, aws_secret_access_key) bucket_name, key_prefix = parse_s3_path(path) resp = s3.list_objects(Bucket=bucket_name, Prefix=key_prefix) csv_files = [f for f in resp['Contents'] if f['Key'].endswith('.csv')] for csv_file in csv_files: csv_file_key = csv_file['Key'] - csv_contents.append(read_csv(f's3://{bucket_name}/{csv_file_key}', aws_key, aws_secret)) + csv_contents.append( + read_csv( + f's3://{bucket_name}/{csv_file_key}', aws_access_key_id, aws_secret_access_key + ) + ) else: run_path = pathlib.Path(path) diff --git a/sdgym/datasets.py b/sdgym/datasets.py index 13a3b237..b04b00d5 100644 --- a/sdgym/datasets.py +++ b/sdgym/datasets.py @@ -28,7 +28,12 @@ def _get_bucket_name(bucket): def _download_dataset( - modality, dataset_name, datasets_path=None, bucket=None, aws_key=None, aws_secret=None + modality, + dataset_name, + datasets_path=None, + bucket=None, + aws_access_key_id=None, + aws_secret_access_key=None, ): """Download a dataset and extract it into the given ``datasets_path``.""" datasets_path = datasets_path or DATASETS_PATH / dataset_name @@ -36,7 +41,7 @@ def _download_dataset( bucket_name = _get_bucket_name(bucket) LOGGER.info('Downloading dataset %s from %s', dataset_name, bucket) - s3 = get_s3_client(aws_key, aws_secret) + s3 = get_s3_client(aws_access_key_id, aws_secret_access_key) obj = s3.get_object(Bucket=bucket_name, Key=f'{modality.upper()}/{dataset_name}.zip') bytes_io = io.BytesIO(obj['Body'].read()) @@ -45,7 +50,14 @@ def _download_dataset( zf.extractall(datasets_path) -def _get_dataset_path(modality, dataset, datasets_path, bucket=None, aws_key=None, aws_secret=None): 
+def _get_dataset_path( + modality, + dataset, + datasets_path, + bucket=None, + aws_access_key_id=None, + aws_secret_access_key=None, +): dataset = Path(dataset) if dataset.exists(): return dataset @@ -62,7 +74,12 @@ def _get_dataset_path(modality, dataset, datasets_path, bucket=None, aws_key=Non return local_path _download_dataset( - modality, dataset, dataset_path, bucket=bucket, aws_key=aws_key, aws_secret=aws_secret + modality, + dataset, + dataset_path, + bucket=bucket, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, ) return dataset_path @@ -88,8 +105,8 @@ def load_dataset( dataset, datasets_path=None, bucket=None, - aws_key=None, - aws_secret=None, + aws_access_key_id=None, + aws_secret_access_key=None, limit_dataset_size=None, ): """Get the data and metadata of a dataset. @@ -105,9 +122,9 @@ def load_dataset( bucket (str): The AWS bucket where to get the dataset. This will only be used if both ``dataset`` and ``dataset_path`` don't exist. - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. limit_dataset_size (bool): Use this flag to limit the size of the datasets for faster evaluation. If ``True``, @@ -118,7 +135,9 @@ def load_dataset( pd.DataFrame, dict: The data and medatata of a dataset. """ - dataset_path = _get_dataset_path(modality, dataset, datasets_path, bucket, aws_key, aws_secret) + dataset_path = _get_dataset_path( + modality, dataset, datasets_path, bucket, aws_access_key_id, aws_secret_access_key + ) with open(dataset_path / f'{dataset_path.name}.csv') as data_csv: data = pd.read_csv(data_csv) @@ -153,12 +172,14 @@ def get_available_datasets(modality='single_table'): return _get_available_datasets(modality) -def _get_available_datasets(modality, bucket=None, aws_key=None, aws_secret=None): +def _get_available_datasets( + modality, bucket=None, aws_access_key_id=None, aws_secret_access_key=None +): if modality not in MODALITIES: modalities_list = ', '.join(MODALITIES) raise ValueError(f'Modality `{modality}` not recognized. Must be one of {modalities_list}') - s3 = get_s3_client(aws_key, aws_secret) + s3 = get_s3_client(aws_access_key_id, aws_secret_access_key) bucket = bucket or BUCKET bucket_name = _get_bucket_name(bucket) @@ -182,7 +203,11 @@ def _get_available_datasets(modality, bucket=None, aws_key=None, aws_secret=None def get_dataset_paths( - datasets=None, datasets_path=None, bucket=None, aws_key=None, aws_secret=None + datasets=None, + datasets_path=None, + bucket=None, + aws_access_key_id=None, + aws_secret_access_key=None, ): """Build the full path to datasets and ensure they exist. @@ -193,9 +218,9 @@ def get_dataset_paths( The path of the datasets. bucket (str): The AWS bucket where to get the dataset. - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. 
Returns: @@ -230,6 +255,8 @@ def get_dataset_paths( ].tolist() return [ - _get_dataset_path('single_table', dataset, datasets_path, bucket, aws_key, aws_secret) + _get_dataset_path( + 'single_table', dataset, datasets_path, bucket, aws_access_key_id, aws_secret_access_key + ) for dataset in datasets ] diff --git a/sdgym/s3.py b/sdgym/s3.py index e761e92a..8d5ecdd1 100644 --- a/sdgym/s3.py +++ b/sdgym/s3.py @@ -50,14 +50,14 @@ def parse_s3_path(path): return bucket_name, key_prefix -def get_s3_client(aws_key=None, aws_secret=None): +def get_s3_client(aws_access_key_id=None, aws_secret_access_key=None): """Get the boto client for interfacing with AWS s3. Args: - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. @@ -65,9 +65,11 @@ def get_s3_client(aws_key=None, aws_secret=None): boto3.session.Session.client: The s3 client that can be used to read / write to s3. """ - if aws_key is not None and aws_secret is not None: + if aws_access_key_id is not None and aws_secret_access_key is not None: # credentials available - return boto3.client('s3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret) + return boto3.client( + 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + ) else: if boto3.Session().get_credentials(): # credentials available and will be detected automatically @@ -79,7 +81,7 @@ def get_s3_client(aws_key=None, aws_secret=None): return boto3.client('s3', config=config) -def write_file(data_contents, path, aws_key, aws_secret): +def write_file(data_contents, path, aws_access_key_id, aws_secret_access_key): """Write a file to the given path with the given contents. If the path is an s3 directory, we will use the given aws credentials @@ -91,10 +93,10 @@ def write_file(data_contents, path, aws_key, aws_secret): path (str): The path to write the file to, which can be either local or an s3 path. - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. @@ -110,7 +112,7 @@ def write_file(data_contents, path, aws_key, aws_secret): write_mode = 'wb' if is_s3_path(path): - s3 = get_s3_client(aws_key, aws_secret) + s3 = get_s3_client(aws_access_key_id, aws_secret_access_key) bucket_name, key = parse_s3_path(path) s3.put_object( Bucket=bucket_name, @@ -126,7 +128,7 @@ def write_file(data_contents, path, aws_key, aws_secret): f.write(data_contents) -def write_csv(data, path, aws_key, aws_secret): +def write_csv(data, path, aws_access_key_id, aws_secret_access_key): """Write a csv file to the given path with the given contents. If the path is an s3 directory, we will use the given aws credentials @@ -138,10 +140,10 @@ def write_csv(data, path, aws_key, aws_secret): path (str): The path to write the file to, which can be either local or an s3 path. - aws_key (str): + aws_access_key_id (str): The access key id that will be used to communicate with s3, if provided. - aws_secret (str): + aws_secret_access_key (str): The secret access key that will be used to communicate with s3, if provided. 
@@ -149,7 +151,7 @@ def write_csv(data, path, aws_key, aws_secret): none """ data_contents = data.to_csv(index=False).encode('utf-8') - write_file(data_contents, path, aws_key, aws_secret) + write_file(data_contents, path, aws_access_key_id, aws_secret_access_key) def _parse_s3_paths(s3_paths_dict): diff --git a/sdgym/sdgym_result_explorer/result_explorer.py b/sdgym/sdgym_result_explorer/result_explorer.py index 068d588e..889fde95 100644 --- a/sdgym/sdgym_result_explorer/result_explorer.py +++ b/sdgym/sdgym_result_explorer/result_explorer.py @@ -65,8 +65,8 @@ def load_real_data(self, dataset_name): if dataset_name in DEFAULT_DATASETS: dataset_path = get_dataset_paths( datasets=[dataset_name], - aws_key=self.aws_access_key_id, - aws_secret=self.aws_secret_access_key, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, )[0] else: raise ValueError( @@ -77,8 +77,8 @@ def load_real_data(self, dataset_name): data, _ = load_dataset( 'single_table', dataset_path, - aws_key=self.aws_access_key_id, - aws_secret=self.aws_secret_access_key, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, ) return data diff --git a/tests/unit/sdgym_result_explorer/test_result_explorer.py b/tests/unit/sdgym_result_explorer/test_result_explorer.py index 3f64a78c..a9dd27bf 100644 --- a/tests/unit/sdgym_result_explorer/test_result_explorer.py +++ b/tests/unit/sdgym_result_explorer/test_result_explorer.py @@ -191,10 +191,13 @@ def test_load_real_data(self, mock_get_dataset_paths, mock_load_dataset, tmp_pat # Assert mock_get_dataset_paths.assert_called_once_with( - datasets=[dataset_name], aws_key=None, aws_secret=None + datasets=[dataset_name], aws_access_key_id=None, aws_secret_access_key=None ) mock_load_dataset.assert_called_once_with( - 'single_table', 'path/to/adult/dataset', aws_key=None, aws_secret=None + 'single_table', + 'path/to/adult/dataset', + aws_access_key_id=None, + aws_secret_access_key=None, ) pd.testing.assert_frame_equal(real_data, expected_data) diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 6534cb9a..4073d618 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -372,21 +372,23 @@ def test__validate_output_destination(tmp_path): @patch('sdgym.benchmark._validate_aws_inputs') -def test__validate_output_destination_with_aws_keys(mock_validate): +def test__validate_output_destination_with_aws_access_key_ids(mock_validate): """Test the `_validate_output_destination` function with AWS keys.""" # Setup output_destination = 's3://my-bucket/path/to/file' - aws_keys = { + aws_access_key_ids = { 'aws_access_key_id': 'mock_access_key', 'aws_secret_access_key': 'mock_secret_key', } # Run - _validate_output_destination(output_destination, aws_keys) + _validate_output_destination(output_destination, aws_access_key_ids) # Assert mock_validate.assert_called_once_with( - output_destination, aws_keys['aws_access_key_id'], aws_keys['aws_secret_access_key'] + output_destination, + aws_access_key_ids['aws_access_key_id'], + aws_access_key_ids['aws_secret_access_key'], ) diff --git a/tests/unit/test_datasets.py b/tests/unit/test_datasets.py index 1c9cdd04..a575c0f4 100644 --- a/tests/unit/test_datasets.py +++ b/tests/unit/test_datasets.py @@ -110,8 +110,8 @@ def test__download_dataset_private_bucket(boto3_mock, tmpdir): modality = 'single_table' dataset = 'my_dataset' bucket = 's3://my_bucket' - aws_key = 'my_key' - aws_secret = 'my_secret' + aws_access_key_id = 'my_key' + 
aws_secret_access_key = 'my_secret' bytesio = io.BytesIO() with ZipFile(bytesio, mode='w') as zf: @@ -130,13 +130,13 @@ def test__download_dataset_private_bucket(boto3_mock, tmpdir): dataset, datasets_path=str(tmpdir), bucket=bucket, - aws_key=aws_key, - aws_secret=aws_secret, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, ) # asserts boto3_mock.client.assert_called_once_with( - 's3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret + 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key ) s3_mock.get_object.assert_called_once_with( Bucket='my_bucket', Key=f'{modality.upper()}/{dataset}.zip' diff --git a/tests/unit/test_s3.py b/tests/unit/test_s3.py index fff365ed..892642f9 100644 --- a/tests/unit/test_s3.py +++ b/tests/unit/test_s3.py @@ -120,8 +120,8 @@ def test_write_file(tmpdir): Input: - contents of the local file - path to the local file - - aws_key is None - - aws_secret is None + - aws_access_key_id is None + - aws_secret_access_key is None Output: - None @@ -151,14 +151,14 @@ def test_write_file_s3(boto3_mock): Input: - contents of the s3 file - path to the s3 file location - - aws_key for aws authentication - - aws_secret for aws authentication + - aws_access_key_id for aws authentication + - aws_secret_access_key for aws authentication Output: - None Side effects: - - s3 client creation with aws credentials (aws_key, aws_secret) + - s3 client creation with aws credentials (aws_access_key_id, aws_secret_access_key) - s3 method call to create a file in the given bucket with the given contents """ @@ -167,18 +167,18 @@ def test_write_file_s3(boto3_mock): bucket_name = 'my-bucket' key = 'test.txt' path = f's3://{bucket_name}/{key}' - aws_key = 'my-key' - aws_secret = 'my-secret' + aws_access_key_id = 'my-key' + aws_secret_access_key = 'my-secret' s3_mock = Mock() boto3_mock.client.return_value = s3_mock # run - write_file(content_str.encode('utf-8'), path, aws_key, aws_secret) + write_file(content_str.encode('utf-8'), path, aws_access_key_id, aws_secret_access_key) # asserts boto3_mock.client.assert_called_once_with( - 's3', aws_access_key_id=aws_key, aws_secret_access_key=aws_secret + 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key ) s3_mock.put_object.assert_called_once_with( Bucket=bucket_name, @@ -199,8 +199,8 @@ def test_write_csv(write_file_mock): Input: - data to be written to the csv file - path of the desired csv file - - aws_key is None - - aws_secret is None + - aws_access_key_id is None + - aws_secret_access_key is None Output: - None diff --git a/tests/unit/test_summary.py b/tests/unit/test_summary.py index b34d6fdd..650ec3ec 100644 --- a/tests/unit/test_summary.py +++ b/tests/unit/test_summary.py @@ -26,8 +26,8 @@ def test_make_summary_spreadsheet( The ``make_summary_spreadsheet`` function is expected to extract the correct columns from the input file and add them to the correct sheets. It should - then use the ``aws_key`` and ``aws_secret`` provided to call ``sdgym.s3.write_file`` - and save the output document. + then use the ``aws_access_key_id`` and ``aws_secret_access_key`` provided to + call ``sdgym.s3.write_file`` and save the output document. Input: - file path to results csv. 
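
Patch 26 below pins every S3 client to the ``S3_REGION`` constant
('us-east-1') that patch 23 added to ``sdgym/s3.py``. A minimal sketch of
the pattern being applied, assuming placeholder credentials and bucket
name:

    import boto3

    S3_REGION = 'us-east-1'

    s3_client = boto3.client(
        's3',
        aws_access_key_id='AKIA...',
        aws_secret_access_key='SECRET...',
        region_name=S3_REGION,
    )
    s3_client.head_bucket(Bucket='my-bucket')  # fails fast if the bucket is unreachable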
From cf270c46fbf95ecf83123b7c4e5803ab37bb3034 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 23 Jul 2025 22:12:07 +0100 Subject: [PATCH 26/53] add region name --- sdgym/s3.py | 6 +++++- tests/unit/test_datasets.py | 6 +++++- tests/unit/test_s3.py | 7 ++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/sdgym/s3.py b/sdgym/s3.py index 8d5ecdd1..d271f2c5 100644 --- a/sdgym/s3.py +++ b/sdgym/s3.py @@ -68,7 +68,10 @@ def get_s3_client(aws_access_key_id=None, aws_secret_access_key=None): if aws_access_key_id is not None and aws_secret_access_key is not None: # credentials available return boto3.client( - 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, ) else: if boto3.Session().get_credentials(): @@ -206,6 +209,7 @@ def _get_s3_client(output_destination, aws_access_key_id=None, aws_secret_access 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, ) else: s3_client = boto3.client('s3') diff --git a/tests/unit/test_datasets.py b/tests/unit/test_datasets.py index a575c0f4..f498d5f3 100644 --- a/tests/unit/test_datasets.py +++ b/tests/unit/test_datasets.py @@ -14,6 +14,7 @@ get_dataset_paths, load_dataset, ) +from sdgym.s3 import S3_REGION class AnyConfigWith: @@ -136,7 +137,10 @@ def test__download_dataset_private_bucket(boto3_mock, tmpdir): # asserts boto3_mock.client.assert_called_once_with( - 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, ) s3_mock.get_object.assert_called_once_with( Bucket='my_bucket', Key=f'{modality.upper()}/{dataset}.zip' diff --git a/tests/unit/test_s3.py b/tests/unit/test_s3.py index 892642f9..757653b3 100644 --- a/tests/unit/test_s3.py +++ b/tests/unit/test_s3.py @@ -9,6 +9,7 @@ from botocore.exceptions import NoCredentialsError from sdgym.s3 import ( + S3_REGION, _get_s3_client, _upload_dataframe_to_s3, _upload_pickle_to_s3, @@ -178,7 +179,10 @@ def test_write_file_s3(boto3_mock): # asserts boto3_mock.client.assert_called_once_with( - 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, ) s3_mock.put_object.assert_called_once_with( Bucket=bucket_name, @@ -307,6 +311,7 @@ def test__get_s3_client_with_credentials(mock_boto_client): 's3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, ) mock_s3_client.head_bucket.assert_called_once_with(Bucket='my-bucket') From 7fc725bc01a455488e0303ab04adb034be7bf926 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 24 Jul 2025 10:27:59 +0100 Subject: [PATCH 27/53] improve datetime logic --- .github/workflows/run_benchmark.yml | 4 +- .../workflows/upload_benchmark_results.yml | 15 +----- sdgym/_run_benchmark/_utils.py | 10 ++++ sdgym/_run_benchmark/run_benchmark.py | 29 ++++++++++++ .../upload_benchmark_results.py | 33 +++++-------- tasks.py | 2 +- tests/unit/_run_benchmark/test__utils.py | 14 ++++++ .../unit/_run_benchmark/test_run_benchmark.py | 10 +++- .../test_upload_benchmark_result.py | 47 +++---------------- 9 files changed, 85 insertions(+), 79 deletions(-) create mode 100644 sdgym/_run_benchmark/_utils.py create mode 100644 
tests/unit/_run_benchmark/test__utils.py diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index 40e967e7..34505a0e 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -9,7 +9,7 @@ on: - cron: '0 5 1 * *' jobs: - sdgym-benchmark: + run-sdgym-benchmark: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -31,4 +31,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - run: invoke sdgym-benchmark + run: invoke run-sdgym-benchmark diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index 3e4893fc..21b038c6 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -9,15 +9,11 @@ on: types: - completed workflow_dispatch: - inputs: - date: - description: 'Benchmark date (YYYY-MM-DD), defaults to the first of the current month' - required: false schedule: - cron: '0 6 * * *' jobs: - sdgym-benchmark: + upload-sdgym-benchmark: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -42,11 +38,4 @@ jobs: AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} run: | - if [ -z "${{ github.event.inputs.date }}" ]; then - BENCHMARK_DATE=$(date -u "+%Y-%m-01") - else - BENCHMARK_DATE="${{ github.event.inputs.date }}" - fi - - echo "Benchmark date: $BENCHMARK_DATE" - invoke upload-benchmark-results --date "$BENCHMARK_DATE" + invoke upload-benchmark-results diff --git a/sdgym/_run_benchmark/_utils.py b/sdgym/_run_benchmark/_utils.py new file mode 100644 index 00000000..7c73fef7 --- /dev/null +++ b/sdgym/_run_benchmark/_utils.py @@ -0,0 +1,10 @@ +from datetime import datetime + + +def get_run_name(date_str): + try: + date = datetime.strptime(date_str, '%Y-%m-%d') + except ValueError: + raise ValueError(f'Invalid date format: {date_str}. 
Expected YYYY-MM-DD.') + + return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py index f83a846d..1f1166af 100644 --- a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -1,14 +1,41 @@ +import json import os +from datetime import datetime, timezone import sdgym._run_benchmark as run_benchmark +from sdgym._run_benchmark._utils import get_run_name from sdgym.benchmark import benchmark_single_table_aws +from sdgym.s3 import get_s3_client, parse_s3_path datasets = ['expedia_hotel_logs', 'fake_companies'] # DEFAULT_DATASETS +def append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str): + s3_client = get_s3_client( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + bucket, prefix = parse_s3_path(run_benchmark.OUTPUT_DESTINATION_AWS) + key = '_BENCHMARK_DATES.json' + try: + object = s3_client.get_object(Bucket=bucket, Key=f'{prefix}{key}') + body = object['Body'].read().decode('utf-8') + data = json.loads(body) + except s3_client.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'NoSuchKey': + data = {'runs': []} + else: + raise RuntimeError(f'Failed to read {key} from S3: {e}') + + data['runs'].append({'date': date_str, 'run_name': get_run_name(date_str)}) + data['runs'] = sorted(data['runs'], key=lambda x: x['date']) + s3_client.put_object(Bucket=bucket, Key=f'{prefix}{key}', Body=json.dumps(data).encode('utf-8')) + + def main(): aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') + date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d') for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: benchmark_single_table_aws( output_destination=run_benchmark.OUTPUT_DESTINATION_AWS, @@ -19,6 +46,8 @@ def main(): compute_privacy_score=False, ) + append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str) + if __name__ == '__main__': main() diff --git a/sdgym/_run_benchmark/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py index aab73641..710a7987 100644 --- a/sdgym/_run_benchmark/upload_benchmark_results.py +++ b/sdgym/_run_benchmark/upload_benchmark_results.py @@ -1,8 +1,7 @@ -import argparse +import json import logging import os import sys -from datetime import datetime import boto3 from botocore.exceptions import ClientError @@ -15,19 +14,15 @@ LOGGER = logging.getLogger(__name__) -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--date', type=str, help='Benchmark date (YYYY-MM-DD)') - return parser.parse_args() - - -def get_run_name(date_str): +def get_latest_run_from_file(s3_client, bucket, key): try: - date = datetime.strptime(date_str, '%Y-%m-%d') - except ValueError: - raise ValueError(f'Invalid date format: {date_str}. 
Expected YYYY-MM-DD.') - - return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' + object = s3_client.get_object(Bucket=bucket, Key=key) + body = object['Body'].read().decode('utf-8') + data = json.loads(body) + latest = sorted(data['runs'], key=lambda x: x['date'])[-1] + return latest['run_name'] + except s3_client.exceptions.ClientError as e: + raise RuntimeError(f'Failed to read {key} from S3: {e}') def write_uploaded_marker(s3_client, bucket, prefix, run_name): @@ -48,13 +43,6 @@ def upload_already_done(s3_client, bucket, prefix, run_name): def get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key): - args = parse_args() - if args.date: - date_str = args.date - else: - date_str = datetime.utcnow().replace(day=1).strftime('%Y-%m-%d') - - run_name = get_run_name(date_str) bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) s3_client = boto3.client( 's3', @@ -62,6 +50,7 @@ def get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key): aws_secret_access_key=aws_secret_access_key, region_name=S3_REGION, ) + run_name = get_latest_run_from_file(s3_client, bucket, f'{prefix}_BENCHMARK_DATES.json') return run_name, s3_client, bucket, prefix @@ -75,7 +64,7 @@ def upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client result_writer = S3ResultsWriter(s3_client) if not result_explorer.all_runs_complete(run_name): - LOGGER.info(f'Run {run_name} is not complete yet. Exiting.') + LOGGER.warning(f'Run {run_name} is not complete yet. Exiting.') sys.exit(0) LOGGER.info(f'Run {run_name} is complete! Proceeding with summarization...') diff --git a/tasks.py b/tasks.py index 61aba630..dadc65e9 100644 --- a/tasks.py +++ b/tasks.py @@ -204,7 +204,7 @@ def rmdir(c, path): pass @task -def sdgym_benchmark(c): +def run_sdgym_benchmark(c): """Run the SDGym benchmark.""" c.run('python sdgym/_run_benchmark/run_benchmark.py') diff --git a/tests/unit/_run_benchmark/test__utils.py b/tests/unit/_run_benchmark/test__utils.py new file mode 100644 index 00000000..b11ed59b --- /dev/null +++ b/tests/unit/_run_benchmark/test__utils.py @@ -0,0 +1,14 @@ +import pytest + +from sdgym._run_benchmark._utils import get_run_name + + +def test_get_run_name(): + """Test the `get_run_name` method.""" + # Setup + expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.' 
+ + # Run and Assert + assert get_run_name('2023-10-01') == 'SDGym_results_10_01_2023' + with pytest.raises(ValueError, match=expected_error_message): + get_run_name('invalid-date') diff --git a/tests/unit/_run_benchmark/test_run_benchmark.py b/tests/unit/_run_benchmark/test_run_benchmark.py index b22c1491..2792e179 100644 --- a/tests/unit/_run_benchmark/test_run_benchmark.py +++ b/tests/unit/_run_benchmark/test_run_benchmark.py @@ -1,3 +1,4 @@ +from datetime import datetime, timezone from unittest.mock import call, patch from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS @@ -6,10 +7,12 @@ @patch('sdgym._run_benchmark.run_benchmark.benchmark_single_table_aws') @patch('sdgym._run_benchmark.run_benchmark.os.getenv') -def test_main(mock_getenv, mock_benchmark_single_table_aws): +@patch('sdgym._run_benchmark.run_benchmark.append_benchmark_run') +def test_main(mock_append_benchmark_run, mock_getenv, mock_benchmark_single_table_aws): """Test the `main` method.""" # Setup mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] + date = datetime.now(timezone.utc).strftime('%Y-%m-%d') # Run main() @@ -31,3 +34,8 @@ def test_main(mock_getenv, mock_benchmark_single_table_aws): ) mock_benchmark_single_table_aws.assert_has_calls(expected_calls) + mock_append_benchmark_run.assert_called_once_with( + 'my_access_key', + 'my_secret_key', + date, + ) diff --git a/tests/unit/_run_benchmark/test_upload_benchmark_result.py b/tests/unit/_run_benchmark/test_upload_benchmark_result.py index 3b137f21..dcc5a755 100644 --- a/tests/unit/_run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/_run_benchmark/test_upload_benchmark_result.py @@ -4,10 +4,8 @@ from botocore.exceptions import ClientError from sdgym._run_benchmark.upload_benchmark_results import ( - get_run_name, get_run_name_and_s3_vars, main, - parse_args, upload_already_done, upload_results, write_uploaded_marker, @@ -15,36 +13,6 @@ from sdgym.s3 import S3_REGION -@patch('sdgym._run_benchmark.upload_benchmark_results.argparse.ArgumentParser') -def test_parse_args(mock_argparse): - """Test the `parse_args` method.""" - # Setup - parser = mock_argparse.return_value - parser.parse_args.return_value = Mock(date='01-07-2025') - mock_argparse.return_value.add_argument = Mock() - - # Run - args = parse_args() - - # Assert - assert args.date == '01-07-2025' - parser.add_argument.assert_called_once_with( - '--date', type=str, help='Benchmark date (YYYY-MM-DD)' - ) - parser.parse_args.assert_called_once() - - -def test_get_run_name(): - """Test the `get_run_name` method.""" - # Setup - expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.' 
- - # Run and Assert - assert get_run_name('2023-10-01') == 'SDGym_results_10_01_2023' - with pytest.raises(ValueError, match=expected_error_message): - get_run_name('invalid-date') - - def test_write_uploaded_marker(): """Test the `write_uploaded_marker` method.""" # Setup @@ -92,34 +60,30 @@ def test_upload_already_done(): assert result_false is False -@patch('sdgym._run_benchmark.upload_benchmark_results.get_run_name') @patch('sdgym._run_benchmark.upload_benchmark_results.boto3.client') @patch('sdgym._run_benchmark.upload_benchmark_results.parse_s3_path') @patch('sdgym._run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') -@patch('sdgym._run_benchmark.upload_benchmark_results.parse_args') +@patch('sdgym._run_benchmark.upload_benchmark_results.get_latest_run_from_file') def test_get_run_name_and_s3_vars( - mock_parse_args, + mock_get_latest_run_from_file, mock_output_destination_aws, mock_parse_s3_path, mock_boto_client, - mock_get_run_name, ): """Test the `get_run_name_and_s3_vars` method.""" # Setup - mock_parse_args.return_value.date = '2023-10-01' aws_access_key_id = 'my_access_key' aws_secret_access_key = 'my_secret_key' expected_result = ('SDGym_results_10_01_2023', 's3_client', 'bucket', 'prefix') - mock_get_run_name.return_value = 'SDGym_results_10_01_2023' mock_boto_client.return_value = 's3_client' mock_parse_s3_path.return_value = ('bucket', 'prefix') + mock_get_latest_run_from_file.return_value = 'SDGym_results_10_01_2023' # Run result = get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key) # Assert assert result == expected_result - mock_get_run_name.assert_called_once_with('2023-10-01') mock_boto_client.assert_called_once_with( 's3', aws_access_key_id=aws_access_key_id, @@ -127,6 +91,9 @@ def test_get_run_name_and_s3_vars( region_name=S3_REGION, ) mock_parse_s3_path.assert_called_once_with(mock_output_destination_aws) + mock_get_latest_run_from_file.assert_called_once_with( + 's3_client', 'bucket', 'prefix_BENCHMARK_DATES.json' + ) @patch('sdgym._run_benchmark.upload_benchmark_results.SDGymResultsExplorer') @@ -202,7 +169,7 @@ def test_upload_results_not_all_runs_complete( ) # Assert - mock_logger.info.assert_called_once_with(f'Run {run_name} is not complete yet. Exiting.') + mock_logger.warning.assert_called_once_with(f'Run {run_name} is not complete yet. 
Exiting.') mock_sdgym_results_explorer.assert_called_once_with( mock_output_destination_aws, aws_access_key_id=aws_access_key_id, From 4c411b887e7479855c3020155bdd126670554f85 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 24 Jul 2025 12:07:48 +0100 Subject: [PATCH 28/53] add unit test --- .github/workflows/run_benchmark.yml | 2 +- .../workflows/upload_benchmark_results.yml | 5 +- sdgym/_run_benchmark/__init__.py | 3 + sdgym/_run_benchmark/run_benchmark.py | 9 +- .../upload_benchmark_results.py | 2 +- tasks.py | 5 +- .../unit/_run_benchmark/test_run_benchmark.py | 102 +++++++++++++++++- .../test_upload_benchmark_result.py | 2 +- 8 files changed, 110 insertions(+), 20 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index 34505a0e..de4aaff1 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -25,7 +25,7 @@ jobs: python -m pip install invoke python -m pip install -e .[dev] - - name: SDGym Benchmark + - name: Run SDGym Benchmark env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index 21b038c6..e37f9dfd 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -1,9 +1,6 @@ name: Upload SDGym Benchmark results on: - push: - branches: - - issue-425-workflow-sdgym workflow_run: workflows: ["Run SDGym Benchmark"] types: @@ -29,7 +26,7 @@ jobs: python -m pip install invoke python -m pip install -e .[dev] - - name: SDGym Benchmark + - name: Upload SDGym Benchmark env: PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} diff --git a/sdgym/_run_benchmark/__init__.py b/sdgym/_run_benchmark/__init__.py index 150efadc..a764800d 100644 --- a/sdgym/_run_benchmark/__init__.py +++ b/sdgym/_run_benchmark/__init__.py @@ -1,6 +1,9 @@ """Folder for the SDGym benchmark module.""" +from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS + OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug' SLACK_CHANNEL = 'sdv-alerts' +SYNTHESIZERS = SDV_SINGLE_TABLE_SYNTHESIZERS diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/_run_benchmark/run_benchmark.py index 1f1166af..e5d29c01 100644 --- a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/_run_benchmark/run_benchmark.py @@ -2,13 +2,13 @@ import os from datetime import datetime, timezone +from botocore.exceptions import ClientError + import sdgym._run_benchmark as run_benchmark from sdgym._run_benchmark._utils import get_run_name from sdgym.benchmark import benchmark_single_table_aws from sdgym.s3 import get_s3_client, parse_s3_path -datasets = ['expedia_hotel_logs', 'fake_companies'] # DEFAULT_DATASETS - def append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str): s3_client = get_s3_client( @@ -21,7 +21,7 @@ def append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str): object = s3_client.get_object(Bucket=bucket, Key=f'{prefix}{key}') body = object['Body'].read().decode('utf-8') data = json.loads(body) - except s3_client.exceptions.ClientError as e: + except ClientError as e: if e.response['Error']['Code'] == 'NoSuchKey': data = {'runs': []} else: @@ -36,13 +36,12 @@ def main(): aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') 
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d') - for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: + for synthesizer in run_benchmark.SYNTHESIZERS: benchmark_single_table_aws( output_destination=run_benchmark.OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, synthesizers=[synthesizer], - sdv_datasets=datasets, compute_privacy_score=False, ) diff --git a/sdgym/_run_benchmark/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py index 710a7987..d9d782a7 100644 --- a/sdgym/_run_benchmark/upload_benchmark_results.py +++ b/sdgym/_run_benchmark/upload_benchmark_results.py @@ -82,7 +82,7 @@ def main(): aws_access_key_id, aws_secret_access_key ) if upload_already_done(s3_client, bucket, prefix, run_name): - LOGGER.info('Benchmark results have already been uploaded. Exiting.') + LOGGER.warning('Benchmark results have already been uploaded. Exiting.') sys.exit(0) upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix) diff --git a/tasks.py b/tasks.py index dadc65e9..a70ebb0f 100644 --- a/tasks.py +++ b/tasks.py @@ -208,8 +208,7 @@ def run_sdgym_benchmark(c): """Run the SDGym benchmark.""" c.run('python sdgym/_run_benchmark/run_benchmark.py') -@task(help={"date": "Benchmark date in YYYY-MM-DD format (default: today with day=01)"}) +@task def upload_benchmark_results(c, date=None): """Upload the benchmark results to S3.""" - date_arg = f"--date {date}" if date else "" - c.run(f'python sdgym/_run_benchmark/upload_benchmark_results.py {date_arg}') + c.run(f'python sdgym/_run_benchmark/upload_benchmark_results.py {date}') diff --git a/tests/unit/_run_benchmark/test_run_benchmark.py b/tests/unit/_run_benchmark/test_run_benchmark.py index 2792e179..5de6a3f7 100644 --- a/tests/unit/_run_benchmark/test_run_benchmark.py +++ b/tests/unit/_run_benchmark/test_run_benchmark.py @@ -1,8 +1,101 @@ +import json from datetime import datetime, timezone -from unittest.mock import call, patch +from unittest.mock import Mock, call, patch -from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS -from sdgym._run_benchmark.run_benchmark import main +from botocore.exceptions import ClientError + +from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, SYNTHESIZERS +from sdgym._run_benchmark.run_benchmark import append_benchmark_run, main + + +@patch('sdgym._run_benchmark.run_benchmark.get_s3_client') +@patch('sdgym._run_benchmark.run_benchmark.parse_s3_path') +@patch('sdgym._run_benchmark.run_benchmark.get_run_name') +def test_append_benchmark_run(mock_get_run_name, mock_parse_s3_path, mock_get_s3_client): + """Test the `append_benchmark_run` method.""" + # Setup + aws_access_key_id = 'my_access_key' + aws_secret_access_key = 'my_secret_key' + date = '2023-10-01' + mock_get_run_name.return_value = 'SDGym_results_10_01_2023' + mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') + mock_s3_client = Mock() + benchmark_date = { + 'runs': [ + {'date': '2023-09-30', 'run_name': 'SDGym_results_09_30_2023'}, + ] + } + mock_get_s3_client.return_value = mock_s3_client + mock_s3_client.get_object.return_value = { + 'Body': Mock(read=lambda: json.dumps(benchmark_date).encode('utf-8')) + } + expected_data = { + 'runs': [ + {'date': '2023-09-30', 'run_name': 'SDGym_results_09_30_2023'}, + {'date': date, 'run_name': 'SDGym_results_10_01_2023'}, + ] + } + + # Run + append_benchmark_run(aws_access_key_id, aws_secret_access_key, 
date) + + # Assert + mock_get_s3_client.assert_called_once_with( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS) + mock_get_run_name.assert_called_once_with(date) + mock_s3_client.get_object.assert_called_once_with( + Bucket='my-bucket', Key='my-prefix/_BENCHMARK_DATES.json' + ) + mock_s3_client.put_object.assert_called_once_with( + Bucket='my-bucket', + Key='my-prefix/_BENCHMARK_DATES.json', + Body=json.dumps(expected_data).encode('utf-8'), + ) + + +@patch('sdgym._run_benchmark.run_benchmark.get_s3_client') +@patch('sdgym._run_benchmark.run_benchmark.parse_s3_path') +@patch('sdgym._run_benchmark.run_benchmark.get_run_name') +def test_append_benchmark_run_new_file(mock_get_run_name, mock_parse_s3_path, mock_get_s3_client): + """Test the `append_benchmark_run` with a new file.""" + # Setup + aws_access_key_id = 'my_access_key' + aws_secret_access_key = 'my_secret_key' + date = '2023-10-01' + mock_get_run_name.return_value = 'SDGym_results_10_01_2023' + mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') + mock_s3_client = Mock() + mock_get_s3_client.return_value = mock_s3_client + mock_s3_client.get_object.side_effect = ClientError( + {'Error': {'Code': 'NoSuchKey'}}, 'GetObject' + ) + expected_data = { + 'runs': [ + {'date': date, 'run_name': 'SDGym_results_10_01_2023'}, + ] + } + + # Run + append_benchmark_run(aws_access_key_id, aws_secret_access_key, date) + + # Assert + mock_get_s3_client.assert_called_once_with( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS) + mock_get_run_name.assert_called_once_with(date) + mock_s3_client.get_object.assert_called_once_with( + Bucket='my-bucket', Key='my-prefix/_BENCHMARK_DATES.json' + ) + mock_s3_client.put_object.assert_called_once_with( + Bucket='my-bucket', + Key='my-prefix/_BENCHMARK_DATES.json', + Body=json.dumps(expected_data).encode('utf-8'), + ) @patch('sdgym._run_benchmark.run_benchmark.benchmark_single_table_aws') @@ -21,14 +114,13 @@ def test_main(mock_append_benchmark_run, mock_getenv, mock_benchmark_single_tabl mock_getenv.assert_any_call('AWS_ACCESS_KEY_ID') mock_getenv.assert_any_call('AWS_SECRET_ACCESS_KEY') expected_calls = [] - for synthesizer in ['GaussianCopulaSynthesizer', 'TVAESynthesizer']: + for synthesizer in SYNTHESIZERS: expected_calls.append( call( output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id='my_access_key', aws_secret_access_key='my_secret_key', synthesizers=[synthesizer], - sdv_datasets=['expedia_hotel_logs', 'fake_companies'], compute_privacy_score=False, ) ) diff --git a/tests/unit/_run_benchmark/test_upload_benchmark_result.py b/tests/unit/_run_benchmark/test_upload_benchmark_result.py index dcc5a755..ab196a6e 100644 --- a/tests/unit/_run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/_run_benchmark/test_upload_benchmark_result.py @@ -206,7 +206,7 @@ def test_main_already_upload( # Assert mock_get_run_name_and_s3_vars.assert_called_once_with('my_access_key', 'my_secret_key') - mock_logger.info.assert_called_once_with(expected_log_message) + mock_logger.warning.assert_called_once_with(expected_log_message) mock_upload_results.assert_not_called() From c8e30670ef33adbdba947742171425432b00c94c Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 30 Jul 2025 13:36:32 +0100 Subject: [PATCH 29/53] address comments --- .github/workflows/run_benchmark.yml | 1 - 
sdgym/_run_benchmark/__init__.py | 9 -- sdgym/_run_benchmark/_utils.py | 10 -- .../upload_benchmark_results.py | 92 ---------------- .../run_benchmark.py | 29 +++-- .../run_benchmark/upload_benchmark_results.py | 102 ++++++++++++++++++ sdgym/run_benchmark/utils.py | 22 ++++ tests/unit/_run_benchmark/test__utils.py | 14 --- tests/unit/run_benchmark/test__utils.py | 14 +++ .../test_run_benchmark.py | 44 ++++---- .../test_upload_benchmark_result.py | 85 +++++++++------ 11 files changed, 231 insertions(+), 191 deletions(-) delete mode 100644 sdgym/_run_benchmark/__init__.py delete mode 100644 sdgym/_run_benchmark/_utils.py delete mode 100644 sdgym/_run_benchmark/upload_benchmark_results.py rename sdgym/{_run_benchmark => run_benchmark}/run_benchmark.py (62%) create mode 100644 sdgym/run_benchmark/upload_benchmark_results.py create mode 100644 sdgym/run_benchmark/utils.py delete mode 100644 tests/unit/_run_benchmark/test__utils.py create mode 100644 tests/unit/run_benchmark/test__utils.py rename tests/unit/{_run_benchmark => run_benchmark}/test_run_benchmark.py (69%) rename tests/unit/{_run_benchmark => run_benchmark}/test_upload_benchmark_result.py (70%) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index de4aaff1..d27bfd49 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -22,7 +22,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install invoke python -m pip install -e .[dev] - name: Run SDGym Benchmark diff --git a/sdgym/_run_benchmark/__init__.py b/sdgym/_run_benchmark/__init__.py deleted file mode 100644 index a764800d..00000000 --- a/sdgym/_run_benchmark/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Folder for the SDGym benchmark module.""" - -from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS - -OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' -UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' -DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug' -SLACK_CHANNEL = 'sdv-alerts' -SYNTHESIZERS = SDV_SINGLE_TABLE_SYNTHESIZERS diff --git a/sdgym/_run_benchmark/_utils.py b/sdgym/_run_benchmark/_utils.py deleted file mode 100644 index 7c73fef7..00000000 --- a/sdgym/_run_benchmark/_utils.py +++ /dev/null @@ -1,10 +0,0 @@ -from datetime import datetime - - -def get_run_name(date_str): - try: - date = datetime.strptime(date_str, '%Y-%m-%d') - except ValueError: - raise ValueError(f'Invalid date format: {date_str}. 
Expected YYYY-MM-DD.') - - return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' diff --git a/sdgym/_run_benchmark/upload_benchmark_results.py b/sdgym/_run_benchmark/upload_benchmark_results.py deleted file mode 100644 index d9d782a7..00000000 --- a/sdgym/_run_benchmark/upload_benchmark_results.py +++ /dev/null @@ -1,92 +0,0 @@ -import json -import logging -import os -import sys - -import boto3 -from botocore.exceptions import ClientError - -from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS -from sdgym.result_writer import S3ResultsWriter -from sdgym.s3 import S3_REGION, parse_s3_path -from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer - -LOGGER = logging.getLogger(__name__) - - -def get_latest_run_from_file(s3_client, bucket, key): - try: - object = s3_client.get_object(Bucket=bucket, Key=key) - body = object['Body'].read().decode('utf-8') - data = json.loads(body) - latest = sorted(data['runs'], key=lambda x: x['date'])[-1] - return latest['run_name'] - except s3_client.exceptions.ClientError as e: - raise RuntimeError(f'Failed to read {key} from S3: {e}') - - -def write_uploaded_marker(s3_client, bucket, prefix, run_name): - s3_client.put_object( - Bucket=bucket, Key=f'{prefix}{run_name}/upload_complete.marker', Body=b'Upload complete' - ) - - -def upload_already_done(s3_client, bucket, prefix, run_name): - try: - s3_client.head_object(Bucket=bucket, Key=f'{prefix}{run_name}/upload_complete.marker') - return True - except ClientError as e: - if e.response['Error']['Code'] == '404': - return False - - raise - - -def get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key): - bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) - s3_client = boto3.client( - 's3', - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - region_name=S3_REGION, - ) - run_name = get_latest_run_from_file(s3_client, bucket, f'{prefix}_BENCHMARK_DATES.json') - - return run_name, s3_client, bucket, prefix - - -def upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix): - result_explorer = SDGymResultsExplorer( - OUTPUT_DESTINATION_AWS, - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - ) - result_writer = S3ResultsWriter(s3_client) - - if not result_explorer.all_runs_complete(run_name): - LOGGER.warning(f'Run {run_name} is not complete yet. Exiting.') - sys.exit(0) - - LOGGER.info(f'Run {run_name} is complete! Proceeding with summarization...') - summary, _ = result_explorer.summarize(run_name) - result_writer.write_dataframe( - summary, f'{OUTPUT_DESTINATION_AWS}{run_name}/{run_name}_summary.csv', index=True - ) - write_uploaded_marker(s3_client, bucket, prefix, run_name) - - -def main(): - aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') - aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') - run_name, s3_client, bucket, prefix = get_run_name_and_s3_vars( - aws_access_key_id, aws_secret_access_key - ) - if upload_already_done(s3_client, bucket, prefix, run_name): - LOGGER.warning('Benchmark results have already been uploaded. 
Exiting.') - sys.exit(0) - - upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix) - - -if __name__ == '__main__': - main() diff --git a/sdgym/_run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py similarity index 62% rename from sdgym/_run_benchmark/run_benchmark.py rename to sdgym/run_benchmark/run_benchmark.py index e5d29c01..2a4833a6 100644 --- a/sdgym/_run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -1,44 +1,53 @@ +"""Script to run a benchmark and upload results to S3.""" + import json import os from datetime import datetime, timezone from botocore.exceptions import ClientError -import sdgym._run_benchmark as run_benchmark -from sdgym._run_benchmark._utils import get_run_name from sdgym.benchmark import benchmark_single_table_aws +from sdgym.run_benchmark.utils import ( + KEY_DATE_FILE, + OUTPUT_DESTINATION_AWS, + SYNTHESIZERS, + get_result_folder_name, +) from sdgym.s3 import get_s3_client, parse_s3_path def append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str): + """Append a new benchmark run to the benchmark dates file in S3.""" s3_client = get_s3_client( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, ) - bucket, prefix = parse_s3_path(run_benchmark.OUTPUT_DESTINATION_AWS) - key = '_BENCHMARK_DATES.json' + bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) try: - object = s3_client.get_object(Bucket=bucket, Key=f'{prefix}{key}') + object = s3_client.get_object(Bucket=bucket, Key=f'{prefix}{KEY_DATE_FILE}') body = object['Body'].read().decode('utf-8') data = json.loads(body) except ClientError as e: if e.response['Error']['Code'] == 'NoSuchKey': data = {'runs': []} else: - raise RuntimeError(f'Failed to read {key} from S3: {e}') + raise RuntimeError(f'Failed to read {KEY_DATE_FILE} from S3: {e}') - data['runs'].append({'date': date_str, 'run_name': get_run_name(date_str)}) + data['runs'].append({'date': date_str, 'folder_name': get_result_folder_name(date_str)}) data['runs'] = sorted(data['runs'], key=lambda x: x['date']) - s3_client.put_object(Bucket=bucket, Key=f'{prefix}{key}', Body=json.dumps(data).encode('utf-8')) + s3_client.put_object( + Bucket=bucket, Key=f'{prefix}{KEY_DATE_FILE}', Body=json.dumps(data).encode('utf-8') + ) def main(): + """Main function to run the benchmark and upload results.""" aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d') - for synthesizer in run_benchmark.SYNTHESIZERS: + for synthesizer in SYNTHESIZERS: benchmark_single_table_aws( - output_destination=run_benchmark.OUTPUT_DESTINATION_AWS, + output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, synthesizers=[synthesizer], diff --git a/sdgym/run_benchmark/upload_benchmark_results.py b/sdgym/run_benchmark/upload_benchmark_results.py new file mode 100644 index 00000000..e61b8d0a --- /dev/null +++ b/sdgym/run_benchmark/upload_benchmark_results.py @@ -0,0 +1,102 @@ +"""Script to upload benchmark results to S3.""" + +import json +import logging +import os +import sys + +import boto3 +from botocore.exceptions import ClientError + +from sdgym.result_writer import S3ResultsWriter +from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS +from sdgym.s3 import S3_REGION, parse_s3_path +from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer + +LOGGER = 
logging.getLogger(__name__) + + +def get_latest_run_from_file(s3_client, bucket, key): + """Get the latest run folder name from the benchmark dates file in S3.""" + try: + object = s3_client.get_object(Bucket=bucket, Key=key) + body = object['Body'].read().decode('utf-8') + data = json.loads(body) + latest = sorted(data['runs'], key=lambda x: x['date'])[-1] + return latest['folder_name'] + except s3_client.exceptions.ClientError as e: + raise RuntimeError(f'Failed to read {key} from S3: {e}') + + +def write_uploaded_marker(s3_client, bucket, prefix, folder_name): + """Write a marker file to indicate that the upload is complete.""" + s3_client.put_object( + Bucket=bucket, Key=f'{prefix}{folder_name}/upload_complete.marker', Body=b'Upload complete' + ) + + +def upload_already_done(s3_client, bucket, prefix, folder_name): + """Check if the upload has already been done by looking for the marker file.""" + try: + s3_client.head_object(Bucket=bucket, Key=f'{prefix}{folder_name}/upload_complete.marker') + return True + except ClientError as e: + if e.response['Error']['Code'] == '404': + return False + + raise + + +def get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key): + """Get the result folder name and S3 client variables.""" + bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) + s3_client = boto3.client( + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=S3_REGION, + ) + folder_name = get_latest_run_from_file(s3_client, bucket, f'{prefix}_BENCHMARK_DATES.json') + + return folder_name, s3_client, bucket, prefix + + +def upload_results( + aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix +): + """Upload benchmark results to S3.""" + result_explorer = SDGymResultsExplorer( + OUTPUT_DESTINATION_AWS, + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + ) + result_writer = S3ResultsWriter(s3_client) + + if not result_explorer.all_runs_complete(folder_name): + LOGGER.warning(f'Run {folder_name} is not complete yet. Exiting.') + sys.exit(0) + + LOGGER.info(f'Run {folder_name} is complete! Proceeding with summarization...') + summary, _ = result_explorer.summarize(folder_name) + result_writer.write_dataframe( + summary, f'{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary.csv', index=True + ) + write_uploaded_marker(s3_client, bucket, prefix, folder_name) + + +def main(): + """Main function to upload benchmark results.""" + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') + folder_name, s3_client, bucket, prefix = get_result_folder_name_and_s3_vars( + aws_access_key_id, aws_secret_access_key + ) + if upload_already_done(s3_client, bucket, prefix, folder_name): + LOGGER.warning('Benchmark results have already been uploaded. 
Exiting.') + sys.exit(0) + + upload_results(aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix) + + +if __name__ == '__main__': + main() diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py new file mode 100644 index 00000000..33242199 --- /dev/null +++ b/sdgym/run_benchmark/utils.py @@ -0,0 +1,22 @@ +"""Utils file for the run_benchmark module.""" + +from datetime import datetime + +from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS + +OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' +UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' +DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug' +SLACK_CHANNEL = 'sdv-alerts' +KEY_DATE_FILE = '_BENCHMARK_DATES.json' +SYNTHESIZERS = SDV_SINGLE_TABLE_SYNTHESIZERS + + +def get_result_folder_name(date_str): + """Get the result folder name based on the date string.""" + try: + date = datetime.strptime(date_str, '%Y-%m-%d') + except ValueError: + raise ValueError(f'Invalid date format: {date_str}. Expected YYYY-MM-DD.') + + return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' diff --git a/tests/unit/_run_benchmark/test__utils.py b/tests/unit/_run_benchmark/test__utils.py deleted file mode 100644 index b11ed59b..00000000 --- a/tests/unit/_run_benchmark/test__utils.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from sdgym._run_benchmark._utils import get_run_name - - -def test_get_run_name(): - """Test the `get_run_name` method.""" - # Setup - expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.' - - # Run and Assert - assert get_run_name('2023-10-01') == 'SDGym_results_10_01_2023' - with pytest.raises(ValueError, match=expected_error_message): - get_run_name('invalid-date') diff --git a/tests/unit/run_benchmark/test__utils.py b/tests/unit/run_benchmark/test__utils.py new file mode 100644 index 00000000..8ec5f6b4 --- /dev/null +++ b/tests/unit/run_benchmark/test__utils.py @@ -0,0 +1,14 @@ +import pytest + +from sdgym.run_benchmark.utils import get_result_folder_name + + +def test_get_result_folder_name(): + """Test the `get_result_folder_name` method.""" + # Setup + expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.' 
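+    # A valid date resolves to a folder name like 'SDGym_results_10_01_2023'.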
+ + # Run and Assert + assert get_result_folder_name('2023-10-01') == 'SDGym_results_10_01_2023' + with pytest.raises(ValueError, match=expected_error_message): + get_result_folder_name('invalid-date') diff --git a/tests/unit/_run_benchmark/test_run_benchmark.py b/tests/unit/run_benchmark/test_run_benchmark.py similarity index 69% rename from tests/unit/_run_benchmark/test_run_benchmark.py rename to tests/unit/run_benchmark/test_run_benchmark.py index 5de6a3f7..93a733cb 100644 --- a/tests/unit/_run_benchmark/test_run_benchmark.py +++ b/tests/unit/run_benchmark/test_run_benchmark.py @@ -4,25 +4,25 @@ from botocore.exceptions import ClientError -from sdgym._run_benchmark import OUTPUT_DESTINATION_AWS, SYNTHESIZERS -from sdgym._run_benchmark.run_benchmark import append_benchmark_run, main +from sdgym.run_benchmark.run_benchmark import append_benchmark_run, main +from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, SYNTHESIZERS -@patch('sdgym._run_benchmark.run_benchmark.get_s3_client') -@patch('sdgym._run_benchmark.run_benchmark.parse_s3_path') -@patch('sdgym._run_benchmark.run_benchmark.get_run_name') -def test_append_benchmark_run(mock_get_run_name, mock_parse_s3_path, mock_get_s3_client): +@patch('sdgym.run_benchmark.run_benchmark.get_s3_client') +@patch('sdgym.run_benchmark.run_benchmark.parse_s3_path') +@patch('sdgym.run_benchmark.run_benchmark.get_result_folder_name') +def test_append_benchmark_run(mock_get_result_folder_name, mock_parse_s3_path, mock_get_s3_client): """Test the `append_benchmark_run` method.""" # Setup aws_access_key_id = 'my_access_key' aws_secret_access_key = 'my_secret_key' date = '2023-10-01' - mock_get_run_name.return_value = 'SDGym_results_10_01_2023' + mock_get_result_folder_name.return_value = 'SDGym_results_10_01_2023' mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') mock_s3_client = Mock() benchmark_date = { 'runs': [ - {'date': '2023-09-30', 'run_name': 'SDGym_results_09_30_2023'}, + {'date': '2023-09-30', 'folder_name': 'SDGym_results_09_30_2023'}, ] } mock_get_s3_client.return_value = mock_s3_client @@ -31,8 +31,8 @@ def test_append_benchmark_run(mock_get_run_name, mock_parse_s3_path, mock_get_s3 } expected_data = { 'runs': [ - {'date': '2023-09-30', 'run_name': 'SDGym_results_09_30_2023'}, - {'date': date, 'run_name': 'SDGym_results_10_01_2023'}, + {'date': '2023-09-30', 'folder_name': 'SDGym_results_09_30_2023'}, + {'date': date, 'folder_name': 'SDGym_results_10_01_2023'}, ] } @@ -45,7 +45,7 @@ def test_append_benchmark_run(mock_get_run_name, mock_parse_s3_path, mock_get_s3 aws_secret_access_key=aws_secret_access_key, ) mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS) - mock_get_run_name.assert_called_once_with(date) + mock_get_result_folder_name.assert_called_once_with(date) mock_s3_client.get_object.assert_called_once_with( Bucket='my-bucket', Key='my-prefix/_BENCHMARK_DATES.json' ) @@ -56,16 +56,18 @@ def test_append_benchmark_run(mock_get_run_name, mock_parse_s3_path, mock_get_s3 ) -@patch('sdgym._run_benchmark.run_benchmark.get_s3_client') -@patch('sdgym._run_benchmark.run_benchmark.parse_s3_path') -@patch('sdgym._run_benchmark.run_benchmark.get_run_name') -def test_append_benchmark_run_new_file(mock_get_run_name, mock_parse_s3_path, mock_get_s3_client): +@patch('sdgym.run_benchmark.run_benchmark.get_s3_client') +@patch('sdgym.run_benchmark.run_benchmark.parse_s3_path') +@patch('sdgym.run_benchmark.run_benchmark.get_result_folder_name') +def test_append_benchmark_run_new_file( + mock_get_result_folder_name, 
mock_parse_s3_path, mock_get_s3_client +): """Test the `append_benchmark_run` with a new file.""" # Setup aws_access_key_id = 'my_access_key' aws_secret_access_key = 'my_secret_key' date = '2023-10-01' - mock_get_run_name.return_value = 'SDGym_results_10_01_2023' + mock_get_result_folder_name.return_value = 'SDGym_results_10_01_2023' mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') mock_s3_client = Mock() mock_get_s3_client.return_value = mock_s3_client @@ -74,7 +76,7 @@ def test_append_benchmark_run_new_file(mock_get_run_name, mock_parse_s3_path, mo ) expected_data = { 'runs': [ - {'date': date, 'run_name': 'SDGym_results_10_01_2023'}, + {'date': date, 'folder_name': 'SDGym_results_10_01_2023'}, ] } @@ -87,7 +89,7 @@ def test_append_benchmark_run_new_file(mock_get_run_name, mock_parse_s3_path, mo aws_secret_access_key=aws_secret_access_key, ) mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS) - mock_get_run_name.assert_called_once_with(date) + mock_get_result_folder_name.assert_called_once_with(date) mock_s3_client.get_object.assert_called_once_with( Bucket='my-bucket', Key='my-prefix/_BENCHMARK_DATES.json' ) @@ -98,9 +100,9 @@ def test_append_benchmark_run_new_file(mock_get_run_name, mock_parse_s3_path, mo ) -@patch('sdgym._run_benchmark.run_benchmark.benchmark_single_table_aws') -@patch('sdgym._run_benchmark.run_benchmark.os.getenv') -@patch('sdgym._run_benchmark.run_benchmark.append_benchmark_run') +@patch('sdgym.run_benchmark.run_benchmark.benchmark_single_table_aws') +@patch('sdgym.run_benchmark.run_benchmark.os.getenv') +@patch('sdgym.run_benchmark.run_benchmark.append_benchmark_run') def test_main(mock_append_benchmark_run, mock_getenv, mock_benchmark_single_table_aws): """Test the `main` method.""" # Setup diff --git a/tests/unit/_run_benchmark/test_upload_benchmark_result.py b/tests/unit/run_benchmark/test_upload_benchmark_result.py similarity index 70% rename from tests/unit/_run_benchmark/test_upload_benchmark_result.py rename to tests/unit/run_benchmark/test_upload_benchmark_result.py index ab196a6e..92757a84 100644 --- a/tests/unit/_run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/run_benchmark/test_upload_benchmark_result.py @@ -3,8 +3,8 @@ import pytest from botocore.exceptions import ClientError -from sdgym._run_benchmark.upload_benchmark_results import ( - get_run_name_and_s3_vars, +from sdgym.run_benchmark.upload_benchmark_results import ( + get_result_folder_name_and_s3_vars, main, upload_already_done, upload_results, @@ -60,17 +60,17 @@ def test_upload_already_done(): assert result_false is False -@patch('sdgym._run_benchmark.upload_benchmark_results.boto3.client') -@patch('sdgym._run_benchmark.upload_benchmark_results.parse_s3_path') -@patch('sdgym._run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') -@patch('sdgym._run_benchmark.upload_benchmark_results.get_latest_run_from_file') -def test_get_run_name_and_s3_vars( +@patch('sdgym.run_benchmark.upload_benchmark_results.boto3.client') +@patch('sdgym.run_benchmark.upload_benchmark_results.parse_s3_path') +@patch('sdgym.run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') +@patch('sdgym.run_benchmark.upload_benchmark_results.get_latest_run_from_file') +def test_get_result_folder_name_and_s3_vars( mock_get_latest_run_from_file, mock_output_destination_aws, mock_parse_s3_path, mock_boto_client, ): - """Test the `get_run_name_and_s3_vars` method.""" + """Test the `get_result_folder_name_and_s3_vars` method.""" # Setup aws_access_key_id = 'my_access_key' 
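     # The helper under test returns the tuple (folder_name, s3_client, bucket, prefix).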
aws_secret_access_key = 'my_secret_key' @@ -80,7 +80,7 @@ def test_get_run_name_and_s3_vars( mock_get_latest_run_from_file.return_value = 'SDGym_results_10_01_2023' # Run - result = get_run_name_and_s3_vars(aws_access_key_id, aws_secret_access_key) + result = get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key) # Assert assert result == expected_result @@ -96,11 +96,11 @@ def test_get_run_name_and_s3_vars( ) -@patch('sdgym._run_benchmark.upload_benchmark_results.SDGymResultsExplorer') -@patch('sdgym._run_benchmark.upload_benchmark_results.S3ResultsWriter') -@patch('sdgym._run_benchmark.upload_benchmark_results.write_uploaded_marker') -@patch('sdgym._run_benchmark.upload_benchmark_results.LOGGER') -@patch('sdgym._run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') +@patch('sdgym.run_benchmark.upload_benchmark_results.SDGymResultsExplorer') +@patch('sdgym.run_benchmark.upload_benchmark_results.S3ResultsWriter') +@patch('sdgym.run_benchmark.upload_benchmark_results.write_uploaded_marker') +@patch('sdgym.run_benchmark.upload_benchmark_results.LOGGER') +@patch('sdgym.run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') def test_upload_results( mock_output_destination_aws, mock_logger, @@ -138,11 +138,11 @@ def test_upload_results( mock_write_uploaded_marker.assert_called_once_with(s3_client, bucket, prefix, run_name) -@patch('sdgym._run_benchmark.upload_benchmark_results.SDGymResultsExplorer') -@patch('sdgym._run_benchmark.upload_benchmark_results.S3ResultsWriter') -@patch('sdgym._run_benchmark.upload_benchmark_results.write_uploaded_marker') -@patch('sdgym._run_benchmark.upload_benchmark_results.LOGGER') -@patch('sdgym._run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') +@patch('sdgym.run_benchmark.upload_benchmark_results.SDGymResultsExplorer') +@patch('sdgym.run_benchmark.upload_benchmark_results.S3ResultsWriter') +@patch('sdgym.run_benchmark.upload_benchmark_results.write_uploaded_marker') +@patch('sdgym.run_benchmark.upload_benchmark_results.LOGGER') +@patch('sdgym.run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') def test_upload_results_not_all_runs_complete( mock_output_destination_aws, mock_logger, @@ -181,22 +181,27 @@ def test_upload_results_not_all_runs_complete( mock_write_uploaded_marker.assert_not_called() -@patch('sdgym._run_benchmark.upload_benchmark_results.get_run_name_and_s3_vars') -@patch('sdgym._run_benchmark.upload_benchmark_results.upload_results') -@patch('sdgym._run_benchmark.upload_benchmark_results.upload_already_done') -@patch('sdgym._run_benchmark.upload_benchmark_results.LOGGER') -@patch('sdgym._run_benchmark.upload_benchmark_results.os.getenv') +@patch('sdgym.run_benchmark.upload_benchmark_results.get_result_folder_name_and_s3_vars') +@patch('sdgym.run_benchmark.upload_benchmark_results.upload_results') +@patch('sdgym.run_benchmark.upload_benchmark_results.upload_already_done') +@patch('sdgym.run_benchmark.upload_benchmark_results.LOGGER') +@patch('sdgym.run_benchmark.upload_benchmark_results.os.getenv') def test_main_already_upload( mock_getenv, mock_logger, mock_upload_already_done, mock_upload_results, - mock_get_run_name_and_s3_vars, + mock_get_result_folder_name_and_s3_vars, ): """Test the `method` when results are already uploaded.""" # Setup mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] - mock_get_run_name_and_s3_vars.return_value = ('run_name', 's3_client', 'bucket', 'prefix') + mock_get_result_folder_name_and_s3_vars.return_value = ( + 'run_name', + 's3_client', + 
'bucket',
+        'prefix',
+    )
     mock_upload_already_done.return_value = True
     expected_log_message = 'Benchmark results have already been uploaded. Exiting.'
 
@@ -205,29 +210,41 @@
     main()
 
     # Assert
-    mock_get_run_name_and_s3_vars.assert_called_once_with('my_access_key', 'my_secret_key')
+    mock_get_result_folder_name_and_s3_vars.assert_called_once_with(
+        'my_access_key', 'my_secret_key'
+    )
     mock_logger.warning.assert_called_once_with(expected_log_message)
     mock_upload_results.assert_not_called()
 
 
-@patch('sdgym._run_benchmark.upload_benchmark_results.get_run_name_and_s3_vars')
-@patch('sdgym._run_benchmark.upload_benchmark_results.upload_results')
-@patch('sdgym._run_benchmark.upload_benchmark_results.upload_already_done')
-@patch('sdgym._run_benchmark.upload_benchmark_results.os.getenv')
+@patch('sdgym.run_benchmark.upload_benchmark_results.get_result_folder_name_and_s3_vars')
+@patch('sdgym.run_benchmark.upload_benchmark_results.upload_results')
+@patch('sdgym.run_benchmark.upload_benchmark_results.upload_already_done')
+@patch('sdgym.run_benchmark.upload_benchmark_results.os.getenv')
 def test_main(
-    mock_getenv, mock_upload_already_done, mock_upload_results, mock_get_run_name_and_s3_vars
+    mock_getenv,
+    mock_upload_already_done,
+    mock_upload_results,
+    mock_get_result_folder_name_and_s3_vars,
 ):
     """Test the `main` method."""
     # Setup
     mock_getenv.side_effect = ['my_access_key', 'my_secret_key']
-    mock_get_run_name_and_s3_vars.return_value = ('run_name', 's3_client', 'bucket', 'prefix')
+    mock_get_result_folder_name_and_s3_vars.return_value = (
+        'run_name',
+        's3_client',
+        'bucket',
+        'prefix',
+    )
     mock_upload_already_done.return_value = False
 
     # Run
     main()
 
     # Assert
-    mock_get_run_name_and_s3_vars.assert_called_once_with('my_access_key', 'my_secret_key')
+    mock_get_result_folder_name_and_s3_vars.assert_called_once_with(
+        'my_access_key', 'my_secret_key'
+    )
     mock_upload_already_done.assert_called_once_with('s3_client', 'bucket', 'prefix', 'run_name')
     mock_upload_results.assert_called_once_with(
        'my_access_key', 'my_secret_key', 'run_name', 's3_client', 'bucket', 'prefix'
     )

From e8eb5bee62878ae5cb22f0addba8aa7b1740fd0c Mon Sep 17 00:00:00 2001
From: R-Palazzo
Date: Wed, 30 Jul 2025 14:38:54 +0100
Subject: [PATCH 30/53] def slack 1

---
 sdgym/run_benchmark/run_benchmark.py |  9 +++--
 sdgym/run_benchmark/utils.py         | 51 ++++++++++++++++++++++++++--
 tasks.py                             |  4 +--
 3 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py
index 2a4833a6..22d630c9 100644
--- a/sdgym/run_benchmark/run_benchmark.py
+++ b/sdgym/run_benchmark/run_benchmark.py
@@ -10,8 +10,9 @@
 from sdgym.run_benchmark.utils import (
     KEY_DATE_FILE,
     OUTPUT_DESTINATION_AWS,
-    SYNTHESIZERS,
+    SYNTHESIZERS_SPLIT,
     get_result_folder_name,
+    post_benchmark_launch_message,
 )
 from sdgym.s3 import get_s3_client, parse_s3_path
 
@@ -45,16 +46,18 @@ def main():
     aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
     aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
     date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d')
-    for synthesizer in SYNTHESIZERS:
+    for synthesizer_group in SYNTHESIZERS_SPLIT[:2]:
         benchmark_single_table_aws(
             output_destination=OUTPUT_DESTINATION_AWS,
+            dataset=['expedia_hotel_logs', 'fake_companies'],
             aws_access_key_id=aws_access_key_id,
             aws_secret_access_key=aws_secret_access_key,
-            synthesizers=[synthesizer],
+            synthesizers=synthesizer_group,
             compute_privacy_score=False,
         )
 
     append_benchmark_run(aws_access_key_id, 
aws_secret_access_key, date_str) + post_benchmark_launch_message() if __name__ == '__main__': diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py index 33242199..f68f0507 100644 --- a/sdgym/run_benchmark/utils.py +++ b/sdgym/run_benchmark/utils.py @@ -1,15 +1,24 @@ """Utils file for the run_benchmark module.""" +import os from datetime import datetime -from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS +from slack_sdk import WebClient OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug' SLACK_CHANNEL = 'sdv-alerts' KEY_DATE_FILE = '_BENCHMARK_DATES.json' -SYNTHESIZERS = SDV_SINGLE_TABLE_SYNTHESIZERS + +# The synthesizers inside the same list will be run by the same ec2 instance +SYNTHESIZERS_SPLIT = [ + ['UniformSynthesizer', 'ColumnSynthesizer', 'GaussianCopulaSynthesizer'], + ['TVAESynthesizer'], + ['CopulaGANSynthesizer'], + ['CTGANSynthesizer'], + ['RealTabFormerSynthesizer'], +] def get_result_folder_name(date_str): @@ -20,3 +29,41 @@ def get_result_folder_name(date_str): raise ValueError(f'Invalid date format: {date_str}. Expected YYYY-MM-DD.') return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' + + +def _get_slack_client(): + """Create an authenticated Slack client. + + Returns: + WebClient: + An authenticated Slack WebClient instance. + """ + token = os.getenv('SLACK_TOKEN') + client = WebClient(token=token) + return client + + +def post_slack_message(channel, text): + """Post a message to a Slack channel.""" + client = _get_slack_client() + client.chat_postMessage(channel=channel, text=text) + + +def post_benchmark_launch_message(): + """Post a message to the SDV Alerts Slack channel when the benchmark is launched.""" + channel = DEBUG_SLACK_CHANNEL + body = 'SDGym benchmark has been launched! Results will be available soon.' 
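+    # post_slack_message (defined above) boils down to:
+    #     WebClient(token=os.getenv('SLACK_TOKEN')).chat_postMessage(channel=channel, text=body)
+    # so a valid SLACK_TOKEN must be present in the environment.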
+ post_slack_message(channel, body) + + +def post_run_summary(folder_name): + """Post run summary to sdv-alerts slack channel.""" + channel = DEBUG_SLACK_CHANNEL + body = '' + body += f'SDGym benchmark results for {folder_name} are available!\n' + body += ( + f'Check the results <{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary' + '.csv|here>.\n' + ) + + post_slack_message(channel, body) diff --git a/tasks.py b/tasks.py index a70ebb0f..d2ec8f52 100644 --- a/tasks.py +++ b/tasks.py @@ -206,9 +206,9 @@ def rmdir(c, path): @task def run_sdgym_benchmark(c): """Run the SDGym benchmark.""" - c.run('python sdgym/_run_benchmark/run_benchmark.py') + c.run('python sdgym/run_benchmark/run_benchmark.py') @task def upload_benchmark_results(c, date=None): """Upload the benchmark results to S3.""" - c.run(f'python sdgym/_run_benchmark/upload_benchmark_results.py {date}') + c.run(f'python sdgym/run_benchmark/upload_benchmark_results.py {date}') From a96fe88f291b2106cfc1bab6bd1fd68ede5d4db5 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 30 Jul 2025 14:46:22 +0100 Subject: [PATCH 31/53] pyproject slack sdk --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7946e0ab..f580b20b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ test = [ 'pytest-cov>=2.6.0', 'jupyter>=1.0.0,<2', 'tomli>=2.0.0,<3', + 'slack-sdk>=3.23,<4.0' ] dev = [ 'sdgym[dask, test]', From d7fe8bf435b566fb1279b1fa69aba9b90e5cc26c Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 30 Jul 2025 15:27:34 +0100 Subject: [PATCH 32/53] fix parameter name --- sdgym/run_benchmark/run_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py index 22d630c9..09ec5e2c 100644 --- a/sdgym/run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -49,7 +49,7 @@ def main(): for synthesizer_group in SYNTHESIZERS_SPLIT[:2]: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, - dataset=['expedia_hotel_logs', 'fake_companies'], + sdv_datasets=['expedia_hotel_logs', 'fake_companies'], aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, synthesizers=synthesizer_group, From c00d200dc28db103100d09d1cb6934f15bbbbaab Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Wed, 30 Jul 2025 16:07:44 +0100 Subject: [PATCH 33/53] add token --- .github/workflows/run_benchmark.yml | 1 + .github/workflows/upload_benchmark_results.yml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index d27bfd49..78b4402f 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -26,6 +26,7 @@ jobs: - name: Run SDGym Benchmark env: + SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index e37f9dfd..195de346 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -28,7 +28,6 @@ jobs: - name: Upload SDGym Benchmark env: - PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ 
secrets.AWS_SECRET_ACCESS_KEY }}

From 9f7fe605c627e24f7979f180fb9693aae7b4496d Mon Sep 17 00:00:00 2001
From: R-Palazzo
Date: Wed, 30 Jul 2025 16:49:51 +0100
Subject: [PATCH 34/53] update slack message

---
 sdgym/run_benchmark/run_benchmark.py           |  2 +-
 sdgym/run_benchmark/utils.py                   | 17 +++++++++++------
 tests/unit/run_benchmark/test_run_benchmark.py | 16 ++++++++++++----
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py
index 09ec5e2c..5a8c9a05 100644
--- a/sdgym/run_benchmark/run_benchmark.py
+++ b/sdgym/run_benchmark/run_benchmark.py
@@ -57,7 +57,7 @@ def main():
         )
 
     append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str)
-    post_benchmark_launch_message()
+    post_benchmark_launch_message(date_str)
 
 
 if __name__ == '__main__':
diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py
index f68f0507..4ec85078 100644
--- a/sdgym/run_benchmark/utils.py
+++ b/sdgym/run_benchmark/utils.py
@@ -15,11 +15,14 @@
 SYNTHESIZERS_SPLIT = [
     ['UniformSynthesizer', 'ColumnSynthesizer', 'GaussianCopulaSynthesizer'],
     ['TVAESynthesizer'],
-    ['CopulaGANSynthesizer'],
-    ['CTGANSynthesizer'],
-    ['RealTabFormerSynthesizer'],
 ]
 
+# Excluded from the current runs; kept for reference:
+# ['CopulaGANSynthesizer'],
+# ['CTGANSynthesizer'],
+# ['RealTabFormerSynthesizer'],
+# Re-add them to SYNTHESIZERS_SPLIT above to run them again.
+
 
 def get_result_folder_name(date_str):
@@ -49,10 +52,12 @@ def post_slack_message(channel, text):
     client.chat_postMessage(channel=channel, text=text)
 
 
-def post_benchmark_launch_message():
+def post_benchmark_launch_message(date_str):
     """Post a message to the SDV Alerts Slack channel when the benchmark is launched."""
     channel = DEBUG_SLACK_CHANNEL
-    body = 'SDGym benchmark has been launched! Results will be available soon.'
+    body = 'πŸƒ SDGym benchmark has been launched! EC2 Instances are running. 
' + body += 'Intermediate results can be found ' + body += f'<{OUTPUT_DESTINATION_AWS}{get_result_folder_name(date_str)} |here>.\n' post_slack_message(channel, body) @@ -60,7 +65,7 @@ def post_run_summary(folder_name): """Post run summary to sdv-alerts slack channel.""" channel = DEBUG_SLACK_CHANNEL body = '' - body += f'SDGym benchmark results for {folder_name} are available!\n' + body += f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for {folder_name} are available!πŸ‹οΈβ€β™€οΈ \n' body += ( f'Check the results <{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary' '.csv|here>.\n' diff --git a/tests/unit/run_benchmark/test_run_benchmark.py b/tests/unit/run_benchmark/test_run_benchmark.py index 93a733cb..ffa5211a 100644 --- a/tests/unit/run_benchmark/test_run_benchmark.py +++ b/tests/unit/run_benchmark/test_run_benchmark.py @@ -5,7 +5,7 @@ from botocore.exceptions import ClientError from sdgym.run_benchmark.run_benchmark import append_benchmark_run, main -from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, SYNTHESIZERS +from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, SYNTHESIZERS_SPLIT @patch('sdgym.run_benchmark.run_benchmark.get_s3_client') @@ -103,7 +103,13 @@ def test_append_benchmark_run_new_file( @patch('sdgym.run_benchmark.run_benchmark.benchmark_single_table_aws') @patch('sdgym.run_benchmark.run_benchmark.os.getenv') @patch('sdgym.run_benchmark.run_benchmark.append_benchmark_run') -def test_main(mock_append_benchmark_run, mock_getenv, mock_benchmark_single_table_aws): +@patch('sdgym.run_benchmark.run_benchmark.post_benchmark_launch_message') +def test_main( + mock_post_benchmark_launch_message, + mock_append_benchmark_run, + mock_getenv, + mock_benchmark_single_table_aws, +): """Test the `main` method.""" # Setup mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] @@ -116,13 +122,14 @@ def test_main(mock_append_benchmark_run, mock_getenv, mock_benchmark_single_tabl mock_getenv.assert_any_call('AWS_ACCESS_KEY_ID') mock_getenv.assert_any_call('AWS_SECRET_ACCESS_KEY') expected_calls = [] - for synthesizer in SYNTHESIZERS: + for synthesizer in SYNTHESIZERS_SPLIT: expected_calls.append( call( output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id='my_access_key', aws_secret_access_key='my_secret_key', - synthesizers=[synthesizer], + synthesizers=synthesizer, + sdv_datasets=['expedia_hotel_logs', 'fake_companies'], compute_privacy_score=False, ) ) @@ -133,3 +140,4 @@ def test_main(mock_append_benchmark_run, mock_getenv, mock_benchmark_single_tabl 'my_secret_key', date, ) + mock_post_benchmark_launch_message.assert_called_once_with(date) From d5657df93680e693dc00781a1b4b89dee7c0c9a0 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 31 Jul 2025 13:05:06 +0100 Subject: [PATCH 35/53] update message 1 --- sdgym/run_benchmark/utils.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py index 4ec85078..d5720241 100644 --- a/sdgym/run_benchmark/utils.py +++ b/sdgym/run_benchmark/utils.py @@ -5,6 +5,8 @@ from slack_sdk import WebClient +from sdgym.s3 import parse_s3_path + OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/' DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug' @@ -34,6 +36,13 @@ def get_result_folder_name(date_str): return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}' +def get_s3_console_link(bucket, prefix): + """Get the S3 console link for the specified 
bucket and prefix."""
+    return (
+        f'https://s3.console.aws.amazon.com/s3/buckets/{bucket}?prefix={prefix}&showversions=false'
+    )
+
+
 def _get_slack_client():
     """Create an authenticated Slack client.
 
@@ -55,20 +64,21 @@ def post_slack_message(channel, text):
 def post_benchmark_launch_message(date_str):
     """Post a message to the SDV Alerts Slack channel when the benchmark is launched."""
     channel = DEBUG_SLACK_CHANNEL
+    folder_name = get_result_folder_name(date_str)
+    bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
+    url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/')
     body = 'πŸƒ SDGym benchmark has been launched! EC2 Instances are running. '
-    body += 'Intermediate results can be found '
-    body += f'<{OUTPUT_DESTINATION_AWS}{get_result_folder_name(date_str)} |here>.\n'
+    body += f'Intermediate results can be found <<{url_link} |here>.\n'
     post_slack_message(channel, body)
 
 
 def post_run_summary(folder_name):
     """Post run summary to sdv-alerts slack channel."""
     channel = DEBUG_SLACK_CHANNEL
+    bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
+    url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/{folder_name}_summary.csv')
     body = ''
     body += f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for {folder_name} are available!πŸ‹οΈβ€β™€οΈ \n'
-    body += (
-        f'Check the results <{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary'
-        '.csv|here>.\n'
-    )
+    body += f'Check the results <<{url_link} |here>.\n'
 
     post_slack_message(channel, body)

From a5ccd6d15f8ce1639120d94bc212c649ab80a4c9 Mon Sep 17 00:00:00 2001
From: R-Palazzo
Date: Thu, 31 Jul 2025 15:36:59 +0100
Subject: [PATCH 36/53] update uploading workflow

---
 .../workflows/upload_benchmark_results.yml    | 34 +++++++++++++++++--
 sdgym/result_writer.py                        |  4 +--
 .../run_benchmark/upload_benchmark_results.py | 32 ++++++++++++++---
 sdgym/run_benchmark/utils.py                  | 15 ++++----
 tasks.py                                      | 11 ++++--
 .../test_upload_benchmark_result.py           | 28 +++++++++++++--
 6 files changed, 103 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml
index 195de346..77e8c27c 100644
--- a/.github/workflows/upload_benchmark_results.yml
+++ b/.github/workflows/upload_benchmark_results.yml
@@ -1,6 +1,9 @@
 name: Upload SDGym Benchmark results
 
 on:
+  push:
+    branches:
+      - issue-425-workflow-sdgym
   workflow_run:
     workflows: ["Run SDGym Benchmark"]
     types:
@@ -15,19 +18,21 @@
 jobs:
   upload-sdgym-benchmark:
     runs-on: ubuntu-latest
+
     steps:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
+
       - name: Set up latest Python
         uses: actions/setup-python@v5
         with:
           python-version-file: 'pyproject.toml'
+
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install invoke
-          python -m pip install -e .[dev]
+          python -m pip install --upgrade pip
+          python -m pip install -e .[dev]
 
       - name: Upload SDGym Benchmark
         env:
-          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }}
+          GITHUB_LOCAL_RESULTS_DIR: ${{ runner.temp }}/sdgym-leaderboard-files
         run: |
           invoke upload-benchmark-results
+      - name: Create pull request for SDGym benchmark result
+        id: create_pr
+        if: env.SKIP_UPLOAD != 'true'
+        uses: peter-evans/create-pull-request@v4
+        with:
+          token: ${{ secrets.GH_ACCESS_TOKEN }}
+          commit-message: "Upload SDGym Benchmark Results (${{ env.FOLDER_NAME }})"
+          author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
+          committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>"
+          title: "Upload SDGym Benchmark Results - ${{ env.FOLDER_NAME }}"
+          body: |
+            This is an **auto-generated PR** for uploading SDGym benchmark latest results. 
+ + branch: sdgym-benchmark-upload + branch-suffix: short-commit-hash + base: gatsby-home + + - name: Send Slack notification + if: env.SKIP_UPLOAD != 'true' + env: + SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} + + run: | + invoke notify-sdgym-benchmark-uploaded \ + --folder-name ${{ env.FOLDER_NAME }} \ + --pr-url "${{ steps.create_pr.outputs.pull-request-url }}" + diff --git a/sdgym/result_writer.py b/sdgym/result_writer.py index 067ee93c..b69a4070 100644 --- a/sdgym/result_writer.py +++ b/sdgym/result_writer.py @@ -33,11 +33,11 @@ def write_yaml(self, data, file_path, append=False): class LocalResultsWriter(ResultsWriter): """Results writer for local file system.""" - def write_dataframe(self, data, file_path, append=False): + def write_dataframe(self, data, file_path, append=False, index=False): """Write a DataFrame to a CSV file.""" file_path = Path(file_path) if file_path.exists() and append: - data.to_csv(file_path, mode='a', index=False, header=False) + data.to_csv(file_path, mode='a', index=index, header=False) else: data.to_csv(file_path, mode='w', index=False) diff --git a/sdgym/run_benchmark/upload_benchmark_results.py b/sdgym/run_benchmark/upload_benchmark_results.py index e61b8d0a..4395ea06 100644 --- a/sdgym/run_benchmark/upload_benchmark_results.py +++ b/sdgym/run_benchmark/upload_benchmark_results.py @@ -8,7 +8,7 @@ import boto3 from botocore.exceptions import ClientError -from sdgym.result_writer import S3ResultsWriter +from sdgym.result_writer import LocalResultsWriter, S3ResultsWriter from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS from sdgym.s3 import S3_REGION, parse_s3_path from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer @@ -62,7 +62,7 @@ def get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key) def upload_results( - aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix + aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix, github_env ): """Upload benchmark results to S3.""" result_explorer = SDGymResultsExplorer( @@ -71,16 +71,31 @@ def upload_results( aws_secret_access_key=aws_secret_access_key, ) result_writer = S3ResultsWriter(s3_client) - + local_results_writer = LocalResultsWriter() if not result_explorer.all_runs_complete(folder_name): LOGGER.warning(f'Run {folder_name} is not complete yet. Exiting.') + if github_env: + with open(github_env, 'a') as env_file: + env_file.write('SKIP_UPLOAD=true\n') + sys.exit(0) + else: + LOGGER.info(f'Run {folder_name} is complete! Proceeding with summarization...') + if github_env: + with open(github_env, 'a') as env_file: + env_file.write('SKIP_UPLOAD=false\n') + env_file.write(f'FOLDER_NAME={folder_name}\n') - LOGGER.info(f'Run {folder_name} is complete! 
Proceeding with summarization...') summary, _ = result_explorer.summarize(folder_name) result_writer.write_dataframe( summary, f'{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary.csv', index=True ) + local_export_dir = os.environ.get('GITHUB_LOCAL_RESULTS_DIR') + if local_export_dir: + local_results_writer.write_dataframe( + summary, f'{local_export_dir}/{folder_name}_summary.csv', index=True + ) + write_uploaded_marker(s3_client, bucket, prefix, folder_name) @@ -91,11 +106,18 @@ def main(): folder_name, s3_client, bucket, prefix = get_result_folder_name_and_s3_vars( aws_access_key_id, aws_secret_access_key ) + github_env = os.environ.get('GITHUB_ENV') if upload_already_done(s3_client, bucket, prefix, folder_name): LOGGER.warning('Benchmark results have already been uploaded. Exiting.') + if github_env: + with open(github_env, 'a') as env_file: + env_file.write('SKIP_UPLOAD=true\n') + sys.exit(0) - upload_results(aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix) + upload_results( + aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix, github_env + ) if __name__ == '__main__': diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py index d5720241..ac020f44 100644 --- a/sdgym/run_benchmark/utils.py +++ b/sdgym/run_benchmark/utils.py @@ -68,17 +68,20 @@ def post_benchmark_launch_message(date_str): bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/') body = 'πŸƒ SDGym benchmark has been launched! EC2 Instances are running. ' - body += f'Intermediate results can be found <<{url_link} |here>.\n' + body += f'Intermediate results can be found <{url_link} |here>.\n' post_slack_message(channel, body) -def post_run_summary(folder_name): - """Post run summary to sdv-alerts slack channel.""" +def post_benchmark_uploaded_message(folder_name, pr_url=None): + """Post benchmark uploaded message to sdv-alerts slack channel.""" channel = DEBUG_SLACK_CHANNEL bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/{folder_name}_summary.csv') - body = '' - body += f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for {folder_name} are available!πŸ‹οΈβ€β™€οΈ \n' - body += f'Check the results <<{url_link} |here>.\n' + body = ( + f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! 
πŸ‹οΈβ€β™€οΈ\n' + f'Check the results <{url_link} |here>.\n' + ) + if pr_url: + body += f'Waiting on merging this PR to update GitHub directory: <{pr_url}|PR Link>\n' post_slack_message(channel, body) diff --git a/tasks.py b/tasks.py index d2ec8f52..938b39f1 100644 --- a/tasks.py +++ b/tasks.py @@ -10,7 +10,7 @@ from invoke import task from packaging.requirements import Requirement from packaging.version import Version - +from sdgym.run_benchmark.utils import post_benchmark_uploaded_message COMPARISONS = {'>=': operator.ge, '>': operator.gt, '<': operator.lt, '<=': operator.le} EGG_STRING = '#egg=' @@ -209,6 +209,11 @@ def run_sdgym_benchmark(c): c.run('python sdgym/run_benchmark/run_benchmark.py') @task -def upload_benchmark_results(c, date=None): +def upload_benchmark_results(c): """Upload the benchmark results to S3.""" - c.run(f'python sdgym/run_benchmark/upload_benchmark_results.py {date}') + c.run(f'python sdgym/run_benchmark/upload_benchmark_results.py') + +@task +def notify_sdgym_benchmark_uploaded(c, folder_name, pr_url=None): + """Notify Slack about the SDGym benchmark upload.""" + post_benchmark_uploaded_message(folder_name, pr_url) \ No newline at end of file diff --git a/tests/unit/run_benchmark/test_upload_benchmark_result.py b/tests/unit/run_benchmark/test_upload_benchmark_result.py index 92757a84..d633bef8 100644 --- a/tests/unit/run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/run_benchmark/test_upload_benchmark_result.py @@ -101,7 +101,11 @@ def test_get_result_folder_name_and_s3_vars( @patch('sdgym.run_benchmark.upload_benchmark_results.write_uploaded_marker') @patch('sdgym.run_benchmark.upload_benchmark_results.LOGGER') @patch('sdgym.run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') +@patch('sdgym.run_benchmark.upload_benchmark_results.LocalResultsWriter') +@patch('sdgym.run_benchmark.upload_benchmark_results.os.environ.get') def test_upload_results( + mock_os_environ_get, + mock_local_results_writer, mock_output_destination_aws, mock_logger, mock_write_uploaded_marker, @@ -119,9 +123,18 @@ def test_upload_results( result_explorer_instance = mock_sdgym_results_explorer.return_value result_explorer_instance.all_runs_complete.return_value = True result_explorer_instance.summarize.return_value = ('summary', 'results') + mock_os_environ_get.return_value = '/tmp/sdgym_results' # Run - upload_results(aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix) + upload_results( + aws_access_key_id, + aws_secret_access_key, + run_name, + s3_client, + bucket, + prefix, + github_env=None, + ) # Assert mock_logger.info.assert_called_once_with( @@ -136,6 +149,9 @@ def test_upload_results( result_explorer_instance.summarize.assert_called_once_with(run_name) mock_s3_results_writer.return_value.write_dataframe.assert_called_once() mock_write_uploaded_marker.assert_called_once_with(s3_client, bucket, prefix, run_name) + mock_local_results_writer.return_value.write_dataframe.assert_called_once_with( + 'summary', '/tmp/sdgym_results/SDGym_results_10_01_2023_summary.csv', index=True + ) @patch('sdgym.run_benchmark.upload_benchmark_results.SDGymResultsExplorer') @@ -165,7 +181,13 @@ def test_upload_results_not_all_runs_complete( # Run with pytest.raises(SystemExit, match='0'): upload_results( - aws_access_key_id, aws_secret_access_key, run_name, s3_client, bucket, prefix + aws_access_key_id, + aws_secret_access_key, + run_name, + s3_client, + bucket, + prefix, + github_env=None, ) # Assert @@ -247,5 +269,5 @@ def test_main( ) 
mock_upload_already_done.assert_called_once_with('s3_client', 'bucket', 'prefix', 'run_name') mock_upload_results.assert_called_once_with( - 'my_access_key', 'my_secret_key', 'run_name', 's3_client', 'bucket', 'prefix' + 'my_access_key', 'my_secret_key', 'run_name', 's3_client', 'bucket', 'prefix', None ) From 84e4c7ca2c21207021b56412e327f124842a8bf9 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 31 Jul 2025 16:18:57 +0100 Subject: [PATCH 37/53] fix upload --- .github/workflows/readme.yml | 1 + .github/workflows/run_benchmark.yml | 3 - .../workflows/upload_benchmark_results.yml | 75 +++++++++++++------ .../run_benchmark/upload_benchmark_results.py | 3 +- .../test_upload_benchmark_result.py | 4 +- 5 files changed, 56 insertions(+), 30 deletions(-) diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml index 55a2b351..bcfda983 100644 --- a/.github/workflows/readme.yml +++ b/.github/workflows/readme.yml @@ -27,5 +27,6 @@ jobs: python -m pip install --upgrade pip python -m pip install invoke rundoc . python -m pip install tomli + python -m pip install slack-sdk - name: Run the README.md run: invoke readme diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index 78b4402f..2b0e27ba 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -1,9 +1,6 @@ name: Run SDGym Benchmark on: - push: - branches: - - issue-425-workflow-sdgym workflow_dispatch: schedule: - cron: '0 5 1 * *' diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index 77e8c27c..a0e648bf 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -15,19 +15,21 @@ on: jobs: upload-sdgym-benchmark: runs-on: ubuntu-latest + steps: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Set up latest Python uses: actions/setup-python@v5 with: python-version-file: 'pyproject.toml' + - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install invoke - python -m pip install -e .[dev] + python -m pip install --upgrade pip + python -m pip install -e .[dev] - name: Upload SDGym Benchmark env: @@ -35,33 +37,58 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} GITHUB_LOCAL_RESULTS_DIR: ${{ runner.temp }}/sdgym-leaderboard-files - run: | invoke upload-benchmark-results - - name: Create pull request for SDGym benchmark result - id: create_pr - if: env.SKIP_UPLOAD != 'true' - uses: peter-evans/create-pull-request@v4 - with: - token: ${{ secrets.GH_ACCESS_TOKEN }} - commit-message: "Upload SDGym Benchmark Results (${{ env.FOLDER_NAME }})" - author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" - committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" - title: "Upload SDGym Benchmark Results - ${{ env.FOLDER_NAME }}" - body: | - This is an **auto-generated PR** for uploading SDGym benchmark latest results. + echo "GITHUB_LOCAL_RESULTS_DIR=$GITHUB_LOCAL_RESULTS_DIR" >> $GITHUB_ENV + + - name: Check skip upload flag + run: | + if [ "${SKIP_UPLOAD}" = "true" ]; then + echo "Upload skipped. Exiting workflow." 
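The notification step above shells out to `invoke`, which in turn wraps `slack_sdk`; Slack's mrkdwn link markup is `<url|label>`, which is what the doubled `<<{url_link} ...>` fix earlier in this series is correcting. A small sketch of the underlying call (token and channel are placeholders):

    from slack_sdk import WebClient

    client = WebClient(token='xoxb-placeholder-token')
    client.chat_postMessage(
        channel='sdv-alerts-debug',
        text='Results are ready: <https://example.com/run|here>',  # <url|label> markup
    )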
+ exit 0 + fi + - name: Prepare summary file for PR + run: | + mkdir pr-staging + echo "Looking for: $GITHUB_LOCAL_RESULTS_DIR/${FOLDER_NAME}_summary.csv" + ls -l "$GITHUB_LOCAL_RESULTS_DIR" + cp "$GITHUB_LOCAL_RESULTS_DIR/${FOLDER_NAME}_summary.csv" \ + "pr-staging/SDGym_summary_${FOLDER_NAME}.csv" - branch: sdgym-benchmark-upload - branch-suffix: short-commit-hash - base: gatsby-home + - name: Checkout target repo (sdv-dev.github.io) + run: | + git clone https://github.com/sdv-dev/sdv-dev.github.io.git target-repo + cd target-repo + git checkout gatsby-home + - name: Copy summary and create PR + env: + FOLDER_NAME: ${{ env.FOLDER_NAME }} + run: | + cp pr-staging/SDGym_summary_${FOLDER_NAME}.csv target-repo/assets/ + + cd target-repo + git checkout -b sdgym-benchmark-upload-${FOLDER_NAME} + git config --local user.name "${GITHUB_ACTOR}" + git config --local user.email "${GITHUB_ACTOR_ID}+${GITHUB_ACTOR}@users.noreply.github.com" + + git add assets/ + git commit -m "Upload SDGym Benchmark Summary ($FOLDER_NAME)" + + git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/sdv-dev/sdv-dev.github.io.git + git push origin sdgym-benchmark-upload-${FOLDER_NAME} + + gh pr create \ + --repo sdv-dev/sdv-dev.github.io \ + --head sdgym-benchmark-upload-${FOLDER_NAME} \ + --base gatsby-home \ + --title "Upload SDGym Benchmark Summary ($FOLDER_NAME)" \ + --body "Automated benchmark summary upload" \ + --assignee "${{ github.actor }}" - name: Send Slack notification - if: env.SKIP_UPLOAD != 'true' env: SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} - run: | invoke notify-sdgym-benchmark-uploaded \ - --folder-name ${{ env.FOLDER_NAME }} \ - --pr-url "${{ steps.create_pr.outputs.pull-request-url }}" - + --folder-name $FOLDER_NAME \ + --pr-url "$(gh pr view --json url -q .url)" diff --git a/sdgym/run_benchmark/upload_benchmark_results.py b/sdgym/run_benchmark/upload_benchmark_results.py index 4395ea06..ef573be7 100644 --- a/sdgym/run_benchmark/upload_benchmark_results.py +++ b/sdgym/run_benchmark/upload_benchmark_results.py @@ -92,6 +92,7 @@ def upload_results( ) local_export_dir = os.environ.get('GITHUB_LOCAL_RESULTS_DIR') if local_export_dir: + os.makedirs(local_export_dir, exist_ok=True) local_results_writer.write_dataframe( summary, f'{local_export_dir}/{folder_name}_summary.csv', index=True ) @@ -106,7 +107,7 @@ def main(): folder_name, s3_client, bucket, prefix = get_result_folder_name_and_s3_vars( aws_access_key_id, aws_secret_access_key ) - github_env = os.environ.get('GITHUB_ENV') + github_env = os.getenv('GITHUB_ENV') if upload_already_done(s3_client, bucket, prefix, folder_name): LOGGER.warning('Benchmark results have already been uploaded. 
Exiting.') if github_env: diff --git a/tests/unit/run_benchmark/test_upload_benchmark_result.py b/tests/unit/run_benchmark/test_upload_benchmark_result.py index d633bef8..1d4826e2 100644 --- a/tests/unit/run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/run_benchmark/test_upload_benchmark_result.py @@ -217,7 +217,7 @@ def test_main_already_upload( ): """Test the `method` when results are already uploaded.""" # Setup - mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] + mock_getenv.side_effect = ['my_access_key', 'my_secret_key', None] mock_get_result_folder_name_and_s3_vars.return_value = ( 'run_name', 's3_client', @@ -251,7 +251,7 @@ def test_main( ): """Test the `main` method.""" # Setup - mock_getenv.side_effect = ['my_access_key', 'my_secret_key'] + mock_getenv.side_effect = ['my_access_key', 'my_secret_key', None] mock_get_result_folder_name_and_s3_vars.return_value = ( 'run_name', 's3_client', From 3eed4feed98c269be251968f1830002697540ea7 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Fri, 1 Aug 2025 15:12:15 +0100 Subject: [PATCH 38/53] add unit tests --- .../workflows/upload_benchmark_results.yml | 35 ++-- .../run_benchmark/upload_benchmark_results.py | 8 +- sdgym/run_benchmark/utils.py | 74 ++++++- tests/unit/run_benchmark/test__utils.py | 14 -- .../test_upload_benchmark_result.py | 15 +- tests/unit/run_benchmark/test_utils.py | 198 ++++++++++++++++++ 6 files changed, 308 insertions(+), 36 deletions(-) delete mode 100644 tests/unit/run_benchmark/test__utils.py create mode 100644 tests/unit/run_benchmark/test_utils.py diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index a0e648bf..53c9c05a 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -47,43 +47,48 @@ jobs: echo "Upload skipped. Exiting workflow." 
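`upload_already_done` and `write_uploaded_marker`, exercised throughout these hunks, form a simple idempotency guard: a marker object in S3 records that a run was already published, so scheduled re-runs exit early. A sketch of the pattern (the marker key layout is an assumption, not necessarily the repo's exact one):

    def upload_already_done(s3_client, bucket, prefix, folder_name):
        """Return True if the marker object for this run already exists."""
        marker_key = f'{prefix}{folder_name}/_UPLOADED'  # hypothetical key layout
        try:
            s3_client.head_object(Bucket=bucket, Key=marker_key)
            return True
        except s3_client.exceptions.ClientError:
            return False


    def write_uploaded_marker(s3_client, bucket, prefix, folder_name):
        """Record a completed upload as a zero-byte object."""
        s3_client.put_object(Bucket=bucket, Key=f'{prefix}{folder_name}/_UPLOADED', Body=b'')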
exit 0 fi - - name: Prepare summary file for PR + - name: Prepare files for PR run: | mkdir pr-staging - echo "Looking for: $GITHUB_LOCAL_RESULTS_DIR/${FOLDER_NAME}_summary.csv" + echo "Looking for files in: $GITHUB_LOCAL_RESULTS_DIR" ls -l "$GITHUB_LOCAL_RESULTS_DIR" - cp "$GITHUB_LOCAL_RESULTS_DIR/${FOLDER_NAME}_summary.csv" \ - "pr-staging/SDGym_summary_${FOLDER_NAME}.csv" + for f in "$GITHUB_LOCAL_RESULTS_DIR"/${FOLDER_NAME}_*.csv; do + base=$(basename "$f") + cp "$f" "pr-staging/SDGym_${base}" + done + + echo "Files staged for PR:" + ls -l pr-staging - name: Checkout target repo (sdv-dev.github.io) run: | git clone https://github.com/sdv-dev/sdv-dev.github.io.git target-repo cd target-repo git checkout gatsby-home - - name: Copy summary and create PR + + - name: Copy results and create PR env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} FOLDER_NAME: ${{ env.FOLDER_NAME }} run: | - cp pr-staging/SDGym_summary_${FOLDER_NAME}.csv target-repo/assets/ - + cp pr-staging/* target-repo/assets/ cd target-repo git checkout -b sdgym-benchmark-upload-${FOLDER_NAME} - git config --local user.name "${GITHUB_ACTOR}" - git config --local user.email "${GITHUB_ACTOR_ID}+${GITHUB_ACTOR}@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" git add assets/ - git commit -m "Upload SDGym Benchmark Summary ($FOLDER_NAME)" - - git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/sdv-dev/sdv-dev.github.io.git + git commit -m "Upload SDGym Benchmark Results ($FOLDER_NAME)" + git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/sdv-dev/sdv-dev.github.io.git git push origin sdgym-benchmark-upload-${FOLDER_NAME} + # Create PR gh pr create \ --repo sdv-dev/sdv-dev.github.io \ --head sdgym-benchmark-upload-${FOLDER_NAME} \ --base gatsby-home \ - --title "Upload SDGym Benchmark Summary ($FOLDER_NAME)" \ - --body "Automated benchmark summary upload" \ - --assignee "${{ github.actor }}" + --title "Upload SDGym Benchmark Results ($FOLDER_NAME)" \ + --body "Automated SDGym benchmark results upload" - name: Send Slack notification env: diff --git a/sdgym/run_benchmark/upload_benchmark_results.py b/sdgym/run_benchmark/upload_benchmark_results.py index ef573be7..4b0f575a 100644 --- a/sdgym/run_benchmark/upload_benchmark_results.py +++ b/sdgym/run_benchmark/upload_benchmark_results.py @@ -9,7 +9,7 @@ from botocore.exceptions import ClientError from sdgym.result_writer import LocalResultsWriter, S3ResultsWriter -from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS +from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, get_df_to_plot from sdgym.s3 import S3_REGION, parse_s3_path from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer @@ -86,7 +86,8 @@ def upload_results( env_file.write('SKIP_UPLOAD=false\n') env_file.write(f'FOLDER_NAME={folder_name}\n') - summary, _ = result_explorer.summarize(folder_name) + summary, results = result_explorer.summarize(folder_name) + df_to_plot = get_df_to_plot(results) result_writer.write_dataframe( summary, f'{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary.csv', index=True ) @@ -96,6 +97,9 @@ def upload_results( local_results_writer.write_dataframe( summary, f'{local_export_dir}/{folder_name}_summary.csv', index=True ) + local_results_writer.write_dataframe( + df_to_plot, f'{local_export_dir}/{folder_name}_plot_data.csv', index=False + ) write_uploaded_marker(s3_client, bucket, prefix, 
folder_name) diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py index ac020f44..47aa6d8e 100644 --- a/sdgym/run_benchmark/utils.py +++ b/sdgym/run_benchmark/utils.py @@ -3,6 +3,7 @@ import os from datetime import datetime +import numpy as np from slack_sdk import WebClient from sdgym.s3 import parse_s3_path @@ -12,6 +13,37 @@ DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug' SLACK_CHANNEL = 'sdv-alerts' KEY_DATE_FILE = '_BENCHMARK_DATES.json' +PLOTLY_MARKERS = [ + 'circle', + 'square', + 'diamond', + 'cross', + 'x', + 'triangle-up', + 'triangle-down', + 'triangle-left', + 'triangle-right', + 'pentagon', + 'hexagon', + 'hexagon2', + 'octagon', + 'star', + 'hexagram', + 'star-triangle-up', + 'star-triangle-down', + 'star-square', + 'star-diamond', + 'diamond-tall', + 'diamond-wide', + 'hourglass', + 'bowtie', + 'circle-cross', + 'circle-x', + 'square-cross', + 'square-x', + 'diamond-cross', + 'diamond-x', +] # The synthesizers inside the same list will be run by the same ec2 instance SYNTHESIZERS_SPLIT = [ @@ -68,7 +100,7 @@ def post_benchmark_launch_message(date_str): bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/') body = 'πŸƒ SDGym benchmark has been launched! EC2 Instances are running. ' - body += f'Intermediate results can be found <{url_link} |here>.\n' + body += f'Intermediate results can be found <{url_link}|here>.\n' post_slack_message(channel, body) @@ -85,3 +117,43 @@ def post_benchmark_uploaded_message(folder_name, pr_url=None): body += f'Waiting on merging this PR to update GitHub directory: <{pr_url}|PR Link>\n' post_slack_message(channel, body) + + +def get_df_to_plot(benchmark_result): + """Get the data to plot from the benchmark result. + + Args: + benchmark_result (DataFrame): The benchmark result DataFrame. + + Returns: + DataFrame: The data to plot. 
+ """ + df_to_plot = benchmark_result.copy() + df_to_plot['total_time'] = df_to_plot['Train_Time'] + df_to_plot['Sample_Time'] + df_to_plot['Aggregated_Time'] = df_to_plot.groupby('Synthesizer')['total_time'].transform('sum') + df_to_plot = ( + df_to_plot.groupby('Synthesizer')[['Aggregated_Time', 'Quality_Score']].mean().reset_index() + ) + df_to_plot['Log10 Aggregated_Time'] = df_to_plot['Aggregated_Time'].apply( + lambda x: np.log10(x) if x > 0 else 0 + ) + df_to_plot = df_to_plot.sort_values( + ['Aggregated_Time', 'Quality_Score'], ascending=[True, False] + ) + df_to_plot['Cumulative Quality Score'] = df_to_plot['Quality_Score'].cummax() + pareto_points = df_to_plot.loc[ + df_to_plot['Quality_Score'] == df_to_plot['Cumulative Quality Score'] + ] + df_to_plot['Pareto'] = df_to_plot.index.isin(pareto_points.index) + df_to_plot['Color'] = df_to_plot['Pareto'].apply(lambda x: '#01E0C9' if x else '#03AFF1') + df_to_plot['Synthesizer'] = df_to_plot['Synthesizer'].str.replace( + 'Synthesizer', '', regex=False + ) + + synthesizers = df_to_plot['Synthesizer'].unique() + marker_map = { + synth: PLOTLY_MARKERS[i % len(PLOTLY_MARKERS)] for i, synth in enumerate(synthesizers) + } + df_to_plot['Marker'] = df_to_plot['Synthesizer'].map(marker_map) + + return df_to_plot.drop(columns=['Cumulative Quality Score']).reset_index(drop=True) diff --git a/tests/unit/run_benchmark/test__utils.py b/tests/unit/run_benchmark/test__utils.py deleted file mode 100644 index 8ec5f6b4..00000000 --- a/tests/unit/run_benchmark/test__utils.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from sdgym.run_benchmark.utils import get_result_folder_name - - -def test_get_result_folder_name(): - """Test the `get_result_folder_name` method.""" - # Setup - expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.' 
- - # Run and Assert - assert get_result_folder_name('2023-10-01') == 'SDGym_results_10_01_2023' - with pytest.raises(ValueError, match=expected_error_message): - get_result_folder_name('invalid-date') diff --git a/tests/unit/run_benchmark/test_upload_benchmark_result.py b/tests/unit/run_benchmark/test_upload_benchmark_result.py index 1d4826e2..b8059b31 100644 --- a/tests/unit/run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/run_benchmark/test_upload_benchmark_result.py @@ -1,4 +1,4 @@ -from unittest.mock import Mock, patch +from unittest.mock import Mock, call, patch import pytest from botocore.exceptions import ClientError @@ -103,7 +103,9 @@ def test_get_result_folder_name_and_s3_vars( @patch('sdgym.run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') @patch('sdgym.run_benchmark.upload_benchmark_results.LocalResultsWriter') @patch('sdgym.run_benchmark.upload_benchmark_results.os.environ.get') +@patch('sdgym.run_benchmark.upload_benchmark_results.get_df_to_plot') def test_upload_results( + mock_get_df_to_plot, mock_os_environ_get, mock_local_results_writer, mock_output_destination_aws, @@ -124,6 +126,7 @@ def test_upload_results( result_explorer_instance.all_runs_complete.return_value = True result_explorer_instance.summarize.return_value = ('summary', 'results') mock_os_environ_get.return_value = '/tmp/sdgym_results' + mock_get_df_to_plot.return_value = 'df_to_plot' # Run upload_results( @@ -149,9 +152,13 @@ def test_upload_results( result_explorer_instance.summarize.assert_called_once_with(run_name) mock_s3_results_writer.return_value.write_dataframe.assert_called_once() mock_write_uploaded_marker.assert_called_once_with(s3_client, bucket, prefix, run_name) - mock_local_results_writer.return_value.write_dataframe.assert_called_once_with( - 'summary', '/tmp/sdgym_results/SDGym_results_10_01_2023_summary.csv', index=True - ) + mock_local_results_writer.return_value.write_dataframe.assert_has_calls([ + call('summary', '/tmp/sdgym_results/SDGym_results_10_01_2023_summary.csv', index=True), + call( + 'df_to_plot', '/tmp/sdgym_results/SDGym_results_10_01_2023_plot_data.csv', index=False + ), + ]) + mock_get_df_to_plot.assert_called_once_with('results') @patch('sdgym.run_benchmark.upload_benchmark_results.SDGymResultsExplorer') diff --git a/tests/unit/run_benchmark/test_utils.py b/tests/unit/run_benchmark/test_utils.py new file mode 100644 index 00000000..11cae876 --- /dev/null +++ b/tests/unit/run_benchmark/test_utils.py @@ -0,0 +1,198 @@ +from unittest.mock import patch + +import pandas as pd +import pytest + +from sdgym.run_benchmark.utils import ( + DEBUG_SLACK_CHANNEL, + OUTPUT_DESTINATION_AWS, + _get_slack_client, + get_df_to_plot, + get_result_folder_name, + get_s3_console_link, + post_benchmark_launch_message, + post_benchmark_uploaded_message, + post_slack_message, +) + + +def test_get_result_folder_name(): + """Test the `get_result_folder_name` method.""" + # Setup + expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.' 
+ + # Run and Assert + assert get_result_folder_name('2023-10-01') == 'SDGym_results_10_01_2023' + with pytest.raises(ValueError, match=expected_error_message): + get_result_folder_name('invalid-date') + + +def test_get_s3_console_link(): + """Test the `get_s3_console_link` method.""" + # Setup + bucket = 'my-bucket' + prefix = 'my-prefix/' + + # Run + link = get_s3_console_link(bucket, prefix) + + # Assert + expected_link = ( + f'https://s3.console.aws.amazon.com/s3/buckets/%7Bbucket%7D?prefix={prefix}&showversions=false' + ) + assert link == expected_link + + +@patch('sdgym.run_benchmark.utils.WebClient') +@patch('sdgym.run_benchmark.utils.os.getenv') +def test_get_slack_client(mock_getenv, mock_web_client): + """Test the `_get_slack_client` method.""" + # Setup + mock_getenv.return_value = 'xoxb-test-token' + + # Run + client = _get_slack_client() + + # Assert + mock_getenv.assert_called_once_with('SLACK_TOKEN') + mock_web_client.assert_called_once_with(token='xoxb-test-token') + assert client is mock_web_client.return_value + + +@patch('sdgym.run_benchmark.utils._get_slack_client') +def test_post_slack_message(mock_get_slack_client): + """Test the `post_slack_message` method.""" + # Setup + mock_slack_client = mock_get_slack_client.return_value + channel = 'test-channel' + text = 'Test message' + + # Run + post_slack_message(channel, text) + + # Assert + mock_get_slack_client.assert_called_once() + mock_slack_client.chat_postMessage.assert_called_once_with(channel=channel, text=text) + + +@patch('sdgym.run_benchmark.utils.post_slack_message') +@patch('sdgym.run_benchmark.utils.get_s3_console_link') +@patch('sdgym.run_benchmark.utils.parse_s3_path') +@patch('sdgym.run_benchmark.utils.get_result_folder_name') +def test_post_benchmark_launch_message( + mock_get_result_folder_name, + mock_parse_s3_path, + mock_get_s3_console_link, + mock_post_slack_message, +): + """Test the `post_benchmark_launch_message` method.""" + # Setup + date_str = '2023-10-01' + folder_name = 'SDGym_results_10_01_2023' + mock_get_result_folder_name.return_value = folder_name + mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') + url = 'https://s3.console.aws.amazon.com/' + mock_get_s3_console_link.return_value = url + expected_body = ( + 'πŸƒ SDGym benchmark has been launched! EC2 Instances are running. ' + f'Intermediate results can be found <{url}|here>.\n' + ) + # Run + post_benchmark_launch_message(date_str) + + # Assert + mock_get_result_folder_name.assert_called_once_with(date_str) + mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS) + mock_get_s3_console_link.assert_called_once_with('my-bucket', f'my-prefix/{folder_name}/') + mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body) + + +@patch('sdgym.run_benchmark.utils.post_slack_message') +@patch('sdgym.run_benchmark.utils.get_s3_console_link') +@patch('sdgym.run_benchmark.utils.parse_s3_path') +def test_post_benchmark_uploaded_message( + mock_parse_s3_path, + mock_get_s3_console_link, + mock_post_slack_message, +): + """Test the `post_benchmark_uploaded_message` method.""" + # Setup + folder_name = 'SDGym_results_10_01_2023' + mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') + url = 'https://s3.console.aws.amazon.com/' + mock_get_s3_console_link.return_value = url + expected_body = ( + f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! 
πŸ‹οΈβ€β™€οΈ\n' + f'Check the results <{url} |here>.\n' + ) + + # Run + post_benchmark_uploaded_message(folder_name) + + # Assert + mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body) + mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS) + mock_get_s3_console_link.assert_called_once_with( + 'my-bucket', f'my-prefix/{folder_name}/{folder_name}_summary.csv' + ) + + +@patch('sdgym.run_benchmark.utils.post_slack_message') +@patch('sdgym.run_benchmark.utils.get_s3_console_link') +@patch('sdgym.run_benchmark.utils.parse_s3_path') +def test_post_benchmark_uploaded_message_with_pull_request( + mock_parse_s3_path, + mock_get_s3_console_link, + mock_post_slack_message, +): + """Test the `post_benchmark_uploaded_message` with a pull request URL.""" + # Setup + folder_name = 'SDGym_results_10_01_2023' + pr_url = 'https://github.com/user/repo/pull/123' + mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') + url = 'https://s3.console.aws.amazon.com/' + mock_get_s3_console_link.return_value = url + expected_body = ( + f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! πŸ‹οΈβ€β™€οΈ\n' + f'Check the results <{url} |here>.\n' + f'Waiting on merging this PR to update GitHub directory: <{pr_url}|PR Link>\n' + ) + + # Run + post_benchmark_uploaded_message(folder_name, pr_url) + + # Assert + mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body) + mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS) + mock_get_s3_console_link.assert_called_once_with( + 'my-bucket', f'my-prefix/{folder_name}/{folder_name}_summary.csv' + ) + + +def test_get_df_to_plot(): + """Test the `get_df_to_plot` method.""" + # Setup + data = pd.DataFrame({ + 'Synthesizer': ( + ['GaussianCopulaSynthesizer'] * 2 + ['CTGANSynthesizer'] * 2 + ['TVAESynthesizer'] * 2 + ), + 'Dataset': ['Dataset1', 'Dataset2'] * 3, + 'Train_Time': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + 'Sample_Time': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + 'Quality_Score': [0.8, 0.9, 0.7, 0.6, 0.5, 0.4], + }) + + # Run + result = get_df_to_plot(data) + + # Assert + expected_result = pd.DataFrame({ + 'Synthesizer': ['GaussianCopula', 'CTGAN', 'TVAE'], + 'Aggregated_Time': [3.3, 7.7, 12.1], + 'Quality_Score': [0.85, 0.65, 0.45], + 'Log10 Aggregated_Time': [0.5185139398778875, 0.8864907251724818, 1.08278537031645], + 'Pareto': [True, False, False], + 'Color': ['#01E0C9', '#03AFF1', '#03AFF1'], + 'Marker': ['circle', 'square', 'diamond'], + }) + pd.testing.assert_frame_equal(result, expected_result) From 614b419d2b5f0b7e6a2ebdfbb8077148fff2202d Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Fri, 1 Aug 2025 17:51:51 +0100 Subject: [PATCH 39/53] clean run_benchmark --- .github/workflows/run_benchmark.yml | 2 +- .../workflows/upload_benchmark_results.yml | 31 +++++++++++-------- sdgym/run_benchmark/run_benchmark.py | 4 +-- sdgym/run_benchmark/utils.py | 9 ++---- .../unit/run_benchmark/test_run_benchmark.py | 2 +- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index 2b0e27ba..8f3704b7 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -3,7 +3,7 @@ name: Run SDGym Benchmark on: workflow_dispatch: schedule: - - cron: '0 5 1 * *' + - cron: '0 5 2 * *' jobs: run-sdgym-benchmark: diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index 53c9c05a..affcd3fb 100644 
--- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -35,43 +35,39 @@ jobs: env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} GITHUB_LOCAL_RESULTS_DIR: ${{ runner.temp }}/sdgym-leaderboard-files run: | invoke upload-benchmark-results echo "GITHUB_LOCAL_RESULTS_DIR=$GITHUB_LOCAL_RESULTS_DIR" >> $GITHUB_ENV - - name: Check skip upload flag - run: | - if [ "${SKIP_UPLOAD}" = "true" ]; then - echo "Upload skipped. Exiting workflow." - exit 0 - fi - name: Prepare files for PR + if: env.SKIP_UPLOAD != 'true' run: | mkdir pr-staging echo "Looking for files in: $GITHUB_LOCAL_RESULTS_DIR" ls -l "$GITHUB_LOCAL_RESULTS_DIR" for f in "$GITHUB_LOCAL_RESULTS_DIR"/${FOLDER_NAME}_*.csv; do base=$(basename "$f") - cp "$f" "pr-staging/SDGym_${base}" + cp "$f" "pr-staging/${base}" done echo "Files staged for PR:" ls -l pr-staging - name: Checkout target repo (sdv-dev.github.io) + if: env.SKIP_UPLOAD != 'true' run: | git clone https://github.com/sdv-dev/sdv-dev.github.io.git target-repo cd target-repo git checkout gatsby-home - name: Copy results and create PR + if: env.SKIP_UPLOAD != 'true' env: GH_TOKEN: ${{ secrets.GH_TOKEN }} FOLDER_NAME: ${{ env.FOLDER_NAME }} run: | - cp pr-staging/* target-repo/assets/ + cp pr-staging/* target-repo/assets/sdgym-leaderboard-files/ cd target-repo git checkout -b sdgym-benchmark-upload-${FOLDER_NAME} git config --local user.name "github-actions[bot]" @@ -82,18 +78,27 @@ jobs: git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/sdv-dev/sdv-dev.github.io.git git push origin sdgym-benchmark-upload-${FOLDER_NAME} - # Create PR gh pr create \ --repo sdv-dev/sdv-dev.github.io \ --head sdgym-benchmark-upload-${FOLDER_NAME} \ --base gatsby-home \ --title "Upload SDGym Benchmark Results ($FOLDER_NAME)" \ - --body "Automated SDGym benchmark results upload" + --body "Automated benchmark results upload" \ + --reviewer "pcarapic15" + + # Capture PR URL + PR_URL=$(gh pr view sdgym-benchmark-upload-${FOLDER_NAME} \ + --repo sdv-dev/sdv-dev.github.io \ + --json url -q .url) + + echo "PR URL: $PR_URL" + echo "PR_URL=$PR_URL" >> $GITHUB_ENV - name: Send Slack notification + if: env.SKIP_UPLOAD != 'true' env: SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} run: | invoke notify-sdgym-benchmark-uploaded \ - --folder-name $FOLDER_NAME \ - --pr-url "$(gh pr view --json url -q .url)" + --folder-name "$FOLDER_NAME" \ + --pr-url "$PR_URL" diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py index 5a8c9a05..5ae5c609 100644 --- a/sdgym/run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -46,14 +46,14 @@ def main(): aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d') - for synthesizer_group in SYNTHESIZERS_SPLIT[:2]: + for synthesizer_group in SYNTHESIZERS_SPLIT: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, - sdv_datasets=['expedia_hotel_logs', 'fake_companies'], aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, synthesizers=synthesizer_group, compute_privacy_score=False, + timeout=345600, # 4 days ) append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str) diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py index 47aa6d8e..1e409252 100644 --- 
a/sdgym/run_benchmark/utils.py
+++ b/sdgym/run_benchmark/utils.py
@@ -49,14 +49,11 @@
 SYNTHESIZERS_SPLIT = [
     ['UniformSynthesizer', 'ColumnSynthesizer', 'GaussianCopulaSynthesizer'],
     ['TVAESynthesizer'],
+    ['CopulaGANSynthesizer'],
+    ['CTGANSynthesizer'],
+    ['RealTabFormerSynthesizer'],
 ]
 
-"""
-['CopulaGANSynthesizer'],
-['CTGANSynthesizer'],
-['RealTabFormerSynthesizer'],
-"""
-
 
 def get_result_folder_name(date_str):
     """Get the result folder name based on the date string."""
diff --git a/tests/unit/run_benchmark/test_run_benchmark.py b/tests/unit/run_benchmark/test_run_benchmark.py
index ffa5211a..aacab84e 100644
--- a/tests/unit/run_benchmark/test_run_benchmark.py
+++ b/tests/unit/run_benchmark/test_run_benchmark.py
@@ -129,8 +129,8 @@ def test_main(
             aws_access_key_id='my_access_key',
             aws_secret_access_key='my_secret_key',
             synthesizers=synthesizer,
-            sdv_datasets=['expedia_hotel_logs', 'fake_companies'],
             compute_privacy_score=False,
+            timeout=345600,
         )
     )
 
From e63a58d49c4a609ada9948856597a810d5ec6fa6 Mon Sep 17 00:00:00 2001
From: R-Palazzo
Date: Mon, 4 Aug 2025 14:32:37 +0100
Subject: [PATCH 40/53] cleaning 1

---
 .github/workflows/run_benchmark.yml | 2 +-
 sdgym/benchmark.py                  | 2 +-
 sdgym/run_benchmark/utils.py        | 7 +++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml
index 8f3704b7..7e66075a 100644
--- a/.github/workflows/run_benchmark.yml
+++ b/.github/workflows/run_benchmark.yml
@@ -3,7 +3,7 @@ name: Run SDGym Benchmark
 on:
   workflow_dispatch:
   schedule:
-    - cron: '0 5 2 * *'
+    - cron: '0 5 5 * *'
 
 jobs:
   run-sdgym-benchmark:
diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py
index 1b9a94f9..b1a12e17 100644
--- a/sdgym/benchmark.py
+++ b/sdgym/benchmark.py
@@ -1297,7 +1297,7 @@ def _get_user_data_script(access_key, secret_key, region_name, script_content):
 
     echo "======== Install Dependencies in venv ============"
     pip install --upgrade pip
-    pip install "sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@issue-425-workflow-sdgym#egg=sdgym"
+    pip install sdgym[all]
     pip install s3fs
 
     echo "======== Write Script ==========="
diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py
index 1e409252..88b609d4 100644
--- a/sdgym/run_benchmark/utils.py
+++ b/sdgym/run_benchmark/utils.py
@@ -8,8 +8,8 @@
 
 from sdgym.s3 import parse_s3_path
 
-OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/'
-UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/'
+OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Benchmarks/'
+UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Benchmarks/'
 DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug'
 SLACK_CHANNEL = 'sdv-alerts'
 KEY_DATE_FILE = '_BENCHMARK_DATES.json'
@@ -47,8 +47,7 @@
 
 # The synthesizers inside the same list will be run by the same ec2 instance
 SYNTHESIZERS_SPLIT = [
-    ['UniformSynthesizer', 'ColumnSynthesizer', 'GaussianCopulaSynthesizer'],
-    ['TVAESynthesizer'],
+    ['UniformSynthesizer', 'ColumnSynthesizer', 'GaussianCopulaSynthesizer', 'TVAESynthesizer'],
     ['CopulaGANSynthesizer'],
     ['CTGANSynthesizer'],
     ['RealTabFormerSynthesizer'],
 ]
 
From a21c7f89fb2c509a21582f1a0ea9a105cdddf38c Mon Sep 17 00:00:00 2001
From: R-Palazzo
Date: Mon, 4 Aug 2025 18:34:51 +0100
Subject: [PATCH 41/53] launch benchmark with RealTabFormer

---
 .github/workflows/run_benchmark.yml | 3 +++
 sdgym/benchmark.py                  | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml
index 7e66075a..fcb2d2d6 100644
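The `pip install` line that patches 40 and 41 keep toggling selects between the released package and this work-in-progress branch inside the EC2 bootstrap script; the quoted form is a PEP 508 direct reference. As a standalone sketch (the branch name is the one used in this series; running it needs git and network access):

    import subprocess
    import sys

    # Steady state: released wheel from PyPI.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sdgym[all]'])

    # While debugging the workflow: install the branch via a PEP 508 direct reference.
    subprocess.check_call([
        sys.executable, '-m', 'pip', 'install',
        'sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@issue-425-workflow-sdgym',
    ])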
--- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -1,6 +1,9 @@ name: Run SDGym Benchmark on: + push: + branches: + - issue-425-workflow-sdgym workflow_dispatch: schedule: - cron: '0 5 5 * *' diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index b1a12e17..1b9a94f9 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1297,7 +1297,7 @@ def _get_user_data_script(access_key, secret_key, region_name, script_content): echo "======== Install Dependencies in venv ============" pip install --upgrade pip - pip install sdgym[all] + pip install "sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@issue-425-workflow-sdgym#egg=sdgym" pip install s3fs echo "======== Write Script ===========" From fad69082936fc03a4851f99a274048872fa5c071 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 5 Aug 2025 10:32:52 +0100 Subject: [PATCH 42/53] debug run with timeout 1 --- .github/workflows/upload_benchmark_results.yml | 3 --- sdgym/benchmark.py | 2 ++ sdgym/run_benchmark/run_benchmark.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index affcd3fb..dc31af68 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -1,9 +1,6 @@ name: Upload SDGym Benchmark results on: - push: - branches: - - issue-425-workflow-sdgym workflow_run: workflows: ["Run SDGym Benchmark"] types: diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 1b9a94f9..b3ac218d 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -677,6 +677,8 @@ def _run_job(args): output = {} try: if timeout: + print('LAAA') + print(timeout) output = _score_with_timeout( timeout=timeout, synthesizer=synthesizer, diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py index 5ae5c609..0ebfbcf0 100644 --- a/sdgym/run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -51,13 +51,14 @@ def main(): output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, + sdv_datasets=['expedia_hotel_logs', 'fake_companies'], synthesizers=synthesizer_group, compute_privacy_score=False, timeout=345600, # 4 days ) append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str) - post_benchmark_launch_message(date_str) + #post_benchmark_launch_message(date_str) if __name__ == '__main__': From 2556dc4f92950a36facc562137829ff94ee9d79c Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 5 Aug 2025 11:08:15 +0100 Subject: [PATCH 43/53] debug run with timeout 2 --- sdgym/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index b3ac218d..e4408095 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -1275,7 +1275,7 @@ def _get_s3_script_content( scores = _run_jobs(None, job_args_list, False, result_writer=result_writer) run_id_filename = job_args_list[0][-1]['run_id'] _update_run_id_file(run_id_filename, result_writer) -s3_client.delete_object(Bucket='{bucket_name}', Key='{job_args_key}') +#s3_client.delete_object(Bucket='{bucket_name}', Key='{job_args_key}') """ From 0b3f0f69bc94f25cf925fa9014df3f2e0bc747bf Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 5 Aug 2025 11:47:03 +0100 Subject: [PATCH 44/53] debug run with timeout 3 --- sdgym/benchmark.py | 34 +++++++++++++++++++++------- sdgym/run_benchmark/run_benchmark.py | 2 +- 2 files changed, 27 
insertions(+), 9 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index e4408095..fcb26a12 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -547,8 +547,15 @@ def _score_with_timeout( with multiprocessing_context(): with multiprocessing.Manager() as manager: output = manager.dict() + + def safe_score(*args): + try: + _score(*args) + except Exception as e: + output['error'] = str(e) + process = multiprocessing.Process( - target=_score, + target=safe_score, args=( synthesizer, data, @@ -567,13 +574,26 @@ def _score_with_timeout( process.start() process.join(timeout) - process.terminate() - output = dict(output) - if output.get('timeout'): - LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name) + if process.is_alive(): + output['timeout'] = True + process.terminate() + process.join() # ensure termination completes + + result = dict(output) + if result.get('timeout'): + LOGGER.error( + 'Timeout running %s on dataset %s', + synthesizer['name'], dataset_name + ) + elif result.get('error'): + LOGGER.error( + 'Error running %s on dataset %s: %s', + synthesizer['name'], dataset_name, result['error'] + ) + + return result - return output def _format_output( @@ -677,8 +697,6 @@ def _run_job(args): output = {} try: if timeout: - print('LAAA') - print(timeout) output = _score_with_timeout( timeout=timeout, synthesizer=synthesizer, diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py index 0ebfbcf0..5dc59e1c 100644 --- a/sdgym/run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -46,7 +46,7 @@ def main(): aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d') - for synthesizer_group in SYNTHESIZERS_SPLIT: + for synthesizer_group in SYNTHESIZERS_SPLIT[:2]: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_access_key_id, From b93a237898523644a043aeead4570e2955c2c8a9 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 5 Aug 2025 13:34:00 +0100 Subject: [PATCH 45/53] debug run with timeout 4 --- sdgym/benchmark.py | 81 ++++++++++++++-------------- sdgym/run_benchmark/run_benchmark.py | 3 +- 2 files changed, 40 insertions(+), 44 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index fcb26a12..4ba4c5b4 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -8,6 +8,7 @@ import pickle import re import textwrap +import threading import tracemalloc import warnings from collections import defaultdict @@ -42,7 +43,7 @@ from sdgym.errors import SDGymError from sdgym.metrics import get_metrics from sdgym.progress import TqdmLogger, progress -from sdgym.result_writer import LocalResultsWriter +from sdgym.result_writer import LocalResultsWriter, S3ResultsWriter from sdgym.s3 import ( S3_PREFIX, S3_REGION, @@ -544,56 +545,52 @@ def _score_with_timeout( synthesizer_path=None, result_writer=None, ): + output = {} + args = ( + synthesizer, + data, + metadata, + metrics, + output, + compute_quality_score, + compute_diagnostic_score, + compute_privacy_score, + modality, + dataset_name, + synthesizer_path, + result_writer, + ) + if isinstance(result_writer, S3ResultsWriter): + process = threading.Thread( + target=_score, + args=args, + daemon=True, + ) + process.start() + process.join(timeout) + if process.is_alive(): + LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name) + return {'timeout': True, 
'error': 'Timeout'} + + return process.result + with multiprocessing_context(): with multiprocessing.Manager() as manager: output = manager.dict() - - def safe_score(*args): - try: - _score(*args) - except Exception as e: - output['error'] = str(e) - process = multiprocessing.Process( - target=safe_score, - args=( - synthesizer, - data, - metadata, - metrics, - output, - compute_quality_score, - compute_diagnostic_score, - compute_privacy_score, - modality, - dataset_name, - synthesizer_path, - result_writer, - ), + target=_score, + args=args, ) process.start() process.join(timeout) + process.terminate() - if process.is_alive(): - output['timeout'] = True - process.terminate() - process.join() # ensure termination completes - - result = dict(output) - if result.get('timeout'): - LOGGER.error( - 'Timeout running %s on dataset %s', - synthesizer['name'], dataset_name - ) - elif result.get('error'): - LOGGER.error( - 'Error running %s on dataset %s: %s', - synthesizer['name'], dataset_name, result['error'] - ) - - return result + output = dict(output) + if output.get('timeout'): + LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name) + return output def _format_output( @@ -1293,7 +1290,7 @@ def _get_s3_script_content( scores = _run_jobs(None, job_args_list, False, result_writer=result_writer) run_id_filename = job_args_list[0][-1]['run_id'] _update_run_id_file(run_id_filename, result_writer) -#s3_client.delete_object(Bucket='{bucket_name}', Key='{job_args_key}') +s3_client.delete_object(Bucket='{bucket_name}', Key='{job_args_key}') """ diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py index 5dc59e1c..56067f1a 100644 --- a/sdgym/run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -12,7 +12,6 @@ OUTPUT_DESTINATION_AWS, SYNTHESIZERS_SPLIT, get_result_folder_name, - post_benchmark_launch_message, ) from sdgym.s3 import get_s3_client, parse_s3_path @@ -58,7 +57,7 @@ def main(): ) append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str) - #post_benchmark_launch_message(date_str) + # post_benchmark_launch_message(date_str) if __name__ == '__main__': From 734045e201bc3e262a83bedda614113351a89194 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 5 Aug 2025 13:54:34 +0100 Subject: [PATCH 46/53] debug run with timeout 5 --- sdgym/benchmark.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 4ba4c5b4..f34f23db 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -545,7 +545,7 @@ def _score_with_timeout( synthesizer_path=None, result_writer=None, ): - output = {} + output = {} if isinstance(result_writer, S3ResultsWriter) else None args = ( synthesizer, data, @@ -561,36 +561,28 @@ def _score_with_timeout( result_writer, ) if isinstance(result_writer, S3ResultsWriter): - process = threading.Thread( - target=_score, - args=args, - daemon=True, - ) - process.start() - process.join(timeout) - if process.is_alive(): + thread = threading.Thread(target=_score, args=args, daemon=True) + thread.start() + thread.join(timeout) + if thread.is_alive(): LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name) return {'timeout': True, 'error': 'Timeout'} - return process.result + return output with multiprocessing_context(): with multiprocessing.Manager() as manager: output = manager.dict() - process = multiprocessing.Process( - target=_score, - args=args, - ) - + args = 
args[:4] + (output,) + args[5:] # replace output=None with manager.dict() + process = multiprocessing.Process(target=_score, args=args) process.start() process.join(timeout) - process.terminate() - - output = dict(output) - if output.get('timeout'): + if process.is_alive(): LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name) + process.terminate() + return {'timeout': True, 'error': 'Timeout'} - return output + return dict(output) def _format_output( From e38dde416f3ab9439b11adacbd17d5718097c350 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 5 Aug 2025 14:27:22 +0100 Subject: [PATCH 47/53] test with 1s time out --- sdgym/benchmark.py | 9 +++++---- sdgym/run_benchmark/run_benchmark.py | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index f34f23db..384203b0 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -577,12 +577,13 @@ def _score_with_timeout( process = multiprocessing.Process(target=_score, args=args) process.start() process.join(timeout) - if process.is_alive(): + process.terminate() + + output = dict(output) + if output.get('timeout'): LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name) - process.terminate() - return {'timeout': True, 'error': 'Timeout'} - return dict(output) + return output def _format_output( diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py index 56067f1a..c4373ab3 100644 --- a/sdgym/run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -12,6 +12,7 @@ OUTPUT_DESTINATION_AWS, SYNTHESIZERS_SPLIT, get_result_folder_name, + post_benchmark_launch_message, ) from sdgym.s3 import get_s3_client, parse_s3_path @@ -53,11 +54,11 @@ def main(): sdv_datasets=['expedia_hotel_logs', 'fake_companies'], synthesizers=synthesizer_group, compute_privacy_score=False, - timeout=345600, # 4 days + timeout=1, # 4 days ) append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str) - # post_benchmark_launch_message(date_str) + post_benchmark_launch_message(date_str) if __name__ == '__main__': From 584f3a7824900e211cf25d8ee7da4c72de2c4048 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Tue, 5 Aug 2025 14:48:12 +0100 Subject: [PATCH 48/53] run benchmark with timeout --- sdgym/run_benchmark/run_benchmark.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py index c4373ab3..5ae5c609 100644 --- a/sdgym/run_benchmark/run_benchmark.py +++ b/sdgym/run_benchmark/run_benchmark.py @@ -46,15 +46,14 @@ def main(): aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d') - for synthesizer_group in SYNTHESIZERS_SPLIT[:2]: + for synthesizer_group in SYNTHESIZERS_SPLIT: benchmark_single_table_aws( output_destination=OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, - sdv_datasets=['expedia_hotel_logs', 'fake_companies'], synthesizers=synthesizer_group, compute_privacy_score=False, - timeout=1, # 4 days + timeout=345600, # 4 days ) append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str) From f01d7131149396313dd73b42f4e16f0655273ebc Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 18 Aug 2025 14:41:57 +0100 Subject: [PATCH 49/53] directly commit changes instead of creating a PR --- 
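Note on the timeout handling settled in patches 42-48 above: when results stream through `S3ResultsWriter`, `_score_with_timeout` runs the scorer on a daemon thread rather than a subprocess, with the inherent caveat that a timed-out thread cannot be killed, only abandoned. A condensed sketch of that branch, assuming a `_score`-style callable that fills a shared `output` dict:

    import threading


    def run_with_timeout(score_fn, timeout, *args):
        """Run `score_fn(output, *args)` on a daemon thread; flag a timeout if it hangs."""
        output = {}
        thread = threading.Thread(target=score_fn, args=(output, *args), daemon=True)
        thread.start()
        thread.join(timeout)
        if thread.is_alive():
            # The thread keeps running in the background until the process exits.
            return {'timeout': True, 'error': 'Timeout'}
        return output


    print(run_with_timeout(lambda out: out.update(score=0.9), timeout=5))  # {'score': 0.9}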
.github/workflows/run_benchmark.yml | 3 -- .../workflows/upload_benchmark_results.yml | 29 +++++++------------ sdgym/run_benchmark/utils.py | 6 ++-- tasks.py | 4 +-- tests/unit/run_benchmark/test_utils.py | 10 +++---- 5 files changed, 21 insertions(+), 31 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index fcb2d2d6..7e66075a 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -1,9 +1,6 @@ name: Run SDGym Benchmark on: - push: - branches: - - issue-425-workflow-sdgym workflow_dispatch: schedule: - cron: '0 5 5 * *' diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index dc31af68..fd8da7f4 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -1,6 +1,9 @@ name: Upload SDGym Benchmark results on: + push: + branches: + - issue-425-workflow-sdgym workflow_run: workflows: ["Run SDGym Benchmark"] types: @@ -66,30 +69,20 @@ jobs: run: | cp pr-staging/* target-repo/assets/sdgym-leaderboard-files/ cd target-repo - git checkout -b sdgym-benchmark-upload-${FOLDER_NAME} + git checkout gatsby-home git config --local user.name "github-actions[bot]" git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" git add assets/ - git commit -m "Upload SDGym Benchmark Results ($FOLDER_NAME)" + git commit -m "Upload SDGym Benchmark Results ($FOLDER_NAME)" || echo "No changes to commit" git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/sdv-dev/sdv-dev.github.io.git - git push origin sdgym-benchmark-upload-${FOLDER_NAME} - - gh pr create \ - --repo sdv-dev/sdv-dev.github.io \ - --head sdgym-benchmark-upload-${FOLDER_NAME} \ - --base gatsby-home \ - --title "Upload SDGym Benchmark Results ($FOLDER_NAME)" \ - --body "Automated benchmark results upload" \ - --reviewer "pcarapic15" + git push origin gatsby-home - # Capture PR URL - PR_URL=$(gh pr view sdgym-benchmark-upload-${FOLDER_NAME} \ - --repo sdv-dev/sdv-dev.github.io \ - --json url -q .url) + COMMIT_HASH=$(git rev-parse HEAD) + COMMIT_URL="https://github.com/sdv-dev/sdv-dev.github.io/commit/$%7BCOMMIT_HASH%7D" - echo "PR URL: $PR_URL" - echo "PR_URL=$PR_URL" >> $GITHUB_ENV + echo "Commit URL: $COMMIT_URL" + echo "COMMIT_URL=$COMMIT_URL" >> $GITHUB_ENV - name: Send Slack notification if: env.SKIP_UPLOAD != 'true' @@ -98,4 +91,4 @@ jobs: run: | invoke notify-sdgym-benchmark-uploaded \ --folder-name "$FOLDER_NAME" \ - --pr-url "$PR_URL" + --commit-url "$COMMIT_URL" diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py index 88b609d4..9adc4923 100644 --- a/sdgym/run_benchmark/utils.py +++ b/sdgym/run_benchmark/utils.py @@ -100,7 +100,7 @@ def post_benchmark_launch_message(date_str): post_slack_message(channel, body) -def post_benchmark_uploaded_message(folder_name, pr_url=None): +def post_benchmark_uploaded_message(folder_name, commit_url=None): """Post benchmark uploaded message to sdv-alerts slack channel.""" channel = DEBUG_SLACK_CHANNEL bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS) @@ -109,8 +109,8 @@ def post_benchmark_uploaded_message(folder_name, pr_url=None): f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! 
πŸ‹οΈβ€β™€οΈ\n' f'Check the results <{url_link} |here>.\n' ) - if pr_url: - body += f'Waiting on merging this PR to update GitHub directory: <{pr_url}|PR Link>\n' + if commit_url: + body += f'or on GitHub: <{commit_url}|Commit Link>\n' post_slack_message(channel, body) diff --git a/tasks.py b/tasks.py index 938b39f1..0f8a47b5 100644 --- a/tasks.py +++ b/tasks.py @@ -214,6 +214,6 @@ def upload_benchmark_results(c): c.run(f'python sdgym/run_benchmark/upload_benchmark_results.py') @task -def notify_sdgym_benchmark_uploaded(c, folder_name, pr_url=None): +def notify_sdgym_benchmark_uploaded(c, folder_name, commit_url=None): """Notify Slack about the SDGym benchmark upload.""" - post_benchmark_uploaded_message(folder_name, pr_url) \ No newline at end of file + post_benchmark_uploaded_message(folder_name, commit_url) \ No newline at end of file diff --git a/tests/unit/run_benchmark/test_utils.py b/tests/unit/run_benchmark/test_utils.py index 11cae876..db88c8d4 100644 --- a/tests/unit/run_benchmark/test_utils.py +++ b/tests/unit/run_benchmark/test_utils.py @@ -140,26 +140,26 @@ def test_post_benchmark_uploaded_message( @patch('sdgym.run_benchmark.utils.post_slack_message') @patch('sdgym.run_benchmark.utils.get_s3_console_link') @patch('sdgym.run_benchmark.utils.parse_s3_path') -def test_post_benchmark_uploaded_message_with_pull_request( +def test_post_benchmark_uploaded_message_with_commit( mock_parse_s3_path, mock_get_s3_console_link, mock_post_slack_message, ): - """Test the `post_benchmark_uploaded_message` with a pull request URL.""" + """Test the `post_benchmark_uploaded_message` with a commit URL.""" # Setup folder_name = 'SDGym_results_10_01_2023' - pr_url = 'https://github.com/user/repo/pull/123' + commit_url = 'https://github.com/user/repo/pull/123' mock_parse_s3_path.return_value = ('my-bucket', 'my-prefix/') url = 'https://s3.console.aws.amazon.com/' mock_get_s3_console_link.return_value = url expected_body = ( f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! 
πŸ‹οΈβ€β™€οΈ\n' f'Check the results <{url} |here>.\n' - f'Waiting on merging this PR to update GitHub directory: <{pr_url}|PR Link>\n' + f'or on GitHub: <{commit_url}|Commit Link>\n' ) # Run - post_benchmark_uploaded_message(folder_name, pr_url) + post_benchmark_uploaded_message(folder_name, commit_url) # Assert mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body) From 715cebf7a1fc4f1d519e9035b6d92ea95bf0c48d Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 18 Aug 2025 15:43:42 +0100 Subject: [PATCH 50/53] fix minimum version --- pyproject.toml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f580b20b..c0ea5cf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,10 +47,8 @@ dependencies = [ "scipy>=1.12.0;python_version>='3.12' and python_version<'3.13'", "scipy>=1.14.1;python_version>='3.13'", 'tabulate>=0.8.3,<0.9', - "torch>=1.13.0;python_version<'3.11'", - "torch>=2.0.0;python_version>='3.11' and python_version<'3.12'", - "torch>=2.2.0;python_version>='3.12' and python_version<'3.13'", - "torch>=2.6.0;python_version>='3.13'", + "torch>=2.2.0;python_version>='3.8' and python_version<'3.9'", + "torch>=2.6.0;python_version>='3.9'", 'tqdm>=4.66.3', 'XlsxWriter>=1.2.8', 'rdt>=1.17.0', @@ -71,10 +69,9 @@ sdgym = { main = 'sdgym.cli.__main__:main' } [project.optional-dependencies] dask = ['dask', 'distributed'] realtabformer = [ - 'realtabformer>=0.2.2', - "torch>=2.1.0;python_version>='3.8' and python_version<'3.12'", - "torch>=2.2.0;python_version>='3.12' and python_version<'3.13'", - "torch>=2.6.0;python_version>='3.13'", + 'realtabformer>=0.2.3', + "torch>=2.2.0;python_version>='3.8' and python_version<'3.9'", + "torch>=2.6.0;python_version>='3.9'", 'transformers<4.51', ] test = [ From 7eb91af974e0213a3a48921c3a85a8f394289a43 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Mon, 18 Aug 2025 17:09:59 +0100 Subject: [PATCH 51/53] clean up --- .github/workflows/upload_benchmark_results.yml | 7 ++----- sdgym/result_writer.py | 2 +- sdgym/run_benchmark/utils.py | 4 ++-- tests/unit/run_benchmark/test_utils.py | 4 ++-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index fd8da7f4..e70d5354 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -1,9 +1,6 @@ name: Upload SDGym Benchmark results on: - push: - branches: - - issue-425-workflow-sdgym workflow_run: workflows: ["Run SDGym Benchmark"] types: @@ -40,7 +37,7 @@ jobs: invoke upload-benchmark-results echo "GITHUB_LOCAL_RESULTS_DIR=$GITHUB_LOCAL_RESULTS_DIR" >> $GITHUB_ENV - - name: Prepare files for PR + - name: Prepare files for commit if: env.SKIP_UPLOAD != 'true' run: | mkdir pr-staging @@ -61,7 +58,7 @@ jobs: cd target-repo git checkout gatsby-home - - name: Copy results and create PR + - name: Copy results and commit if: env.SKIP_UPLOAD != 'true' env: GH_TOKEN: ${{ secrets.GH_TOKEN }} diff --git a/sdgym/result_writer.py b/sdgym/result_writer.py index b69a4070..05702b7e 100644 --- a/sdgym/result_writer.py +++ b/sdgym/result_writer.py @@ -39,7 +39,7 @@ def write_dataframe(self, data, file_path, append=False, index=False): if file_path.exists() and append: data.to_csv(file_path, mode='a', index=index, header=False) else: - data.to_csv(file_path, mode='w', index=False) + data.to_csv(file_path, mode='w', index=index, header=True) def write_pickle(self, obj, 
file_path): """Write a Python object to a pickle file.""" diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py index 9adc4923..28a52d03 100644 --- a/sdgym/run_benchmark/utils.py +++ b/sdgym/run_benchmark/utils.py @@ -107,10 +107,10 @@ def post_benchmark_uploaded_message(folder_name, commit_url=None): url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/{folder_name}_summary.csv') body = ( f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! πŸ‹οΈβ€β™€οΈ\n' - f'Check the results <{url_link} |here>.\n' + f'Check the results <{url_link} |here>' ) if commit_url: - body += f'or on GitHub: <{commit_url}|Commit Link>\n' + body += f' or on GitHub: <{commit_url}|Commit Link>\n' post_slack_message(channel, body) diff --git a/tests/unit/run_benchmark/test_utils.py b/tests/unit/run_benchmark/test_utils.py index db88c8d4..7aa8c126 100644 --- a/tests/unit/run_benchmark/test_utils.py +++ b/tests/unit/run_benchmark/test_utils.py @@ -123,7 +123,7 @@ def test_post_benchmark_uploaded_message( mock_get_s3_console_link.return_value = url expected_body = ( f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! πŸ‹οΈβ€β™€οΈ\n' - f'Check the results <{url} |here>.\n' + f'Check the results <{url} |here>' ) # Run @@ -154,7 +154,7 @@ def test_post_benchmark_uploaded_message_with_commit( mock_get_s3_console_link.return_value = url expected_body = ( f'πŸ€ΈπŸ»β€β™€οΈ SDGym benchmark results for *{folder_name}* are available! πŸ‹οΈβ€β™€οΈ\n' - f'Check the results <{url} |here>.\n' + f'Check the results <{url} |here> ' f'or on GitHub: <{commit_url}|Commit Link>\n' ) From 91182e354f80643a4b87cb4c46a3f6d2d974b686 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Thu, 21 Aug 2025 12:35:16 +0100 Subject: [PATCH 52/53] immediate update 1 --- sdgym/result_writer.py | 17 +++++++++++ .../run_benchmark/upload_benchmark_results.py | 28 +++++++++++-------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/sdgym/result_writer.py b/sdgym/result_writer.py index 05702b7e..2daec4fc 100644 --- a/sdgym/result_writer.py +++ b/sdgym/result_writer.py @@ -41,6 +41,23 @@ def write_dataframe(self, data, file_path, append=False, index=False): else: data.to_csv(file_path, mode='w', index=index, header=True) + def write_xlsx(self, dataframes, file_path, index=False): + """ + Write DataFrames to an Excel file, updating existing sheets or adding new ones, + while keeping all other sheets intact. 
+ """ + file_path = Path(file_path) + + if file_path.exists(): + with pd.ExcelWriter(file_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer: + for sheet_name, df in dataframes.items(): + df.to_excel(writer, sheet_name=sheet_name, index=index) + else: + with pd.ExcelWriter(file_path, engine="openpyxl") as writer: + for sheet_name, df in dataframes.items(): + df.to_excel(writer, sheet_name=sheet_name, index=index) + + def write_pickle(self, obj, file_path): """Write a Python object to a pickle file.""" with open(file_path, 'wb') as f: diff --git a/sdgym/run_benchmark/upload_benchmark_results.py b/sdgym/run_benchmark/upload_benchmark_results.py index 4b0f575a..0edf4dcf 100644 --- a/sdgym/run_benchmark/upload_benchmark_results.py +++ b/sdgym/run_benchmark/upload_benchmark_results.py @@ -23,7 +23,7 @@ def get_latest_run_from_file(s3_client, bucket, key): body = object['Body'].read().decode('utf-8') data = json.loads(body) latest = sorted(data['runs'], key=lambda x: x['date'])[-1] - return latest['folder_name'] + return latest except s3_client.exceptions.ClientError as e: raise RuntimeError(f'Failed to read {key} from S3: {e}') @@ -56,15 +56,17 @@ def get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key) aws_secret_access_key=aws_secret_access_key, region_name=S3_REGION, ) - folder_name = get_latest_run_from_file(s3_client, bucket, f'{prefix}_BENCHMARK_DATES.json') + folder_infos = get_latest_run_from_file(s3_client, bucket, f'{prefix}_BENCHMARK_DATES.json') - return folder_name, s3_client, bucket, prefix + return folder_infos['folder_name'], s3_client, bucket, prefix def upload_results( - aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix, github_env + aws_access_key_id, aws_secret_access_key, folder_infos, s3_client, bucket, prefix, github_env ): """Upload benchmark results to S3.""" + folder_name = folder_infos['folder_name'] + run_date = folder_infos['date'] result_explorer = SDGymResultsExplorer( OUTPUT_DESTINATION_AWS, aws_access_key_id=aws_access_key_id, @@ -94,11 +96,13 @@ def upload_results( local_export_dir = os.environ.get('GITHUB_LOCAL_RESULTS_DIR') if local_export_dir: os.makedirs(local_export_dir, exist_ok=True) - local_results_writer.write_dataframe( - summary, f'{local_export_dir}/{folder_name}_summary.csv', index=True - ) - local_results_writer.write_dataframe( - df_to_plot, f'{local_export_dir}/{folder_name}_plot_data.csv', index=False + datas = { + 'Wins': summary, + f'{run_date}_plot_data': df_to_plot, + f'{run_date}_Detailed_results': results, + } + local_results_writer.write_xlsx( + datas, f'{local_export_dir}/{folder_name}_results.xlsx' ) write_uploaded_marker(s3_client, bucket, prefix, folder_name) @@ -108,11 +112,11 @@ def main(): """Main function to upload benchmark results.""" aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') - folder_name, s3_client, bucket, prefix = get_result_folder_name_and_s3_vars( + folder_infos, s3_client, bucket, prefix = get_result_folder_name_and_s3_vars( aws_access_key_id, aws_secret_access_key ) github_env = os.getenv('GITHUB_ENV') - if upload_already_done(s3_client, bucket, prefix, folder_name): + if upload_already_done(s3_client, bucket, prefix, folder_infos['folder_name']): LOGGER.warning('Benchmark results have already been uploaded. 
Exiting.') if github_env: with open(github_env, 'a') as env_file: @@ -121,7 +125,7 @@ def main(): sys.exit(0) upload_results( - aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix, github_env + aws_access_key_id, aws_secret_access_key, folder_infos, s3_client, bucket, prefix, github_env ) From f2cf6e627d1fc2b7d62834278bfa60a61564aa57 Mon Sep 17 00:00:00 2001 From: R-Palazzo Date: Fri, 22 Aug 2025 16:23:37 +0100 Subject: [PATCH 53/53] immediate update 2 --- .../workflows/upload_benchmark_results.yml | 2 +- pyproject.toml | 4 + sdgym/result_writer.py | 57 +++-- .../run_benchmark/upload_benchmark_results.py | 196 +++++++++++++++--- sdgym/sdgym_result_explorer/result_handler.py | 4 + .../test_result_explorer.py | 13 +- .../test_upload_benchmark_result.py | 59 ++++-- .../test_result_handler.py | 3 +- 8 files changed, 270 insertions(+), 68 deletions(-) diff --git a/.github/workflows/upload_benchmark_results.yml b/.github/workflows/upload_benchmark_results.yml index e70d5354..ead247f0 100644 --- a/.github/workflows/upload_benchmark_results.yml +++ b/.github/workflows/upload_benchmark_results.yml @@ -30,6 +30,7 @@ jobs: - name: Upload SDGym Benchmark env: + PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} GITHUB_LOCAL_RESULTS_DIR: ${{ runner.temp }}/sdgym-leaderboard-files @@ -69,7 +70,6 @@ jobs: git checkout gatsby-home git config --local user.name "github-actions[bot]" git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" - git add assets/ git commit -m "Upload SDGym Benchmark Results ($FOLDER_NAME)" || echo "No changes to commit" git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/sdv-dev/sdv-dev.github.io.git diff --git a/pyproject.toml b/pyproject.toml index c0ea5cf1..6651e264 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,10 @@ dependencies = [ 'rdt>=1.17.0', 'sdmetrics>=0.20.1', 'sdv>=1.21.0', + 'openpyxl>=3.0.0', + 'kaleido>=0.2.1', + 'pillow>=9.0.0', + 'pydrive2>=1.3.1' ] [project.urls] diff --git a/sdgym/result_writer.py b/sdgym/result_writer.py index 2daec4fc..2e94aa22 100644 --- a/sdgym/result_writer.py +++ b/sdgym/result_writer.py @@ -6,7 +6,10 @@ from pathlib import Path import pandas as pd +import plotly.graph_objects as go import yaml +from openpyxl import load_workbook +from openpyxl.drawing.image import Image as XLImage from sdgym.s3 import parse_s3_path @@ -30,8 +33,8 @@ def write_yaml(self, data, file_path, append=False): pass -class LocalResultsWriter(ResultsWriter): - """Results writer for local file system.""" +class LocalResultsWriter: + """Local results writer for saving results to the local filesystem.""" def write_dataframe(self, data, file_path, append=False, index=False): """Write a DataFrame to a CSV file.""" @@ -41,22 +44,50 @@ def write_dataframe(self, data, file_path, append=False, index=False): else: data.to_csv(file_path, mode='w', index=index, header=True) - def write_xlsx(self, dataframes, file_path, index=False): - """ - Write DataFrames to an Excel file, updating existing sheets or adding new ones, - while keeping all other sheets intact. 
-        """
-        file_path = Path(file_path)
-
-        if file_path.exists():
-            with pd.ExcelWriter(file_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
-                for sheet_name, df in dataframes.items():
-                    df.to_excel(writer, sheet_name=sheet_name, index=index)
-        else:
-            with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
-                for sheet_name, df in dataframes.items():
-                    df.to_excel(writer, sheet_name=sheet_name, index=index)
-
+ def process_data(self, writer, file_path, temp_images, sheet_name, obj, index=False): + """Process a data item (DataFrame or Figure) and write it to the Excel writer.""" + if isinstance(obj, pd.DataFrame): + obj.to_excel(writer, sheet_name=sheet_name, index=index) + elif isinstance(obj, go.Figure): + img_path = file_path.parent / f'{sheet_name}.png' + obj.write_image(img_path) + temp_images[sheet_name] = img_path + + def write_xlsx(self, data, file_path, index=False): + """Write DataFrames and Plotly figures to an Excel file. + + - DataFrames are saved as tables in their own sheets. + - Plotly figures are exported to PNG and embedded in their own sheets. + - Temporary PNG files are deleted after embedding. + - Newly written sheets are moved to the front. """ file_path = Path(file_path) - + temp_images = {} + file_path.parent.mkdir(parents=True, exist_ok=True) if file_path.exists(): - with pd.ExcelWriter(file_path, mode="a", engine="openpyxl", if_sheet_exists="replace") as writer: - for sheet_name, df in dataframes.items(): - df.to_excel(writer, sheet_name=sheet_name, index=index) + writer = pd.ExcelWriter( + file_path, mode='a', engine='openpyxl', if_sheet_exists='replace' + ) else: - with pd.ExcelWriter(file_path, engine="openpyxl") as writer: - for sheet_name, df in dataframes.items(): - df.to_excel(writer, sheet_name=sheet_name, index=index) + writer = pd.ExcelWriter(file_path, mode='w', engine='openpyxl') + + with writer: + for sheet_name, obj in data.items(): + self.process_data(writer, file_path, temp_images, sheet_name, obj, index=index) + + wb = load_workbook(file_path) + for sheet_name, img_path in temp_images.items(): + ws = wb[sheet_name] if sheet_name in wb.sheetnames else wb.create_sheet(sheet_name) + ws.add_image(XLImage(img_path), 'A1') + + for sheet_name in reversed(data.keys()): + ws = wb[sheet_name] + wb._sheets.remove(ws) + wb._sheets.insert(0, ws) + wb.save(file_path) + for img_path in temp_images.values(): + img_path.unlink(missing_ok=True) def write_pickle(self, obj, file_path): """Write a Python object to a pickle file.""" diff --git a/sdgym/run_benchmark/upload_benchmark_results.py b/sdgym/run_benchmark/upload_benchmark_results.py index 0edf4dcf..be9b6858 100644 --- a/sdgym/run_benchmark/upload_benchmark_results.py +++ b/sdgym/run_benchmark/upload_benchmark_results.py @@ -3,17 +3,38 @@ import json import logging import os +import shutil import sys +import tempfile +from pathlib import Path import boto3 +import numpy as np +import plotly.express as px from botocore.exceptions import ClientError +from oauth2client.client import OAuth2Credentials +from plotly import graph_objects as go +from pydrive2.auth import GoogleAuth +from pydrive2.drive import GoogleDrive +from scipy.interpolate import interp1d -from sdgym.result_writer import LocalResultsWriter, S3ResultsWriter +from sdgym.result_writer import LocalResultsWriter from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, get_df_to_plot from sdgym.s3 import S3_REGION, parse_s3_path from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer LOGGER = logging.getLogger(__name__) +SYNTHESIZER_TO_GLOBAL_POSITION = { + 'CTGAN': 'middle right', + 'TVAE': 'middle left', + 'GaussianCopula': 'bottom center', + 'Uniform': 'top center', + 'Column': 'top center', + 'CopulaGAN': 'top center', + 'RealTabFormer': 'bottom center', +} +SDGYM_FILE_ID = '1W3tsGOOtbtTw3g0EVE0irLgY_TN_cy2W4ONiZQ57OPo' +RESULT_FILENAME = 'SDGym Monthly Run.xlsx' def get_latest_run_from_file(s3_client, bucket, key): @@ -58,13 
+79,128 @@ def get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key) ) folder_infos = get_latest_run_from_file(s3_client, bucket, f'{prefix}_BENCHMARK_DATES.json') - return folder_infos['folder_name'], s3_client, bucket, prefix + return folder_infos, s3_client, bucket, prefix + + +def generate_graph(plot_table): + """Generate a scatter plot for the benchmark results.""" + fig = px.scatter( + plot_table, + x='Aggregated_Time', + y='Quality_Score', + color='Synthesizer', + text='Synthesizer', + title='Mean Quality Score vs Aggregated Time (Over All Datasets)', + labels={'Aggregated_Time': 'Aggregated Time [s]', 'Quality_Score': 'Mean Quality Score'}, + log_x=True, + color_discrete_sequence=px.colors.qualitative.Plotly, + ) + + for trace in fig.data: + synthesizer_name = trace.name + shape = plot_table.loc[plot_table['Synthesizer'] == synthesizer_name, 'Marker'].values[0] + color = plot_table.loc[plot_table['Synthesizer'] == synthesizer_name, 'Color'].values[0] + trace_positions = SYNTHESIZER_TO_GLOBAL_POSITION.get(synthesizer_name, 'top center') + trace.update( + marker=dict(size=14, color=color), textposition=trace_positions, marker_symbol=shape + ) + + fig.update_layout( + xaxis=dict( + tickformat='.0e', + tickmode='array', + tickvals=[1e1, 1e2, 1e3, 1e4, 1e5, 1e6], + ticktext=[ + '101', + '102', + '103', + '104', + '105', + '106', + ], + showgrid=False, + zeroline=False, + title='Aggregated Time [s]', + range=[0.6, 6], + ), + yaxis=dict(showgrid=False, zeroline=False, range=[0.54, 0.92]), + plot_bgcolor='#F5F5F8', + ) + + fig.update_traces(textfont=dict(size=16)) + pareto_points = plot_table.loc[plot_table['Pareto']] + x_pareto = pareto_points['Aggregated_Time'].values + y_pareto = pareto_points['Quality_Score'].values + sorted_indices = np.argsort(x_pareto) + x_sorted = x_pareto[sorted_indices] + y_sorted = y_pareto[sorted_indices] + log_x_sorted = np.log10(x_sorted) + interp = interp1d(log_x_sorted, y_sorted, kind='linear', fill_value='extrapolate') + log_x_fit = np.linspace(0.7, 6, 100) + y_fit = interp(log_x_fit) + x_fit = np.power(10, log_x_fit) + + # Plot smooth interpolation + fig.add_trace( + go.Scatter( + x=x_fit, + y=y_fit, + mode='lines', + name='Pareto Frontier', + line=dict(color='black', width=2), + ) + ) + x_shade = np.concatenate([x_fit, x_fit[::-1]]) + y_shade = np.concatenate([y_fit, np.full_like(x_fit, min(y_fit))[::-1]]) + fig.add_trace( + go.Scatter( + x=x_shade, + y=y_shade, + fill='toself', + fillcolor='rgba(0, 0, 54, 0.25)', + line=dict(color='#000036'), + hoverinfo='skip', + showlegend=False, + ) + ) + + return fig + + +def upload_to_drive(file_path, file_id): + """Upload a local file to a Google Drive folder. + + Args: + file_path (str or Path): Path to the local file to upload. + file_id (str): Google Drive file ID. 
+ """ + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f'File not found: {file_path}') + + creds_dict = json.loads(os.environ['PYDRIVE_CREDENTIALS']) + creds = OAuth2Credentials( + access_token=creds_dict['access_token'], + client_id=creds_dict.get('client_id'), + client_secret=creds_dict.get('client_secret'), + refresh_token=creds_dict.get('refresh_token'), + token_expiry=None, + token_uri='https://oauth2.googleapis.com/token', + user_agent=None, + ) + gauth = GoogleAuth() + gauth.credentials = creds + drive = GoogleDrive(gauth) + + gfile = drive.CreateFile({'id': file_id}) + gfile.SetContentFile(file_path) + gfile.Upload(param={'supportsAllDrives': True}) def upload_results( aws_access_key_id, aws_secret_access_key, folder_infos, s3_client, bucket, prefix, github_env ): - """Upload benchmark results to S3.""" + """Upload benchmark results to S3, GDrive, and save locally.""" folder_name = folder_infos['folder_name'] run_date = folder_infos['date'] result_explorer = SDGymResultsExplorer( @@ -72,7 +208,6 @@ def upload_results( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, ) - result_writer = S3ResultsWriter(s3_client) local_results_writer = LocalResultsWriter() if not result_explorer.all_runs_complete(folder_name): LOGGER.warning(f'Run {folder_name} is not complete yet. Exiting.') @@ -81,31 +216,38 @@ def upload_results( env_file.write('SKIP_UPLOAD=true\n') sys.exit(0) - else: - LOGGER.info(f'Run {folder_name} is complete! Proceeding with summarization...') - if github_env: - with open(github_env, 'a') as env_file: - env_file.write('SKIP_UPLOAD=false\n') - env_file.write(f'FOLDER_NAME={folder_name}\n') + + LOGGER.info(f'Run {folder_name} is complete! Proceeding with summarization...') + if github_env: + with open(github_env, 'a') as env_file: + env_file.write('SKIP_UPLOAD=false\n') + env_file.write(f'FOLDER_NAME={folder_name}\n') summary, results = result_explorer.summarize(folder_name) df_to_plot = get_df_to_plot(results) - result_writer.write_dataframe( - summary, f'{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary.csv', index=True - ) + figure = generate_graph(df_to_plot) local_export_dir = os.environ.get('GITHUB_LOCAL_RESULTS_DIR') - if local_export_dir: - os.makedirs(local_export_dir, exist_ok=True) - datas = { - 'Wins': summary, - f'{run_date}_plot_data': df_to_plot, - f'{run_date}_Detailed_results': results, - } - local_results_writer.write_xlsx( - datas, f'{local_export_dir}/{folder_name}_results.xlsx' - ) + temp_dir = None + if not local_export_dir: + temp_dir = tempfile.mkdtemp() + local_export_dir = temp_dir + os.makedirs(local_export_dir, exist_ok=True) + local_file_path = os.path.join(local_export_dir, RESULT_FILENAME) + s3_key = f'{prefix}{RESULT_FILENAME}' + s3_client.download_file(bucket, s3_key, local_file_path) + datas = { + 'Wins': summary, + f'{run_date}_Detailed_results': results, + f'{run_date}_plot_data': df_to_plot, + f'{run_date}_plot_image': figure, + } + local_results_writer.write_xlsx(datas, local_file_path) + upload_to_drive(local_file_path, SDGYM_FILE_ID) + s3_client.upload_file(local_file_path, bucket, s3_key) write_uploaded_marker(s3_client, bucket, prefix, folder_name) + if temp_dir: + shutil.rmtree(temp_dir) def main(): @@ -125,7 +267,13 @@ def main(): sys.exit(0) upload_results( - aws_access_key_id, aws_secret_access_key, folder_infos, s3_client, bucket, prefix, github_env + aws_access_key_id, + aws_secret_access_key, + folder_infos, + s3_client, + bucket, + prefix, 
+ github_env, ) diff --git a/sdgym/sdgym_result_explorer/result_handler.py b/sdgym/sdgym_result_explorer/result_handler.py index 52808319..c8f4073f 100644 --- a/sdgym/sdgym_result_explorer/result_handler.py +++ b/sdgym/sdgym_result_explorer/result_handler.py @@ -86,6 +86,9 @@ def _get_summarize_table(self, folder_to_results, folder_infos): summarized_results[column_name] = column_data summarized_results = summarized_results.fillna('-') + summarized_results = summarized_results.reset_index() + summarized_results = summarized_results.rename(columns={'index': 'Synthesizer'}) + return summarized_results def _get_column_name_infos(self, folder_to_results): @@ -121,6 +124,7 @@ def _process_results(self, results): 'summarize results.' ) + filtered_results = filtered_results.sort_values(by=['Dataset', 'Synthesizer']) return filtered_results.reset_index(drop=True) def summarize(self, folder_name): diff --git a/tests/integration/sdgym_result_explorer/test_result_explorer.py b/tests/integration/sdgym_result_explorer/test_result_explorer.py index 2a10270e..f56fd346 100644 --- a/tests/integration/sdgym_result_explorer/test_result_explorer.py +++ b/tests/integration/sdgym_result_explorer/test_result_explorer.py @@ -58,16 +58,19 @@ def test_summarize(): # Assert expected_summary = pd.DataFrame({ + 'Synthesizer': ['CTGANSynthesizer', 'CopulaGANSynthesizer', 'TVAESynthesizer'], '10_11_2024 - # datasets: 9 - sdgym version: 0.9.1': [6, 4, 5], '05_10_2024 - # datasets: 9 - sdgym version: 0.8.0': [4, 4, 5], '04_05_2024 - # datasets: 9 - sdgym version: 0.7.0': [5, 3, 5], - 'Synthesizer': ['CTGANSynthesizer', 'CopulaGANSynthesizer', 'TVAESynthesizer'], }) - expected_results = pd.read_csv( - 'tests/integration/sdgym_result_explorer/_benchmark_results/' - 'SDGym_results_10_11_2024/results_10_11_2024_1.csv', + expected_results = ( + pd.read_csv( + 'tests/integration/sdgym_result_explorer/_benchmark_results/' + 'SDGym_results_10_11_2024/results_10_11_2024_1.csv', + ) + .sort_values(by=['Dataset', 'Synthesizer']) + .reset_index(drop=True) ) expected_results['Win'] = expected_results['Win'].astype('int64') - expected_summary = expected_summary.set_index('Synthesizer') pd.testing.assert_frame_equal(summary, expected_summary) pd.testing.assert_frame_equal(results, expected_results) diff --git a/tests/unit/run_benchmark/test_upload_benchmark_result.py b/tests/unit/run_benchmark/test_upload_benchmark_result.py index b8059b31..63c18c41 100644 --- a/tests/unit/run_benchmark/test_upload_benchmark_result.py +++ b/tests/unit/run_benchmark/test_upload_benchmark_result.py @@ -1,9 +1,10 @@ -from unittest.mock import Mock, call, patch +from unittest.mock import Mock, patch import pytest from botocore.exceptions import ClientError from sdgym.run_benchmark.upload_benchmark_results import ( + SDGYM_FILE_ID, get_result_folder_name_and_s3_vars, main, upload_already_done, @@ -97,29 +98,32 @@ def test_get_result_folder_name_and_s3_vars( @patch('sdgym.run_benchmark.upload_benchmark_results.SDGymResultsExplorer') -@patch('sdgym.run_benchmark.upload_benchmark_results.S3ResultsWriter') @patch('sdgym.run_benchmark.upload_benchmark_results.write_uploaded_marker') @patch('sdgym.run_benchmark.upload_benchmark_results.LOGGER') @patch('sdgym.run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') @patch('sdgym.run_benchmark.upload_benchmark_results.LocalResultsWriter') @patch('sdgym.run_benchmark.upload_benchmark_results.os.environ.get') @patch('sdgym.run_benchmark.upload_benchmark_results.get_df_to_plot') 
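+# generate_graph and upload_to_drive are stubbed out below so this test only
+# exercises the orchestration in upload_results, not plotting or the Drive API.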
+@patch('sdgym.run_benchmark.upload_benchmark_results.generate_graph') +@patch('sdgym.run_benchmark.upload_benchmark_results.upload_to_drive') def test_upload_results( + mock_upload_to_drive, + mock_generate_graph, mock_get_df_to_plot, mock_os_environ_get, mock_local_results_writer, mock_output_destination_aws, mock_logger, mock_write_uploaded_marker, - mock_s3_results_writer, mock_sdgym_results_explorer, ): """Test the `upload_results` method.""" # Setup aws_access_key_id = 'my_access_key' aws_secret_access_key = 'my_secret_key' - run_name = 'SDGym_results_10_01_2023' - s3_client = 's3_client' + folder_infos = {'folder_name': 'SDGym_results_10_01_2023', 'date': '10_01_2023'} + run_name = folder_infos['folder_name'] + s3_client = Mock() bucket = 'bucket' prefix = 'prefix' result_explorer_instance = mock_sdgym_results_explorer.return_value @@ -127,12 +131,19 @@ def test_upload_results( result_explorer_instance.summarize.return_value = ('summary', 'results') mock_os_environ_get.return_value = '/tmp/sdgym_results' mock_get_df_to_plot.return_value = 'df_to_plot' + mock_generate_graph.return_value = 'plot_image' + datas = { + 'Wins': 'summary', + '10_01_2023_Detailed_results': 'results', + '10_01_2023_plot_data': 'df_to_plot', + '10_01_2023_plot_image': 'plot_image', + } # Run upload_results( aws_access_key_id, aws_secret_access_key, - run_name, + folder_infos, s3_client, bucket, prefix, @@ -140,6 +151,10 @@ def test_upload_results( ) # Assert + mock_upload_to_drive.assert_called_once_with( + '/tmp/sdgym_results/SDGym Monthly Run.xlsx', SDGYM_FILE_ID + ) + mock_generate_graph.assert_called_once() mock_logger.info.assert_called_once_with( f'Run {run_name} is complete! Proceeding with summarization...' ) @@ -150,19 +165,14 @@ def test_upload_results( ) result_explorer_instance.all_runs_complete.assert_called_once_with(run_name) result_explorer_instance.summarize.assert_called_once_with(run_name) - mock_s3_results_writer.return_value.write_dataframe.assert_called_once() mock_write_uploaded_marker.assert_called_once_with(s3_client, bucket, prefix, run_name) - mock_local_results_writer.return_value.write_dataframe.assert_has_calls([ - call('summary', '/tmp/sdgym_results/SDGym_results_10_01_2023_summary.csv', index=True), - call( - 'df_to_plot', '/tmp/sdgym_results/SDGym_results_10_01_2023_plot_data.csv', index=False - ), - ]) + mock_local_results_writer.return_value.write_xlsx.assert_called_once_with( + datas, '/tmp/sdgym_results/SDGym Monthly Run.xlsx' + ) mock_get_df_to_plot.assert_called_once_with('results') @patch('sdgym.run_benchmark.upload_benchmark_results.SDGymResultsExplorer') -@patch('sdgym.run_benchmark.upload_benchmark_results.S3ResultsWriter') @patch('sdgym.run_benchmark.upload_benchmark_results.write_uploaded_marker') @patch('sdgym.run_benchmark.upload_benchmark_results.LOGGER') @patch('sdgym.run_benchmark.upload_benchmark_results.OUTPUT_DESTINATION_AWS') @@ -170,15 +180,15 @@ def test_upload_results_not_all_runs_complete( mock_output_destination_aws, mock_logger, mock_write_uploaded_marker, - mock_s3_results_writer, mock_sdgym_results_explorer, ): """Test the `upload_results` when not all runs are complete.""" # Setup aws_access_key_id = 'my_access_key' aws_secret_access_key = 'my_secret_key' - run_name = 'SDGym_results_10_01_2023' - s3_client = 's3_client' + folder_infos = {'folder_name': 'SDGym_results_10_01_2023', 'date': '10_01_2023'} + run_name = folder_infos['folder_name'] + s3_client = Mock() bucket = 'bucket' prefix = 'prefix' result_explorer_instance = 
mock_sdgym_results_explorer.return_value @@ -190,7 +200,7 @@ def test_upload_results_not_all_runs_complete( upload_results( aws_access_key_id, aws_secret_access_key, - run_name, + folder_infos, s3_client, bucket, prefix, @@ -206,7 +216,6 @@ def test_upload_results_not_all_runs_complete( ) result_explorer_instance.all_runs_complete.assert_called_once_with(run_name) result_explorer_instance.summarize.assert_not_called() - mock_s3_results_writer.return_value.write_dataframe.assert_not_called() mock_write_uploaded_marker.assert_not_called() @@ -225,8 +234,9 @@ def test_main_already_upload( """Test the `method` when results are already uploaded.""" # Setup mock_getenv.side_effect = ['my_access_key', 'my_secret_key', None] + folder_infos = {'folder_name': 'SDGym_results_10_01_2023', 'date': '10_01_2023'} mock_get_result_folder_name_and_s3_vars.return_value = ( - 'run_name', + folder_infos, 's3_client', 'bucket', 'prefix', @@ -259,8 +269,9 @@ def test_main( """Test the `main` method.""" # Setup mock_getenv.side_effect = ['my_access_key', 'my_secret_key', None] + folder_infos = {'folder_name': 'SDGym_results_10_11_2024', 'date': '10_11_2024'} mock_get_result_folder_name_and_s3_vars.return_value = ( - 'run_name', + folder_infos, 's3_client', 'bucket', 'prefix', @@ -274,7 +285,9 @@ def test_main( mock_get_result_folder_name_and_s3_vars.assert_called_once_with( 'my_access_key', 'my_secret_key' ) - mock_upload_already_done.assert_called_once_with('s3_client', 'bucket', 'prefix', 'run_name') + mock_upload_already_done.assert_called_once_with( + 's3_client', 'bucket', 'prefix', folder_infos['folder_name'] + ) mock_upload_results.assert_called_once_with( - 'my_access_key', 'my_secret_key', 'run_name', 's3_client', 'bucket', 'prefix', None + 'my_access_key', 'my_secret_key', folder_infos, 's3_client', 'bucket', 'prefix', None ) diff --git a/tests/unit/sdgym_result_explorer/test_result_handler.py b/tests/unit/sdgym_result_explorer/test_result_handler.py index 96cbc4c8..4a3ef0cf 100644 --- a/tests/unit/sdgym_result_explorer/test_result_handler.py +++ b/tests/unit/sdgym_result_explorer/test_result_handler.py @@ -64,10 +64,9 @@ def test__get_summarize_table(self): # Assert expected_summary = pd.DataFrame({ - '07_15_2025 - # datasets: 3 - sdgym version: 0.9.0': [2, 1], 'Synthesizer': ['Synth1', 'Synth2'], + '07_15_2025 - # datasets: 3 - sdgym version: 0.9.0': [2, 1], }) - expected_summary = expected_summary.set_index('Synthesizer') pd.testing.assert_frame_equal(result, expected_summary) def test_get_column_name_infos(self):
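
Two standalone sketches of the new logic above, for readers who want to try it outside the benchmark pipeline. Both are illustrations only: the toy values, sheet names, and file names are assumptions, not part of the patches.

generate_graph interpolates the Pareto frontier in log10(time) space so the curve stays smooth on the logarithmic x axis, then maps it back with 10**x:

    import numpy as np
    import pandas as pd
    from scipy.interpolate import interp1d

    # Toy stand-in for the table produced by get_df_to_plot.
    plot_table = pd.DataFrame({
        'Synthesizer': ['Uniform', 'GaussianCopula', 'CTGAN'],
        'Aggregated_Time': [12.0, 350.0, 41000.0],
        'Quality_Score': [0.58, 0.82, 0.86],
        'Pareto': [True, True, True],
    })

    pareto = plot_table.loc[plot_table['Pareto']]
    order = np.argsort(pareto['Aggregated_Time'].values)
    x_sorted = pareto['Aggregated_Time'].values[order]
    y_sorted = pareto['Quality_Score'].values[order]

    # Linear interpolation in log-time, extrapolated across the plotted range.
    interp = interp1d(np.log10(x_sorted), y_sorted, kind='linear', fill_value='extrapolate')
    log_x_fit = np.linspace(0.7, 6, 100)
    x_fit = np.power(10, log_x_fit)
    y_fit = interp(log_x_fit)

LocalResultsWriter.write_xlsx takes a dict mapping sheet names to either DataFrames or Plotly figures; figures are rendered to PNG (which needs the kaleido dependency added in pyproject.toml) and embedded with openpyxl, and every written sheet is moved to the front of the workbook. A minimal usage sketch:

    import pandas as pd
    import plotly.express as px

    from sdgym.result_writer import LocalResultsWriter

    writer = LocalResultsWriter()
    data = {
        'Wins': pd.DataFrame({'Synthesizer': ['A', 'B'], 'Wins': [3, 1]}),
        '10_01_2023_plot_image': px.scatter(x=[1, 2], y=[3, 4]),
    }
    # Existing sheets with these names are replaced; all other sheets are kept.
    writer.write_xlsx(data, 'SDGym Monthly Run.xlsx')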