Add batch_test CLI command and STaR test-accuracy script (examples/star/test.py, test.sh)

wenting-zhao · wenting-zhao · commit 31b4919ef68b · 2024-12-27T22:46:00.000-05:00
diff --git a/commit0/cli.py b/commit0/cli.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Union, List
 from typing_extensions import Annotated
+import commit0.harness.batch_run_pytest_ids
 import commit0.harness.run_pytest_ids
 import commit0.harness.get_pytest_ids
 import commit0.harness.build
@@ -300,6 +301,49 @@ def test(
     )
 
 
+@commit0_app.command()
+def batch_test(  # CLI command: forwards its options to commit0.harness.batch_run_pytest_ids.main
+    test_ids: str = typer.Argument(
+        None,
+        help='All ways pytest supports to run and select tests. Please provide a single string. Example: "test_mod.py", "testing/", "test_mod.py::test_func", "-k \'MyClass and not method\'"',
+    ),
+    backend: str = typer.Option("modal", help="Backend to use for testing"),
+    timeout: int = typer.Option(1800, help="Timeout for tests in seconds"),
+    num_cpus: int = typer.Option(1, help="Number of CPUs to use"),
+    reference: Annotated[
+        bool, typer.Option("--reference", help="Test the reference commit")
+    ] = False,
+    coverage: Annotated[
+        bool, typer.Option("--coverage", help="Whether to get coverage information")
+    ] = False,
+    rebuild: bool = typer.Option(
+        False, "--rebuild", help="Whether to rebuild an image"
+    ),
+    commit0_config_file: str = typer.Option(  # NOTE(review): accepted but never used in this function - confirm intended
+        ".commit0.yaml",
+        help="Path to the commit0 dot file, where the setup config is stored",
+    ),
+    verbose: int = typer.Option(
+        1,
+        "--verbose",
+        "-v",
+        help="Set this to 2 for more logging information",
+        count=True,
+    ),
+) -> None:
+    """Run tests on a Commit0 repository."""
+    commit0.harness.batch_run_pytest_ids.main(  # positional call: order here must match main()'s parameter order
+        test_ids,
+        reference,
+        coverage,
+        backend,
+        timeout,
+        num_cpus,
+        rebuild,
+        verbose,
+    )
+
+
 @commit0_app.command()
 def evaluate(
     branch: Union[str, None] = typer.Option(
diff --git a/examples/star/run.sh b/examples/star/run.sh
@@ -1,7 +1,7 @@
 python examples/star/star.py \
   --model_name_or_path meta-llama/Llama-3.1-8B-Instruct \
   --dataset_name commit0/mbpp \
-  -n 10 \
+  -n 100 \
   --output_dir outputs \
   --low_cpu_mem_usage \
   --with_tracking \
@@ -10,5 +10,6 @@ python examples/star/star.py \
   --learning_rate 1e-6 \
   --per_device_train_batch_size 1 \
   --gradient_accumulation_steps 8 \
-  --max_workers 64
+  --max_workers 64 \
+  --temperature 1
 
diff --git a/examples/star/test.py b/examples/star/test.py
@@ -0,0 +1,44 @@
+"""Get test accuracy"""
+
+from datasets import load_dataset
+from examples.star.inference import generate_predictions
+from examples.star.utils import (
+    execute_tests,
+    generate_prompt,
+    parse_args,
+)
+
+
+def main() -> None:
+    """Sample on the test split, run execute_tests on the samples, and print the pass rate."""
+    args = parse_args()
+    ds = load_dataset(args.dataset_name, args.dataset_config_name)["test"]
+    model_name = args.model_name_or_path
+
+    # sample; check the count first so a mismatch fails here with a clear message
+    all_samples = generate_predictions(model_name, ds, args.temperature, args.n)
+    assert len(ds) == len(all_samples), "expected one sample list per example"
+    ds.add_column(name="sample", column=all_samples).to_json(
+        f"{args.output_dir}/data/{model_name.split('/')[-1]}-test-samples.json"
+    )
+
+    # execute the tests for every sample; the traces are not needed for scoring
+    _, all_execution_results = execute_tests(
+        ds, all_samples, max_workers=args.max_workers
+    )
+    passed = total = 0
+    for execution_results, samples in zip(all_execution_results, all_samples):
+        for execution_result, _ in zip(execution_results, samples):
+            total += 1
+            # pytest exit code: https://docs.pytest.org/en/stable/reference/exit-codes.html
+            if execution_result == 0:
+                passed += 1
+    # Divide by executed samples, not examples: when an example has more than one
+    # sample, passed/len(ds) could exceed 1. With nothing executed, report 0.0.
+    print(f"passed: {passed / total if total else 0.0}")
+
+if __name__ == "__main__":  # run the evaluation only when executed as a script
+    main()
+
+
+__all__ = []  # empty export list: `from ... import *` on this module brings in nothing
diff --git a/examples/star/test.sh b/examples/star/test.sh
@@ -0,0 +1,8 @@
+# Usage: test.sh MODEL_NAME_OR_PATH N   ($1 -> --model_name_or_path, $2 -> -n)
+python examples/star/test.py \
+  --model_name_or_path "$1" \
+  --dataset_name commit0/mbpp \
+  -n "$2" \
+  --output_dir outputs \
+  --max_workers 64 \
+  --temperature 0