diff --git a/commit0/harness/build.py b/commit0/harness/build.py
index 216a10c..b94a910 100644
--- a/commit0/harness/build.py
+++ b/commit0/harness/build.py
@@ -45,7 +45,7 @@ def main(
         repo_name = example["repo"].split("/")[-1]
         if split != "all" and repo_name not in SPLIT[split]:
             continue
-        spec = make_spec(example, dataset_type)
+        spec = make_spec(example, dataset_type, absolute=True)
         specs.append(spec)
 
     client = docker.from_env()
diff --git a/commit0/harness/constants.py b/commit0/harness/constants.py
index f7a86f2..c66834b 100644
--- a/commit0/harness/constants.py
+++ b/commit0/harness/constants.py
@@ -64,7 +64,11 @@ def items(self) -> ItemsView[str, object]:
 PASS_TO_FAIL = "PASS_TO_FAIL"
 
 # Evaluation backends
-EVAL_BACKENDS = ["local", "modal"]
+EVAL_BACKENDS = ["local", "modal", "e2b"]
+# Use an absolute repo path for docker and modal, backends with sudo access
+ABSOLUTE_REPO_DIR = "/testbed"
+# Use a relative repo path for e2b, which has no sudo access
+RELATIVE_REPO_DIR = "testbed"
 
 # available commands
 COMMANDS = [
diff --git a/commit0/harness/docker_build.py b/commit0/harness/docker_build.py
index 1c9944a..93669d4 100644
--- a/commit0/harness/docker_build.py
+++ b/commit0/harness/docker_build.py
@@ -125,7 +125,7 @@ def build_base_images(
 
     """
     # Get the base images to build from the dataset
-    test_specs = get_specs_from_dataset(dataset, dataset_type)
+    test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)
     base_images = {
         x.base_image_key: (x.base_dockerfile, x.platform) for x in test_specs
     }
@@ -166,7 +166,7 @@ def get_repo_configs_to_build(
 
     """
     image_scripts = dict()
-    test_specs = get_specs_from_dataset(dataset, dataset_type)
+    test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)
 
     for test_spec in test_specs:
         # Check if the base image exists
diff --git a/commit0/harness/execution_context.py b/commit0/harness/execution_context.py
index b6b0893..5047119 100644
--- a/commit0/harness/execution_context.py
+++ b/commit0/harness/execution_context.py
@@ -10,6 +10,7 @@
 import modal
 import modal.io_streams
 from enum import auto
+from e2b_code_interpreter import Sandbox
 from strenum import StrEnum
 from pathlib import Path
 import time
@@ -33,6 +34,7 @@
 class ExecutionBackend(StrEnum):
     LOCAL = auto()
     MODAL = auto()
+    E2B = auto()
 
 
 class ExecutionContext(ABC):
@@ -219,3 +221,69 @@ def __exit__(
         exctb: Optional[TracebackType],
     ) -> None:
         close_logger(self.logger)
+
+
+class E2B(ExecutionContext):
+    def __init__(
+        self,
+        spec: Spec,
+        logger: logging.Logger,
+        timeout: int,
+        num_cpus: int,
+        log_dir: Path,
+        files_to_copy: Optional[Files] = None,
+        files_to_collect: Optional[list[str]] = None,
+        rebuild_image: bool = False,
+    ):
+        super().__init__(
+            spec,
+            logger,
+            timeout,
+            num_cpus,
+            log_dir,
+            files_to_copy=files_to_copy,
+            files_to_collect=files_to_collect,
+        )
+
+        self.sb = Sandbox(timeout=timeout)
+        self.sb.commands.run("curl -LsSf https://astral.sh/uv/install.sh | sh")
+
+        # set up the sandbox environment
+        self.sb.files.write("setup.sh", spec.setup_script)
+        self.sb.commands.run("bash setup.sh")
+
+        # copy files needed for evaluation into the sandbox
+        if files_to_copy:
+            for _, f in files_to_copy.items():
+                with open(f["src"], "r") as fp:  # type: ignore
+                    content = fp.read()
+                self.sb.files.write(f["dest"].name, content)  # type: ignore
+
+    def exec_run_with_timeout(self, command: str) -> tuple[str, bool, float]:
+        """Execute a command in the E2B sandbox.
+
+        Timeouts could be detected via the command's exit code
+        (result.exit_code) or by checking whether the sandbox is
+        still alive.
+
+        For now, we simply check whether the sandbox is still alive.
+        """
+        # TODO: enforce a proper per-command timeout
+        start_time = time.time()
+        result = self.sb.commands.run(command, timeout=0)
+        if self.files_to_collect is not None:
+            for fname in self.files_to_collect:
+                with (self.log_dir / fname).open("w") as f:
+                    f.write(self.sb.files.read(f"testbed/{fname}"))
+        timed_out = self.sb.is_running()
+        end_time = time.time()
+        return result.stderr, timed_out, end_time - start_time
+
+    def __exit__(
+        self,
+        exctype: Optional[Type[BaseException]],
+        excinst: Optional[BaseException],
+        exctb: Optional[TracebackType],
+    ) -> None:
+        self.sb.kill()
+        close_logger(self.logger)
diff --git a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py
index 06857df..54c7047 100644
--- a/commit0/harness/run_pytest_ids.py
+++ b/commit0/harness/run_pytest_ids.py
@@ -26,6 +26,7 @@
     ExecutionBackend,
     Docker,
     Modal,
+    E2B,
 )
 
 
@@ -52,6 +53,7 @@ def main(
         dataset_name, split=dataset_split
     )  # type: ignore
     dataset_name = dataset_name.lower()
+    absolute = backend != "e2b"
     spec = None
    example = None
     repo_name = None
@@ -76,7 +78,7 @@ def main(
         if repo_name in os.path.basename(repo_or_repo_dir) or repo_or_repo_dir.endswith(
             repo_name
         ):
-            spec = make_spec(example, dataset_type)
+            spec = make_spec(example, dataset_type, absolute)
             break
     assert spec is not None, "No spec available"
     assert example is not None, "No example available"
@@ -187,19 +189,28 @@ def main(
 
     backend = backend.upper()
     if ExecutionBackend(backend) == ExecutionBackend.MODAL:
-        logger.info("Runnning on Modal")
+        logger.info("Running on Modal")
         execution_context = Modal
     elif ExecutionBackend(backend) == ExecutionBackend.LOCAL:
-        logger.info("Runnning locally")
+        logger.info("Running locally")
         execution_context = Docker
+    elif ExecutionBackend(backend) == ExecutionBackend.E2B:
+        logger.info("Running on E2B")
+        execution_context = E2B
     else:
         raise ValueError(
             f"Evaluation must be from {', '.join(EVAL_BACKENDS)}, but {backend} is provided."
         )
 
     files_to_copy = Files(
-        eval_script={"src": eval_file, "dest": Path("/eval.sh")},
-        patch={"src": patch_file, "dest": Path("/patch.diff")},
+        eval_script={
+            "src": eval_file,
+            "dest": Path("/eval.sh" if absolute else "eval.sh"),
+        },
+        patch={
+            "src": patch_file,
+            "dest": Path("/patch.diff" if absolute else "patch.diff"),
+        },
     )
     files_to_collect = [
         "report.json",
@@ -209,6 +220,11 @@ def main(
     if coverage:
         files_to_collect.append("coverage.json")
 
+    eval_command = (
+        "/bin/bash /eval.sh"
+        if ExecutionBackend(backend) != ExecutionBackend.E2B
+        else "/bin/bash eval.sh"
+    )
     try:
         with execution_context(
             spec,
@@ -221,7 +237,7 @@ def main(
             rebuild_image,
         ) as context:
             output, timed_out, total_runtime = context.exec_run_with_timeout(
-                "/bin/bash /eval.sh"
+                eval_command
             )
             logger.info(output)
             if timed_out:
diff --git a/commit0/harness/spec.py b/commit0/harness/spec.py
index 77cbc15..7a71118 100644
--- a/commit0/harness/spec.py
+++ b/commit0/harness/spec.py
@@ -4,6 +4,8 @@
 from typing import Union, cast, Optional
 
 from commit0.harness.constants import (
+    ABSOLUTE_REPO_DIR,
+    RELATIVE_REPO_DIR,
     RepoInstance,
     SimpleInstance,
 )
@@ -17,6 +19,7 @@
 class Spec(ABC):
     """A dataclass that represents a test specification for a single instance of SWE-bench."""
 
+    absolute: bool
     repo: str
     # repo dir on docker
     repo_directory: str
@@ -164,11 +167,12 @@ def make_repo_script_list(self) -> list[str]:
 
     def make_eval_script_list(self) -> list[str]:
         """Run the tests."""
+        diff_path = "/patch.diff" if self.absolute else "../patch.diff"
         eval_script_list = [
             f"cd {self.repo_directory}",
             "source .venv/bin/activate",
             f"git reset --hard {self.instance['base_commit']}",
-            "git apply --allow-empty -v /patch.diff",
+            f"git apply --allow-empty -v {diff_path}",
             "git status",
             f"{self.instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors{{coverage}} {{test_ids}} > test_output.txt 2>&1",
             "echo $? > pytest_exit_code.txt",
@@ -306,39 +310,45 @@ def make_eval_script_list(self) -> list[str]:
 def get_specs_from_dataset(
     dataset: Union[list[Union[RepoInstance, SimpleInstance]], list[Spec]],
     dataset_type: str,
+    absolute: bool,
 ) -> list[Spec]:
     """Idempotent function that converts a list of RepoInstance objects to a list of Spec objects."""
     if isinstance(dataset[0], Spec):
         return cast(list[Spec], dataset)
     return list(
         map(
-            lambda instance: make_spec(instance, dataset_type),
+            lambda instance: make_spec(instance, dataset_type, absolute),
             cast(list["RepoInstance"], dataset),
         )
     )
 
 
-def make_spec(instance: Union[RepoInstance, SimpleInstance], dataset_type: str) -> Spec:
+def make_spec(
+    instance: Union[RepoInstance, SimpleInstance], dataset_type: str, absolute: bool
+) -> Spec:
+    repo_directory = ABSOLUTE_REPO_DIR if absolute else RELATIVE_REPO_DIR
     if isinstance(instance, Spec):
         return instance
-    repo_directory = "/testbed"
     if dataset_type == "commit0":
         return Commit0Spec(
             repo=instance["instance_id"],
             repo_directory=repo_directory,
             instance=instance,
+            absolute=absolute,
         )
     elif dataset_type == "swebench":
         return SWEBenchSpec(
             repo=instance["instance_id"],
             repo_directory=repo_directory,
            instance=instance,
+            absolute=absolute,
         )
     elif dataset_type == "simple":
         return SimpleSpec(
             repo="simple",  # all benchmarks with mere function writing will share the simple docker image
             repo_directory=repo_directory,
             instance=instance,
+            absolute=absolute,
         )
     else:
         raise NotImplementedError(
diff --git a/pyproject.toml b/pyproject.toml
index 33bdac4..7eabe00 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,8 @@ dependencies = [
     "datasets==3.0.1",
     "modal>=0.66.26",
     "strenum>=0.4.15",
+    "e2b-code-interpreter>=1.0.4",
+    "python-dotenv>=1.0.1",
 ]
 classifiers = [
     "License :: OSI Approved :: MIT License",
diff --git a/uv.lock b/uv.lock
index 3b9ed42..e5680b4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -427,12 +427,14 @@ source = { editable = "." }
 dependencies = [
     { name = "datasets" },
     { name = "docker" },
+    { name = "e2b-code-interpreter" },
     { name = "fastcore" },
     { name = "ghapi" },
     { name = "gitpython" },
     { name = "modal" },
     { name = "pre-commit" },
     { name = "pytest" },
+    { name = "python-dotenv" },
     { name = "ruff" },
     { name = "strenum" },
     { name = "typer" },
@@ -450,6 +452,7 @@ requires-dist = [
     { name = "aider-chat", marker = "extra == 'agent'", git = "https://github.com/wenting-zhao/aider.git" },
     { name = "datasets", specifier = "==3.0.1" },
     { name = "docker", specifier = ">=7.1.0" },
+    { name = "e2b-code-interpreter", specifier = ">=1.0.4" },
     { name = "fastcore", specifier = ">=1.7.8" },
     { name = "ghapi", specifier = ">=1.0.6" },
     { name = "gitpython", specifier = ">=3.1.43" },
@@ -458,6 +461,7 @@ requires-dist = [
     { name = "pre-commit", specifier = ">=3.8.0" },
     { name = "pymupdf", marker = "extra == 'agent'", specifier = ">=1.24.5" },
     { name = "pytest", specifier = ">=8.3.3" },
+    { name = "python-dotenv", specifier = ">=1.0.1" },
     { name = "ruff", specifier = ">=0.6.4" },
     { name = "strenum", specifier = ">=0.4.15" },
     { name = "typer", specifier = ">=0.12.0" },
@@ -556,6 +560,38 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774 },
 ]
 
+[[package]]
+name = "e2b"
+version = "1.0.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "attrs" },
+    { name = "httpcore" },
+    { name = "httpx" },
+    { name = "packaging" },
+    { name = "protobuf" },
+    { name = "python-dateutil" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/28/c05fe7a49005e2e98017941c05df15e2b096e8d57c1abcf2fca05e11abef/e2b-1.0.5.tar.gz", hash = "sha256:43c82705af7b7d4415c2510ff77dab4dc075351e0b769d6adf8e0d7bb4868d13", size = 44374 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/92/80/35a7050f011f603599ce3d579fe3a5f424c9256574e132f4b75260d9ffb5/e2b-1.0.5-py3-none-any.whl", hash = "sha256:a71bdec46f33d3e38e87d475d7fd2939bd7b6b753b819c9639ca211cd375b79e", size = 81717 },
+]
+
+[[package]]
+name = "e2b-code-interpreter"
+version = "1.0.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "attrs" },
+    { name = "e2b" },
+    { name = "httpx" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9a/0e/95be4c53ee2fa6879d517ce7bca1656249e6bcdc377422b19ad636c59748/e2b_code_interpreter-1.0.4.tar.gz", hash = "sha256:fec5651d98ca0d03dd038c5df943a0beaeb59c6d422112356f55f2b662d8dea1", size = 9273 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/92/99/ce5c3953db2818976a640bce5af2dbc347fae0b00b105728b6f110a696e5/e2b_code_interpreter-1.0.4-py3-none-any.whl", hash = "sha256:e8cea4946b3457072a524250aee712f7f8d44834b91cd9c13da3bdf96eda1a6e", size = 12062 },
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.2"
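
Reviewer note (not part of the patch): the new E2B execution context reduces to the sandbox lifecycle sketched below. The sketch only uses e2b-code-interpreter calls that already appear in the diff above; the load_dotenv/E2B_API_KEY handling is an assumption inferred from the newly added python-dotenv dependency, and the inline eval.sh is a stand-in for the setup/eval scripts the harness actually generates.

# Minimal sketch of the sandbox lifecycle behind the E2B execution context (assumptions noted above).
from dotenv import load_dotenv
from e2b_code_interpreter import Sandbox

load_dotenv()  # assumed: E2B_API_KEY is supplied via a .env file (hence the python-dotenv dependency)

sb = Sandbox(timeout=1800)  # sandbox lifetime in seconds, as passed in by the harness
sb.commands.run("curl -LsSf https://astral.sh/uv/install.sh | sh")  # install uv, mirroring E2B.__init__
# Stand-in for the generated scripts; the real setup.sh/eval.sh clone the repo and run pytest.
sb.files.write("eval.sh", "mkdir -p testbed && echo '{}' > testbed/report.json")
result = sb.commands.run("/bin/bash eval.sh", timeout=0)  # timeout=0 as in the patch; proper timeouts are a TODO
print(result.exit_code, result.stderr)
report = sb.files.read("testbed/report.json")  # collect result files out of the relative testbed dir
sb.kill()  # tear the sandbox down, as E2B.__exit__ does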