import json
import re
from unittest.mock import patch

import pytest
from pydantic import BaseModel

from minisweagent import package_dir
from minisweagent.models.test_models import DeterministicModel, make_output
from minisweagent.run.benchmarks.swebench import (
    filter_instances,
    get_swebench_docker_image_name,
    main,
    remove_from_preds_file,
    update_preds_file,
)


def _make_model_from_fixture(text_outputs: list[str], cost_per_call: float = 1.0, **kwargs) -> DeterministicModel:
    """Create a DeterministicModel from trajectory fixture data (raw text outputs)."""

    def parse_command(text: str) -> list[dict]:
        match = re.search(r"```mswea_bash_command\s*\n(.*?)\n```", text, re.DOTALL)
        return [{"command": match.group(1)}] if match else []

    return DeterministicModel(
        outputs=[make_output(text, parse_command(text), cost=cost_per_call) for text in text_outputs],
        cost_per_call=cost_per_call,
        **kwargs,
    )


@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_swebench_end_to_end(github_test_data, tmp_path, workers):
    """Test the complete SWEBench flow using the _test subset with deterministic model"""
    model_responses = github_test_data["model_responses"]
    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        # Use side_effect to create a new model instance for each worker
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(model_responses, cost_per_call=0.1)
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=workers,
            filter_spec="swe-agent__test-repo-1",
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    traj_file_path = package_dir.parent.parent / "tests" / "test_data" / "github_issue.traj.json"
    trajectory = json.loads(traj_file_path.read_text())
    last_message = trajectory[-1]["content"]

    instance_id = "swe-agent__test-repo-1"
    expected_result = {
        instance_id: {
            "model_name_or_path": "deterministic",
            "instance_id": instance_id,
            "model_patch": last_message,
        }
    }
    with open(tmp_path / "preds.json") as f:
        actual_result = json.load(f)
    assert actual_result == expected_result

    traj_output_file = tmp_path / instance_id / f"{instance_id}.traj.json"
    output_trajectory = json.loads(traj_output_file.read_text())
    assert output_trajectory["messages"][-1]["content"] == last_message


def test_get_image_name_with_existing_image_name():
    """Test get_image_name when image_name is already provided"""
    instance = {"image_name": "custom/image:tag", "instance_id": "test__repo__1"}
    assert get_swebench_docker_image_name(instance) == "custom/image:tag"


def test_get_image_name_without_image_name():
    """Test get_image_name when image_name needs to be constructed"""
    instance = {"instance_id": "swe-agent__test-repo__1"}
    expected = "docker.io/swebench/sweb.eval.x86_64.swe-agent_1776_test-repo_1776_1:latest"
    assert get_swebench_docker_image_name(instance) == expected


def test_get_image_name_with_none_image_name():
    """Test get_image_name when image_name is explicitly None"""
    instance = {"image_name": None, "instance_id": "django__django__4.0"}
    expected = "docker.io/swebench/sweb.eval.x86_64.django_1776_django_1776_4.0:latest"
    assert get_swebench_docker_image_name(instance) == expected


def test_get_image_name_with_complex_instance_id():
    """Test get_image_name with complex instance_id containing multiple double underscores"""
    instance = {"instance_id": "project__sub__module__version__1.2.3"}
    expected = "docker.io/swebench/sweb.eval.x86_64.project_1776_sub_1776_module_1776_version_1776_1.2.3:latest"
    assert get_swebench_docker_image_name(instance) == expected


def test_filter_instances_no_filters():
    """Test filter_instances with no filtering applied"""
    instances = [{"instance_id": "repo1__test1"}, {"instance_id": "repo2__test2"}, {"instance_id": "repo3__test3"}]
    result = filter_instances(instances, filter_spec="", slice_spec="")
    assert result == instances


def test_filter_instances_regex_filter():
    """Test filter_instances with regex filtering"""
    instances = [
        {"instance_id": "django__test1"},
        {"instance_id": "flask__test2"},
        {"instance_id": "django__test3"},
        {"instance_id": "requests__test4"},
    ]
    result = filter_instances(instances, filter_spec=r"django__.*", slice_spec="")
    expected = [{"instance_id": "django__test1"}, {"instance_id": "django__test3"}]
    assert result == expected


def test_filter_instances_slice_only():
    """Test filter_instances with slice specification"""
    instances = [{"instance_id": f"repo{i}__test{i}"} for i in range(10)]
    result = filter_instances(instances, filter_spec="", slice_spec="2:5")
    expected = [{"instance_id": "repo2__test2"}, {"instance_id": "repo3__test3"}, {"instance_id": "repo4__test4"}]
    assert result == expected


def test_filter_instances_slice_start_only():
    """Test filter_instances with slice start only"""
    instances = [{"instance_id": f"repo{i}__test{i}"} for i in range(5)]
    result = filter_instances(instances, filter_spec="", slice_spec="3:")
    expected = [{"instance_id": "repo3__test3"}, {"instance_id": "repo4__test4"}]
    assert result == expected


def test_filter_instances_slice_end_only():
    """Test filter_instances with slice end only"""
    instances = [{"instance_id": f"repo{i}__test{i}"} for i in range(5)]
    result = filter_instances(instances, filter_spec="", slice_spec=":2")
    expected = [{"instance_id": "repo0__test0"}, {"instance_id": "repo1__test1"}]
    assert result == expected


def test_filter_instances_filter_and_slice():
    """Test filter_instances with both filtering and slicing"""
    instances = [
        {"instance_id": "django__test1"},
        {"instance_id": "flask__test2"},
        {"instance_id": "django__test3"},
        {"instance_id": "django__test4"},
        {"instance_id": "requests__test5"},
    ]
    result = filter_instances(instances, filter_spec=r"django__.*", slice_spec="1:3")
    expected = [{"instance_id": "django__test3"}, {"instance_id": "django__test4"}]
    assert result == expected


def test_filter_instances_shuffle():
    """Test filter_instances with shuffle enabled produces deterministic results"""
    instances = [{"instance_id": f"repo{i:02d}__test{i}"} for i in range(10)]
    # Test that shuffle produces same result with same seed
    result1 = filter_instances(instances.copy(), filter_spec="", slice_spec="", shuffle=True)
    result2 = filter_instances(instances.copy(), filter_spec="", slice_spec="", shuffle=True)
    assert result1 == result2
    # Test that shuffled result is different from original order
    result_no_shuffle = filter_instances(instances.copy(), filter_spec="", slice_spec="", shuffle=False)
    assert result1 != result_no_shuffle


def test_filter_instances_empty_list():
    """Test filter_instances with empty input list"""
    result = filter_instances([], filter_spec=r".*", slice_spec="0:5", shuffle=True)
    assert result == []


def test_filter_instances_no_matches():
    """Test filter_instances when regex matches nothing"""
    instances = [{"instance_id": "django__test1"}, {"instance_id": "flask__test2"}]
    result = filter_instances(instances, filter_spec=r"nonexistent__.*", slice_spec="")
    assert result == []


def test_update_preds_file_new_file(tmp_path):
    """Test update_preds_file when output file doesn't exist"""
    output_path = tmp_path / "preds.json"

    update_preds_file(output_path, "test__instance__1", "test_model", "test_result")

    assert output_path.exists()
    result = json.loads(output_path.read_text())
    expected = {
        "test__instance__1": {
            "model_name_or_path": "test_model",
            "instance_id": "test__instance__1",
            "model_patch": "test_result",
        }
    }
    assert result == expected


def test_update_preds_file_existing_file(tmp_path):
    """Test update_preds_file when output file already exists"""
    output_path = tmp_path / "preds.json"
    # Create initial file with one instance
    initial_data = {
        "existing__instance": {
            "model_name_or_path": "old_model",
            "instance_id": "existing__instance",
            "model_patch": "old_result",
        }
    }
    output_path.write_text(json.dumps(initial_data))

    # Add new instance
    update_preds_file(output_path, "new__instance", "new_model", "new_result")

    result = json.loads(output_path.read_text())
    expected = {
        "existing__instance": {
            "model_name_or_path": "old_model",
            "instance_id": "existing__instance",
            "model_patch": "old_result",
        },
        "new__instance": {
            "model_name_or_path": "new_model",
            "instance_id": "new__instance",
            "model_patch": "new_result",
        },
    }
    assert result == expected


def test_update_preds_file_overwrite_existing(tmp_path):
    """Test update_preds_file overwrites existing instance"""
    output_path = tmp_path / "preds.json"
    # Create initial file
    initial_data = {
        "test__instance": {
            "model_name_or_path": "old_model",
            "instance_id": "test__instance",
            "model_patch": "old_result",
        }
    }
    output_path.write_text(json.dumps(initial_data))

    # Update existing instance
    update_preds_file(output_path, "test__instance", "new_model", "new_result")

    result = json.loads(output_path.read_text())
    expected = {
        "test__instance": {
            "model_name_or_path": "new_model",
            "instance_id": "test__instance",
            "model_patch": "new_result",
        }
    }
    assert result == expected


def test_remove_from_preds_file_existing(tmp_path):
    """Test remove_from_preds_file removes existing instance"""
    output_path = tmp_path / "preds.json"
    # Create file with multiple instances
    initial_data = {
        "instance1": {"model_name_or_path": "model1", "instance_id": "instance1", "model_patch": "result1"},
        "instance2": {"model_name_or_path": "model2", "instance_id": "instance2", "model_patch": "result2"},
    }
    output_path.write_text(json.dumps(initial_data))

    # Remove one instance
    remove_from_preds_file(output_path, "instance1")

    result = json.loads(output_path.read_text())
    expected = {"instance2": {"model_name_or_path": "model2", "instance_id": "instance2", "model_patch": "result2"}}
    assert result == expected


def test_remove_from_preds_file_nonexistent_instance(tmp_path):
    """Test remove_from_preds_file with nonexistent instance"""
    output_path = tmp_path / "preds.json"
    initial_data = {"instance1": {"model_name_or_path": "model1", "instance_id": "instance1", "model_patch": "result1"}}
    output_path.write_text(json.dumps(initial_data))

    # Try to remove nonexistent instance
    remove_from_preds_file(output_path, "nonexistent")

    # File should be unchanged
    result = json.loads(output_path.read_text())
    assert result == initial_data


def test_remove_from_preds_file_no_file(tmp_path):
    """Test remove_from_preds_file when file doesn't exist"""
    output_path = tmp_path / "preds.json"
    # Should not raise an error
    remove_from_preds_file(output_path, "any_instance")
    # File should still not exist
    assert not output_path.exists()


@pytest.mark.slow
def test_redo_existing_false_skips_existing(github_test_data, tmp_path):
    """Test that redo_existing=False skips instances that already have results"""
    model_responses = github_test_data["model_responses"]
    # Create existing preds.json with one instance
    preds_file = tmp_path / "preds.json"
    existing_data = {
        "swe-agent__test-repo-1": {
            "model_name_or_path": "previous_model",
            "instance_id": "swe-agent__test-repo-1",
            "model_patch": "previous_result",
        }
    }
    preds_file.write_text(json.dumps(existing_data))

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(model_responses)
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=1,
            filter_spec="swe-agent__test-repo-1",
            redo_existing=False,
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
        )

    # Should still have the original result
    result = json.loads(preds_file.read_text())
    assert result == existing_data


@pytest.mark.slow
def test_redo_existing_true_overwrites_existing(github_test_data, tmp_path):
    """Test that redo_existing=True processes instances even if they already have results"""
    model_responses = github_test_data["model_responses"]
    # Create existing preds.json with one instance
    preds_file = tmp_path / "preds.json"
    existing_data = {
        "swe-agent__test-repo-1": {
            "model_name_or_path": "previous_model",
            "instance_id": "swe-agent__test-repo-1",
            "model_patch": "previous_result",
        }
    }
    preds_file.write_text(json.dumps(existing_data))

    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.side_effect = lambda **kwargs: _make_model_from_fixture(model_responses, cost_per_call=0.1)
        main(
            subset="_test",
            split="test",
            slice_spec="0:1",
            output=str(tmp_path),
            workers=1,
            filter_spec="swe-agent__test-repo-1",
            redo_existing=True,
            config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
            environment_class="docker",
        )

    # Should have new result from deterministic model
    traj_file_path = package_dir.parent.parent / "tests" / "test_data" / "github_issue.traj.json"
    trajectory = json.loads(traj_file_path.read_text())
    expected_result = trajectory[-1]["content"]
    result = json.loads(preds_file.read_text())
    assert result["swe-agent__test-repo-1"]["model_patch"] == expected_result
    assert result["swe-agent__test-repo-1"]["model_name_or_path"] == "deterministic"


class ExceptionModelConfig(BaseModel):
    model_name: str = "exception_model"


class ExceptionModel:
    """Test model that raises exceptions during processing."""

    def __init__(self, exception_type: type[Exception] = RuntimeError, exception_message: str = "Test exception"):
        self.exception_type = exception_type
        self.exception_message = exception_message
        self.cost = 0.0
        self.n_calls = 0
        self.config = ExceptionModelConfig()

    def query(self, *args, **kwargs):
        self.n_calls += 1
        raise self.exception_type(self.exception_message)

    def format_message(self, **kwargs) -> dict:
        return dict(**kwargs)

    def format_observation_messages(
        self, message: dict, outputs: list[dict], template_vars: dict | None = None
    ) -> list[dict]:
        return [self.format_message(role="user", content=str(o)) for o in outputs]

    def get_template_vars(self, **kwargs) -> dict:
        return self.config.model_dump() | {"n_model_calls": self.n_calls, "model_cost": self.cost}

    def serialize(self) -> dict:
        return {
            "info": {
                "model_stats": {
                    "instance_cost": self.cost,
                    "api_calls": self.n_calls,
                },
                "config": {
                    "model": self.config.model_dump(mode="json"),
                    "model_type": f"{self.__class__.__module__}.{self.__class__.__name__}",
                },
            }
        }


@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_exception_handling_in_agent_run(tmp_path, workers):
    """Test that exceptions during agent.run() are properly handled and recorded"""
    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.return_value = ExceptionModel(RuntimeError, "Agent processing failed")
        with patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class:
            mock_progress_manager = mock_progress_class.return_value
            mock_progress_manager.render_group = None  # For Live context manager
            main(
                subset="_test",
                split="test",
                slice_spec="0:1",
                output=str(tmp_path),
                workers=workers,
                filter_spec="swe-agent__test-repo-1",
                config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
                environment_class="docker",
            )

    # Check that prediction file contains exception information
    preds_file = tmp_path / "preds.json"
    assert preds_file.exists()
    result = json.loads(preds_file.read_text())
    instance_id = "swe-agent__test-repo-1"
    assert instance_id in result
    assert result[instance_id]["model_patch"] == ""
    assert result[instance_id]["model_name_or_path"] == "exception_model"

    # Check that trajectory file contains exception information
    traj_file = tmp_path / instance_id / f"{instance_id}.traj.json"
    assert traj_file.exists()
    traj_data = json.loads(traj_file.read_text())
    assert traj_data["instance_id"] == instance_id
    assert traj_data["info"]["exit_status"] == "RuntimeError"
    assert traj_data["info"]["submission"] == ""
    assert traj_data["info"]["exception_str"] == "Agent processing failed"


@pytest.mark.slow
@pytest.mark.parametrize("workers", [1, 2])
def test_different_exception_types(tmp_path, workers):
    """Test that different exception types are properly recorded"""
    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.return_value = ExceptionModel(ValueError, "Invalid input provided")
        with patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class:
            mock_progress_manager = mock_progress_class.return_value
            mock_progress_manager.render_group = None  # For Live context manager
            main(
                subset="_test",
                split="test",
                slice_spec="0:1",
                output=str(tmp_path),
                workers=workers,
                filter_spec="swe-agent__test-repo-1",
                config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
                environment_class="docker",
            )

    # Check trajectory file for correct exception type
    instance_id = "swe-agent__test-repo-1"
    traj_file = tmp_path / instance_id / f"{instance_id}.traj.json"
    traj_data = json.loads(traj_file.read_text())
    assert traj_data["info"]["exit_status"] == "ValueError"
    assert traj_data["info"]["submission"] == ""
    assert traj_data["info"]["exception_str"] == "Invalid input provided"


@pytest.mark.slow
def test_exception_handling_with_progress_manager(tmp_path):
    """Test that progress manager receives exception notifications in multithreaded mode"""
    with patch("minisweagent.run.benchmarks.swebench.get_model") as mock_get_model:
        mock_get_model.return_value = ExceptionModel(ConnectionError, "Network timeout")
        with patch("minisweagent.run.benchmarks.swebench.RunBatchProgressManager") as mock_progress_class:
            mock_progress_manager = mock_progress_class.return_value
            mock_progress_manager.render_group = None  # For Live context manager
            main(
                subset="_test",
                split="test",
                slice_spec="0:1",
                output=str(tmp_path),
                workers=2,  # Use multithreaded to test progress manager
                filter_spec="swe-agent__test-repo-1",
                config_spec=[str(package_dir / "config" / "benchmarks" / "swebench.yaml")],
                environment_class="docker",
            )

    # Verify progress manager methods were called
    mock_progress_manager.on_instance_start.assert_called_once_with("swe-agent__test-repo-1")
    mock_progress_manager.on_instance_end.assert_called_once_with("swe-agent__test-repo-1", "ConnectionError")
    # on_uncaught_exception should not be called since exceptions are handled properly
    mock_progress_manager.on_uncaught_exception.assert_not_called()