"""Tests for the multimodal content expansion helpers in ``openai_multimodal``."""

import pytest

from minisweagent.models.utils.openai_multimodal import (
    DEFAULT_MULTIMODAL_REGEX,
    _expand_content_string,
    expand_multimodal_content,
)

# NOTE(review): the fixture strings below are reproduced exactly as found.
# They show no visible delimiter between the content type (e.g. ``image_url``)
# and its payload, so it looks like wrapper markup may have been lost before
# this file reached review -- confirm against DEFAULT_MULTIMODAL_REGEX.


def _text_part(text):
    """Build the ``text`` part dict that the expansion is expected to emit."""
    return {"type": "text", "text": text}


def _image_part(url):
    """Build the ``image_url`` part dict that the expansion is expected to emit."""
    return {"type": "image_url", "image_url": {"url": url}}


# (input string, expected list of parts) pairs for test_expand_content_string.
_CONTENT_STRING_CASES = [
    (
        "Just plain text",
        [_text_part("Just plain text")],
    ),
    (
        "Text before image_urlhttps://example.com/image.png text after",
        [
            _text_part("Text before "),
            _image_part("https://example.com/image.png"),
            _text_part(" text after"),
        ],
    ),
    (
        "image_urldata:image/png;base64,iVBORw0KGgoAAAANS",
        [_image_part("data:image/png;base64,iVBORw0KGgoAAAANS")],
    ),
]


@pytest.mark.parametrize(("content", "expected"), _CONTENT_STRING_CASES)
def test_expand_content_string(content, expected):
    """Check _expand_content_string against plain, mixed and image-only inputs."""
    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)
    assert parts == expected


def test_expand_content_string_multiple_images():
    """Two images in one string yield alternating text and image parts."""
    content = (
        "First image_urlimage1.png "
        "middle image_urlimage2.jpg end"
    )
    expected_parts = [
        _text_part("First "),
        _image_part("image1.png"),
        _text_part(" middle "),
        _image_part("image2.jpg"),
        _text_part(" end"),
    ]

    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert len(parts) == 5
    for actual, wanted in zip(parts, expected_parts):
        assert actual == wanted


def test_expand_content_string_multiline():
    """Image content that spans several lines is still recognised as one part."""
    # NOTE(review): the assertions below expect "\n" at the text boundaries,
    # which suggests this literal originally contained line breaks -- confirm.
    content = """Here is an image: image_urldata:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk After image"""

    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert len(parts) == 3
    leading, image, trailing = parts[0], parts[1], parts[2]
    assert leading == _text_part("Here is an image:\n")
    assert image["type"] == "image_url"
    assert "data:image/png;base64" in image["image_url"]["url"]
    assert trailing == _text_part("\nAfter image")


def test_expand_content_string_whitespace_handling():
    """Whitespace is stripped from image URLs but left untouched in text parts."""
    content = "Text \nimage_url image_url \nMore text"

    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert parts[0]["text"] == "Text \n"
    assert parts[1]["image_url"]["url"] == "image_url"
    assert parts[2]["text"] == " \nMore text"


def test_expand_content_string_adjacent_images():
    """Back-to-back images produce no text part between them."""
    content = (
        "image_urlimg1"
        "image_urlimg2"
    )

    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert len(parts) == 2
    assert parts[0] == _image_part("img1")
    assert parts[1] == _image_part("img2")


def test_expand_multimodal_content_string():
    """A bare string passed to expand_multimodal_content is split into parts."""
    content = "Text image_urlimage.png more"

    expanded = expand_multimodal_content(content, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert len(expanded) == 3
    part_types = [part["type"] for part in expanded]
    assert part_types == ["text", "image_url", "text"]


def test_expand_multimodal_content_list():
    """Every string inside a list is expanded on its own."""
    content = [
        "plain text",
        "text image_urlimage.png more",
    ]

    expanded = expand_multimodal_content(content, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert len(expanded) == 2
    plain_parts, mixed_parts = expanded[0], expanded[1]
    assert plain_parts == [_text_part("plain text")]
    assert len(mixed_parts) == 3


def test_expand_multimodal_content_dict():
    """A dict has its 'content' value expanded while other keys are kept."""
    message = {
        "role": "user",
        "content": "text image_urlimage.png",
    }

    expanded = expand_multimodal_content(message, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert expanded["role"] == "user"
    assert len(expanded["content"]) == 2


def test_expand_multimodal_content_dict_no_content_key():
    """A dict without a 'content' key comes back equal to the input."""
    input_dict = {"role": "user", "other": "data"}

    expanded = expand_multimodal_content(input_dict, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert expanded == input_dict


def test_expand_multimodal_content_nested():
    """Expansion reaches strings nested inside a dict's list-valued 'content'."""
    message = {
        "role": "user",
        "content": [
            "text image_urlimage.png",
            {"nested": "value"},
        ],
    }

    expanded = expand_multimodal_content(message, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert expanded["role"] == "user"
    inner = expanded["content"]
    assert len(inner) == 2
    assert len(inner[0]) == 2


def test_expand_multimodal_content_preserves_original():
    """The input dict's 'content' is unchanged after expansion."""
    original = {
        "role": "user",
        "content": "text image_urlimage.png",
    }
    content_before = original["content"]

    expand_multimodal_content(original, pattern=DEFAULT_MULTIMODAL_REGEX)

    assert original["content"] == content_before


def test_model_format_message_with_multimodal():
    """model.format_message expands content when a multimodal regex is configured."""
    from minisweagent.models.test_models import DeterministicModel

    model = DeterministicModel(outputs=[], multimodal_regex=DEFAULT_MULTIMODAL_REGEX)

    message = model.format_message(
        role="user",
        content="Hello image_urlimage.png",
    )

    assert message["role"] == "user"
    parts = message["content"]
    assert len(parts) == 2
    assert parts[0]["type"] == "text"
    assert parts[1]["type"] == "image_url"


def test_model_format_message_without_multimodal():
    """model.format_message returns a plain dict when no multimodal regex is given."""
    from minisweagent.models.test_models import DeterministicModel

    model = DeterministicModel(outputs=[])

    message = model.format_message(role="user", content="Hello world")

    assert message == {"role": "user", "content": "Hello world"}


def test_unknown_content_type_ignored():
    """Content with an unrecognised type is dropped; surrounding text is kept."""
    content = "Text unknown_typedata more"

    parts = _expand_content_string(content=content, pattern=DEFAULT_MULTIMODAL_REGEX)

    # Unknown type is not added, so we get text before, nothing for unknown, text after
    assert len(parts) == 2
    assert parts[0] == _text_part("Text ")
    assert parts[1] == _text_part(" more")