openai · stneng · Dec 26, 2025
diff --git a/src/agents/mcp/util.py b/src/agents/mcp/util.py
@@ -11,7 +11,7 @@
 from ..logger import logger
 from ..run_context import RunContextWrapper
 from ..strict_schema import ensure_strict_json_schema
-from ..tool import FunctionTool, Tool
+from ..tool import FunctionTool, Tool, ToolOutputImageDict, ToolOutputTextDict
 from ..tracing import FunctionSpanData, get_current_span, mcp_tools_span
 from ..util._types import MaybeAwaitable
 
@@ -181,7 +181,12 @@ def to_function_tool(
     @classmethod
     async def invoke_mcp_tool(
         cls, server: "MCPServer", tool: "MCPTool", context: RunContextWrapper[Any], input_json: str
-    ) -> str:
+    ) -> Union[
+        str,
+        ToolOutputTextDict,
+        ToolOutputImageDict,
+        list[Union[ToolOutputTextDict, ToolOutputImageDict]],
+    ]:
-        """Invoke an MCP tool and return the result as a string."""
+        """Invoke an MCP tool and return a JSON string or text/image output dict(s)."""
         try:
             json_data: dict[str, Any] = json.loads(input_json) if input_json else {}
@@ -211,25 +216,39 @@ async def invoke_mcp_tool(
             logger.debug(f"MCP tool {tool.name} returned {result}")
 
         # If structured content is requested and available, use it exclusively
+        tool_output: Union[
+            str,
+            ToolOutputTextDict,
+            ToolOutputImageDict,
+            list[Union[ToolOutputTextDict, ToolOutputImageDict]],
+        ]
         if server.use_structured_content and result.structuredContent:
             tool_output = json.dumps(result.structuredContent)
         else:
-            # Fall back to regular text content processing
-            # The MCP tool result is a list of content items, whereas OpenAI tool
-            # outputs are a single string. We'll try to convert.
-            if len(result.content) == 1:
-                tool_output = result.content[0].model_dump_json()
-            elif len(result.content) > 1:
-                tool_results = [item.model_dump(mode="json") for item in result.content]
-                tool_output = json.dumps(tool_results)
+            tool_output_list: list[Union[ToolOutputTextDict, ToolOutputImageDict]] = []
+            for item in result.content:
+                if item.type == "text":
+                    tool_output_list.append(ToolOutputTextDict(type="text", text=item.text))
+                elif item.type == "image":
+                    tool_output_list.append(
+                        ToolOutputImageDict(
+                            type="image", image_url=f"data:{item.mimeType};base64,{item.data}"
+                        )
+                    )
+                else:
+                    # Any other content type: fall back to a text item with the stringified dump
+                    tool_output_list.append(
+                        ToolOutputTextDict(type="text", text=str(item.model_dump(mode="json")))
+                    )
+            if len(tool_output_list) == 1:
+                tool_output = tool_output_list[0]
             else:
-                # Empty content is a valid result (e.g., "no results found")
-                tool_output = "[]"
+                tool_output = tool_output_list
 
         current_span = get_current_span()
         if current_span:
             if isinstance(current_span.span_data, FunctionSpanData):
-                current_span.span_data.output = tool_output
+                current_span.span_data.output = json.dumps(tool_output)
                 current_span.span_data.mcp_data = {
                     "server": server.name,
                 }

diff --git a/tests/mcp/test_mcp_tracing.py b/tests/mcp/test_mcp_tracing.py
@@ -62,7 +62,7 @@ async def test_mcp_tracing():
                                 "data": {
                                     "name": "test_tool_1",
                                     "input": "",
-                                    "output": '{"type":"text","text":"result_test_tool_1_{}","annotations":null,"meta":null}',  # noqa: E501
+                                    "output": "{'type': 'text', 'text': 'result_test_tool_1_{}'}",  # noqa: E501
                                     "mcp_data": {"server": "fake_mcp_server"},
                                 },
                             },
@@ -133,7 +133,7 @@ async def test_mcp_tracing():
                                 "data": {
                                     "name": "test_tool_2",
                                     "input": "",
-                                    "output": '{"type":"text","text":"result_test_tool_2_{}","annotations":null,"meta":null}',  # noqa: E501
+                                    "output": "{'type': 'text', 'text': 'result_test_tool_2_{}'}",  # noqa: E501
                                     "mcp_data": {"server": "fake_mcp_server"},
                                 },
                             },
@@ -197,7 +197,7 @@ async def test_mcp_tracing():
                                 "data": {
                                     "name": "test_tool_3",
                                     "input": "",
-                                    "output": '{"type":"text","text":"result_test_tool_3_{}","annotations":null,"meta":null}',  # noqa: E501
+                                    "output": "{'type': 'text', 'text': 'result_test_tool_3_{}'}",  # noqa: E501
                                     "mcp_data": {"server": "fake_mcp_server"},
                                 },
                             },

diff --git a/tests/mcp/test_mcp_util.py b/tests/mcp/test_mcp_util.py
@@ -3,7 +3,7 @@
 
 import pytest
 from inline_snapshot import snapshot
-from mcp.types import CallToolResult, TextContent, Tool as MCPTool
+from mcp.types import CallToolResult, ImageContent, TextContent, Tool as MCPTool
 from pydantic import BaseModel, TypeAdapter
 
 from agents import Agent, FunctionTool, RunContextWrapper
@@ -254,39 +254,45 @@ async def test_mcp_fastmcp_behavior_verification():
     ctx = RunContextWrapper(context=None)
     tool = MCPTool(name="test_tool", inputSchema={})
 
-    # Case 1: None -> "[]".
+    # Case 1: None -> [].
     server._custom_content = []
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "")
-    assert result == "[]", f"None should return '[]', got {result}"
+    assert result == [], f"None should return [], got {result}"
 
-    # Case 2: [] -> "[]".
+    # Case 2: [] -> [].
     server._custom_content = []
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "")
-    assert result == "[]", f"[] should return '[]', got {result}"
+    assert result == [], f"[] should return [], got {result}"
 
-    # Case 3: {} -> {"type":"text","text":"{}","annotations":null,"meta":null}.
+    # Case 3: {} -> {"type": "text", "text": "{}"}.
     server._custom_content = [TextContent(text="{}", type="text")]
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "")
-    expected = '{"type":"text","text":"{}","annotations":null,"meta":null}'
+    expected = {"type": "text", "text": "{}"}
     assert result == expected, f"{{}} should return {expected}, got {result}"
 
-    # Case 4: [{}] -> {"type":"text","text":"{}","annotations":null,"meta":null}.
+    # Case 4: [{}] -> {"type": "text", "text": "{}"}.
     server._custom_content = [TextContent(text="{}", type="text")]
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "")
-    expected = '{"type":"text","text":"{}","annotations":null,"meta":null}'
+    expected = {"type": "text", "text": "{}"}
     assert result == expected, f"[{{}}] should return {expected}, got {result}"
 
-    # Case 5: [[]] -> "[]".
+    # Case 5: [[]] -> [].
     server._custom_content = []
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "")
-    assert result == "[]", f"[[]] should return '[]', got {result}"
+    assert result == [], f"[[]] should return [], got {result}"
 
     # Case 6: String values work normally.
     server._custom_content = [TextContent(text="hello", type="text")]
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "")
-    expected = '{"type":"text","text":"hello","annotations":null,"meta":null}'
+    expected = {"type": "text", "text": "hello"}
     assert result == expected, f"String should return {expected}, got {result}"
 
+    # Case 7: Image content works normally.
+    server._custom_content = [ImageContent(data="AAAA", mimeType="image/png", type="image")]
+    result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "")
+    expected = {"type": "image", "image_url": "data:image/png;base64,AAAA"}
+    assert result == expected, f"Image should return {expected}, got {result}"
+
 
 @pytest.mark.asyncio
 async def test_agent_convert_schemas_unset():
@@ -393,23 +399,23 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C
             False,
             [TextContent(text="text content", type="text")],
             {"data": "structured_value", "type": "structured"},
-            '{"type":"text","text":"text content","annotations":null,"meta":null}',
+            {"type": "text", "text": "text content"},
         ),
         # Scenario 3: use_structured_content=True but no structured content
         # Should fall back to text content
         (
             True,
             [TextContent(text="fallback text", type="text")],
             None,
-            '{"type":"text","text":"fallback text","annotations":null,"meta":null}',
+            {"type": "text", "text": "fallback text"},
         ),
         # Scenario 4: use_structured_content=True with empty structured content (falsy)
         # Should fall back to text content
         (
             True,
             [TextContent(text="fallback text", type="text")],
             {},
-            '{"type":"text","text":"fallback text","annotations":null,"meta":null}',
+            {"type": "text", "text": "fallback text"},
         ),
         # Scenario 5: use_structured_content=True, structured content available, empty text content
         # Should return structured content
@@ -420,8 +426,7 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C
             False,
             [TextContent(text="first", type="text"), TextContent(text="second", type="text")],
             {"ignored": "structured"},
-            '[{"type": "text", "text": "first", "annotations": null, "meta": null}, '
-            '{"type": "text", "text": "second", "annotations": null, "meta": null}]',
+            [{"type": "text", "text": "first"}, {"type": "text", "text": "second"}],
         ),
         # Scenario 7: use_structured_content=True, multiple text content, with structured content
         # Should return only structured content (text content ignored)
@@ -436,10 +441,10 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C
         ),
         # Scenario 8: use_structured_content=False, empty content
         # Should return empty array
-        (False, [], None, "[]"),
+        (False, [], None, []),
         # Scenario 9: use_structured_content=True, empty content, no structured content
         # Should return empty array
-        (True, [], None, "[]"),
+        (True, [], None, []),
     ],
 )
 @pytest.mark.asyncio
@@ -492,6 +497,7 @@ async def test_structured_content_priority_over_text():
     # Should return only structured content
     import json
 
+    assert isinstance(result, str)
     parsed_result = json.loads(result)
     assert parsed_result == structured_content
     assert "This should be ignored" not in result
@@ -518,11 +524,9 @@ async def test_structured_content_fallback_behavior():
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}")
 
     # Should fall back to text content
-    import json
-
-    parsed_result = json.loads(result)
-    assert parsed_result["text"] == "Fallback content"
-    assert parsed_result["type"] == "text"
+    assert isinstance(result, dict)
+    assert result["type"] == "text"
+    assert result["text"] == "Fallback content"
 
 
 @pytest.mark.asyncio
@@ -547,10 +551,9 @@ async def test_backwards_compatibility_unchanged():
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}")
 
     # Should return only text content (structured content ignored)
-    import json
-
-    parsed_result = json.loads(result)
-    assert parsed_result["text"] == "Traditional text output"
+    assert isinstance(result, dict)
+    assert result["type"] == "text"
+    assert result["text"] == "Traditional text output"
     assert "modern" not in result
 
 
@@ -576,11 +579,9 @@ async def test_empty_structured_content_fallback():
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}")
 
     # Should fall back to text content because empty dict is falsy
-    import json
-
-    parsed_result = json.loads(result)
-    assert parsed_result["text"] == "Should use this text"
-    assert parsed_result["type"] == "text"
+    assert isinstance(result, dict)
+    assert result["type"] == "text"
+    assert result["text"] == "Should use this text"
 
 
 @pytest.mark.asyncio
@@ -610,6 +611,7 @@ async def test_complex_structured_content():
     # Should return the complex structured content as-is
     import json
 
+    assert isinstance(result, str)
     parsed_result = json.loads(result)
     assert parsed_result == complex_structured
     assert len(parsed_result["results"]) == 2
@@ -644,6 +646,7 @@ async def test_multiple_content_items_with_structured():
     # Should return only structured content, ignoring all text items
     import json
 
+    assert isinstance(result, str)
     parsed_result = json.loads(result)
     assert parsed_result == structured_content
     assert "First text item" not in result
@@ -668,10 +671,9 @@ async def test_multiple_content_items_without_structured():
     result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}")
 
-    # Should return JSON array of text content items
+    # Should return a list of text content dicts (no longer a JSON string)
-    import json
-
-    parsed_result = json.loads(result)
-    assert isinstance(parsed_result, list)
-    assert len(parsed_result) == 2
-    assert parsed_result[0]["text"] == "First"
-    assert parsed_result[1]["text"] == "Second"
+    assert isinstance(result, list)
+    assert len(result) == 2
+    assert result[0]["type"] == "text"
+    assert result[0]["text"] == "First"
+    assert result[1]["type"] == "text"
+    assert result[1]["text"] == "Second"