From bd975a047a2b9faa2b68b26f87f1758d414d986b Mon Sep 17 00:00:00 2001 From: stneng Date: Fri, 26 Dec 2025 11:40:09 +0000 Subject: [PATCH] Image response from an MCP server --- src/agents/mcp/util.py | 45 +++++++++++++------ tests/mcp/test_mcp_tracing.py | 6 +-- tests/mcp/test_mcp_util.py | 82 ++++++++++++++++++----------------- 3 files changed, 77 insertions(+), 56 deletions(-) diff --git a/src/agents/mcp/util.py b/src/agents/mcp/util.py index 6cfe5c96d..085b42506 100644 --- a/src/agents/mcp/util.py +++ b/src/agents/mcp/util.py @@ -11,7 +11,7 @@ from ..logger import logger from ..run_context import RunContextWrapper from ..strict_schema import ensure_strict_json_schema -from ..tool import FunctionTool, Tool +from ..tool import FunctionTool, Tool, ToolOutputImageDict, ToolOutputTextDict from ..tracing import FunctionSpanData, get_current_span, mcp_tools_span from ..util._types import MaybeAwaitable @@ -181,7 +181,12 @@ def to_function_tool( @classmethod async def invoke_mcp_tool( cls, server: "MCPServer", tool: "MCPTool", context: RunContextWrapper[Any], input_json: str - ) -> str: + ) -> Union[ + str, + ToolOutputTextDict, + ToolOutputImageDict, + list[Union[ToolOutputTextDict, ToolOutputImageDict]], + ]: """Invoke an MCP tool and return the result as a string.""" try: json_data: dict[str, Any] = json.loads(input_json) if input_json else {} @@ -211,25 +216,39 @@ async def invoke_mcp_tool( logger.debug(f"MCP tool {tool.name} returned {result}") # If structured content is requested and available, use it exclusively + tool_output: Union[ + str, + ToolOutputTextDict, + ToolOutputImageDict, + list[Union[ToolOutputTextDict, ToolOutputImageDict]], + ] if server.use_structured_content and result.structuredContent: tool_output = json.dumps(result.structuredContent) else: - # Fall back to regular text content processing - # The MCP tool result is a list of content items, whereas OpenAI tool - # outputs are a single string. We'll try to convert. 
- if len(result.content) == 1: - tool_output = result.content[0].model_dump_json() - elif len(result.content) > 1: - tool_results = [item.model_dump(mode="json") for item in result.content] - tool_output = json.dumps(tool_results) + tool_output_list: list[Union[ToolOutputTextDict, ToolOutputImageDict]] = [] + for item in result.content: + if item.type == "text": + tool_output_list.append(ToolOutputTextDict(type="text", text=item.text)) + elif item.type == "image": + tool_output_list.append( + ToolOutputImageDict( + type="image", image_url=f"data:{item.mimeType};base64,{item.data}" + ) + ) + else: + # Fall back to regular text content + tool_output_list.append( + ToolOutputTextDict(type="text", text=str(item.model_dump(mode="json"))) + ) + if len(tool_output_list) == 1: + tool_output = tool_output_list[0] else: - # Empty content is a valid result (e.g., "no results found") - tool_output = "[]" + tool_output = tool_output_list current_span = get_current_span() if current_span: if isinstance(current_span.span_data, FunctionSpanData): - current_span.span_data.output = tool_output + current_span.span_data.output = str(tool_output) current_span.span_data.mcp_data = { "server": server.name, } diff --git a/tests/mcp/test_mcp_tracing.py b/tests/mcp/test_mcp_tracing.py index 33dfa5ea1..9cb3454b1 100644 --- a/tests/mcp/test_mcp_tracing.py +++ b/tests/mcp/test_mcp_tracing.py @@ -62,7 +62,7 @@ async def test_mcp_tracing(): "data": { "name": "test_tool_1", "input": "", - "output": '{"type":"text","text":"result_test_tool_1_{}","annotations":null,"meta":null}', # noqa: E501 + "output": "{'type': 'text', 'text': 'result_test_tool_1_{}'}", # noqa: E501 "mcp_data": {"server": "fake_mcp_server"}, }, }, @@ -133,7 +133,7 @@ async def test_mcp_tracing(): "data": { "name": "test_tool_2", "input": "", - "output": '{"type":"text","text":"result_test_tool_2_{}","annotations":null,"meta":null}', # noqa: E501 + "output": "{'type': 'text', 'text': 'result_test_tool_2_{}'}", # noqa: E501 
"mcp_data": {"server": "fake_mcp_server"}, }, }, @@ -197,7 +197,7 @@ async def test_mcp_tracing(): "data": { "name": "test_tool_3", "input": "", - "output": '{"type":"text","text":"result_test_tool_3_{}","annotations":null,"meta":null}', # noqa: E501 + "output": "{'type': 'text', 'text': 'result_test_tool_3_{}'}", # noqa: E501 "mcp_data": {"server": "fake_mcp_server"}, }, }, diff --git a/tests/mcp/test_mcp_util.py b/tests/mcp/test_mcp_util.py index e434f7542..3ed0b7dad 100644 --- a/tests/mcp/test_mcp_util.py +++ b/tests/mcp/test_mcp_util.py @@ -3,7 +3,7 @@ import pytest from inline_snapshot import snapshot -from mcp.types import CallToolResult, TextContent, Tool as MCPTool +from mcp.types import CallToolResult, ImageContent, TextContent, Tool as MCPTool from pydantic import BaseModel, TypeAdapter from agents import Agent, FunctionTool, RunContextWrapper @@ -254,39 +254,45 @@ async def test_mcp_fastmcp_behavior_verification(): ctx = RunContextWrapper(context=None) tool = MCPTool(name="test_tool", inputSchema={}) - # Case 1: None -> "[]". + # Case 1: None -> []. server._custom_content = [] result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "") - assert result == "[]", f"None should return '[]', got {result}" + assert result == [], f"None should return [], got {result}" - # Case 2: [] -> "[]". + # Case 2: [] -> []. server._custom_content = [] result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "") - assert result == "[]", f"[] should return '[]', got {result}" + assert result == [], f"[] should return [], got {result}" - # Case 3: {} -> {"type":"text","text":"{}","annotations":null,"meta":null}. + # Case 3: {} -> {"type": "text", "text": "{}"}. 
server._custom_content = [TextContent(text="{}", type="text")] result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "") - expected = '{"type":"text","text":"{}","annotations":null,"meta":null}' + expected = {"type": "text", "text": "{}"} assert result == expected, f"{{}} should return {expected}, got {result}" - # Case 4: [{}] -> {"type":"text","text":"{}","annotations":null,"meta":null}. + # Case 4: [{}] -> {"type": "text", "text": "{}"}. server._custom_content = [TextContent(text="{}", type="text")] result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "") - expected = '{"type":"text","text":"{}","annotations":null,"meta":null}' + expected = {"type": "text", "text": "{}"} assert result == expected, f"[{{}}] should return {expected}, got {result}" - # Case 5: [[]] -> "[]". + # Case 5: [[]] -> []. server._custom_content = [] result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "") - assert result == "[]", f"[[]] should return '[]', got {result}" + assert result == [], f"[[]] should return [], got {result}" # Case 6: String values work normally. server._custom_content = [TextContent(text="hello", type="text")] result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "") - expected = '{"type":"text","text":"hello","annotations":null,"meta":null}' + expected = {"type": "text", "text": "hello"} assert result == expected, f"String should return {expected}, got {result}" + # Case 7: Image content works normally. 
+ server._custom_content = [ImageContent(data="AAAA", mimeType="image/png", type="image")] + result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "") + expected = {"type": "image", "image_url": "data:image/png;base64,AAAA"} + assert result == expected, f"Image should return {expected}, got {result}" + @pytest.mark.asyncio async def test_agent_convert_schemas_unset(): @@ -393,7 +399,7 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C False, [TextContent(text="text content", type="text")], {"data": "structured_value", "type": "structured"}, - '{"type":"text","text":"text content","annotations":null,"meta":null}', + {"type": "text", "text": "text content"}, ), # Scenario 3: use_structured_content=True but no structured content # Should fall back to text content @@ -401,7 +407,7 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C True, [TextContent(text="fallback text", type="text")], None, - '{"type":"text","text":"fallback text","annotations":null,"meta":null}', + {"type": "text", "text": "fallback text"}, ), # Scenario 4: use_structured_content=True with empty structured content (falsy) # Should fall back to text content @@ -409,7 +415,7 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C True, [TextContent(text="fallback text", type="text")], {}, - '{"type":"text","text":"fallback text","annotations":null,"meta":null}', + {"type": "text", "text": "fallback text"}, ), # Scenario 5: use_structured_content=True, structured content available, empty text content # Should return structured content @@ -420,8 +426,7 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C False, [TextContent(text="first", type="text"), TextContent(text="second", type="text")], {"ignored": "structured"}, - '[{"type": "text", "text": "first", "annotations": null, "meta": null}, ' - '{"type": "text", "text": "second", "annotations": null, "meta": null}]', + [{"type": "text", "text": 
"first"}, {"type": "text", "text": "second"}], ), # Scenario 7: use_structured_content=True, multiple text content, with structured content # Should return only structured content (text content ignored) @@ -436,10 +441,10 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C ), # Scenario 8: use_structured_content=False, empty content # Should return empty array - (False, [], None, "[]"), + (False, [], None, []), # Scenario 9: use_structured_content=True, empty content, no structured content # Should return empty array - (True, [], None, "[]"), + (True, [], None, []), ], ) @pytest.mark.asyncio @@ -492,6 +497,7 @@ async def test_structured_content_priority_over_text(): # Should return only structured content import json + assert isinstance(result, str) parsed_result = json.loads(result) assert parsed_result == structured_content assert "This should be ignored" not in result @@ -518,11 +524,9 @@ async def test_structured_content_fallback_behavior(): result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}") # Should fall back to text content - import json - - parsed_result = json.loads(result) - assert parsed_result["text"] == "Fallback content" - assert parsed_result["type"] == "text" + assert isinstance(result, dict) + assert result["type"] == "text" + assert result["text"] == "Fallback content" @pytest.mark.asyncio @@ -547,10 +551,9 @@ async def test_backwards_compatibility_unchanged(): result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}") # Should return only text content (structured content ignored) - import json - - parsed_result = json.loads(result) - assert parsed_result["text"] == "Traditional text output" + assert isinstance(result, dict) + assert result["type"] == "text" + assert result["text"] == "Traditional text output" assert "modern" not in result @@ -576,11 +579,9 @@ async def test_empty_structured_content_fallback(): result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}") # Should fall back to 
text content because empty dict is falsy - import json - - parsed_result = json.loads(result) - assert parsed_result["text"] == "Should use this text" - assert parsed_result["type"] == "text" + assert isinstance(result, dict) + assert result["type"] == "text" + assert result["text"] == "Should use this text" @pytest.mark.asyncio @@ -610,6 +611,7 @@ async def test_complex_structured_content(): # Should return the complex structured content as-is import json + assert isinstance(result, str) parsed_result = json.loads(result) assert parsed_result == complex_structured assert len(parsed_result["results"]) == 2 @@ -644,6 +646,7 @@ async def test_multiple_content_items_with_structured(): # Should return only structured content, ignoring all text items import json + assert isinstance(result, str) parsed_result = json.loads(result) assert parsed_result == structured_content assert "First text item" not in result @@ -668,10 +671,9 @@ async def test_multiple_content_items_without_structured(): result = await MCPUtil.invoke_mcp_tool(server, tool, ctx, "{}") # Should return JSON array of text content items - import json - - parsed_result = json.loads(result) - assert isinstance(parsed_result, list) - assert len(parsed_result) == 2 - assert parsed_result[0]["text"] == "First" - assert parsed_result[1]["text"] == "Second" + assert isinstance(result, list) + assert len(result) == 2 + assert result[0]["type"] == "text" + assert result[0]["text"] == "First" + assert result[1]["type"] == "text" + assert result[1]["text"] == "Second"