@@ -55,16 +55,40 @@ class OpenAIConfig(TypedDict, total=False):
         model_id: str
         params: Optional[dict[str, Any]]

-    def __init__(self, client_args: Optional[dict[str, Any]] = None, **model_config: Unpack[OpenAIConfig]) -> None:
+    def __init__(
+        self,
+        client: Optional[Client] = None,
+        client_args: Optional[dict[str, Any]] = None,
+        **model_config: Unpack[OpenAIConfig],
+    ) -> None:
         """Initialize provider instance.

         Args:
-            client_args: Arguments for the OpenAI client.
+            client: Pre-configured OpenAI-compatible client to reuse across requests.
+                When provided, this client is reused for all requests and is NOT closed
+                by the model; the caller is responsible for managing the client lifecycle.
+                This is useful for:
+                - Injecting custom client wrappers (e.g., GuardrailsAsyncOpenAI)
+                - Reusing connection pools within a single event loop/worker
+                - Centralizing observability, retries, and networking policy
+                - Pointing to custom model gateways
+                Note: The client should not be shared across different asyncio event loops.
+            client_args: Arguments for the OpenAI client (legacy approach).
                 For a complete list of supported arguments, see https://pypi.org/project/openai/.
+                Note: Cannot be combined with `client`; passing both raises ValueError.
             **model_config: Configuration options for the OpenAI model.
+
+        Raises:
+            ValueError: If both `client` and `client_args` are provided.
         """
         validate_config_keys(model_config, self.OpenAIConfig)
         self.config = dict(model_config)
+
+        # Validate that only one client configuration method is provided
+        if client is not None and client_args:
+            raise ValueError("Only one of 'client' or 'client_args' should be provided, not both.")
+
+        self._injected_client = client
         self.client_args = client_args or {}

         logger.debug("config=<%s> | initializing", self.config)
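
As a usage sketch of the two construction paths (the provider class name and import path are not visible in this hunk; `OpenAIModel` and `strands.models.openai` are assumptions):

```python
import openai

from strands.models.openai import OpenAIModel  # assumed import path, not shown in this diff

# Legacy path: the model builds and closes its own AsyncOpenAI client around each request.
model_from_args = OpenAIModel(
    client_args={"api_key": "sk-placeholder", "base_url": "https://my-gateway.example.com/v1"},
    model_id="gpt-4o",
)

# Injection path: the caller owns the client; the model reuses it and never closes it.
shared_client = openai.AsyncOpenAI(api_key="sk-placeholder")
model_from_client = OpenAIModel(client=shared_client, model_id="gpt-4o")

# Passing both configuration methods is rejected at construction time.
# OpenAIModel(client=shared_client, client_args={"api_key": "sk-placeholder"})  # ValueError
```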
@@ -454,12 +478,20 @@ async def stream(

         logger.debug("invoking model")

-        # We initialize an OpenAI context on every request so as to avoid connection sharing in the underlying httpx
-        # client. The asyncio event loop does not allow connections to be shared. For more details, please refer to
-        # https://github.com/encode/httpx/discussions/2959.
-        async with openai.AsyncOpenAI(**self.client_args) as client:
+        # Determine which client to use based on configuration
+        if self._injected_client is not None:
+            # Use the injected client (caller manages lifecycle)
+            client_to_use = self._injected_client
+        else:
+            # Create a new client from client_args.
+            # We initialize an OpenAI context on every request so as to avoid connection sharing in the underlying
+            # httpx client. The asyncio event loop does not allow connections to be shared. For more details, please
+            # refer to https://github.com/encode/httpx/discussions/2959.
+            client_to_use = openai.AsyncOpenAI(**self.client_args)
+
+        try:
             try:
-                response = await client.chat.completions.create(**request)
+                response = await client_to_use.chat.completions.create(**request)
             except openai.BadRequestError as e:
                 # Check if this is a context length exceeded error
                 if hasattr(e, "code") and e.code == "context_length_exceeded":
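
The removed comment (now kept only on the `else` branch) refers to httpx binding a connection pool to the event loop it is first used on; a small sketch of the pitfall that per-request client creation avoids (placeholder key, the actual calls are left commented out):

```python
import asyncio
import openai

# Anti-pattern: one AsyncOpenAI client shared across event loops. Each asyncio.run()
# starts a fresh loop, but the underlying httpx pool stays tied to the first loop,
# so the second run may fail or hang (see the httpx discussion 2959 linked above).
client = openai.AsyncOpenAI(api_key="sk-placeholder")

async def ask(question: str) -> str:
    response = await client.chat.completions.create(
        model="gpt-4o", messages=[{"role": "user", "content": question}]
    )
    return response.choices[0].message.content or ""

# asyncio.run(ask("first question"))   # runs on loop #1
# asyncio.run(ask("second question"))  # loop #2 would reuse loop #1's connections

# Safe default (what the model does when no client is injected): create the client
# inside the coroutine so its connections live and die with the current loop.
async def ask_safely(question: str) -> str:
    async with openai.AsyncOpenAI(api_key="sk-placeholder") as local_client:
        response = await local_client.chat.completions.create(
            model="gpt-4o", messages=[{"role": "user", "content": question}]
        )
        return response.choices[0].message.content or ""
```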
@@ -532,6 +564,11 @@ async def stream(
             if event and hasattr(event, "usage") and event.usage:
                 yield self.format_chunk({"chunk_type": "metadata", "data": event.usage})

+        finally:
+            # Only close the client if we created it (not injected)
+            if self._injected_client is None and hasattr(client_to_use, "close"):
+                await client_to_use.close()
+
         logger.debug("finished streaming response from model")

     def _stream_switch_content(self, data_type: str, prev_data_type: str | None) -> tuple[list[StreamEvent], str]:
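
Conversely, within a single long-lived event loop an injected client lets every request share one connection pool, retry policy, and networking configuration, and its owner closes it exactly once (sketch, reusing the assumed `OpenAIModel` name from above):

```python
import asyncio
import openai

from strands.models.openai import OpenAIModel  # assumed import path, as above

async def main() -> None:
    # One client, one event loop: pooling, retries, and proxy settings are
    # configured here once and reused by every request the model issues.
    client = openai.AsyncOpenAI(api_key="sk-placeholder", max_retries=5)
    model = OpenAIModel(client=client, model_id="gpt-4o")
    try:
        ...  # run as many model requests here as needed; none of them closes the client
    finally:
        # The model never closes an injected client, so its owner does, exactly once.
        await client.close()

asyncio.run(main())
```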
@@ -573,40 +610,56 @@ async def structured_output(
             ContextWindowOverflowException: If the input exceeds the model's context window.
             ModelThrottledException: If the request is throttled by OpenAI (rate limits).
         """
-        # We initialize an OpenAI context on every request so as to avoid connection sharing in the underlying httpx
-        # client. The asyncio event loop does not allow connections to be shared. For more details, please refer to
-        # https://github.com/encode/httpx/discussions/2959.
-        async with openai.AsyncOpenAI(**self.client_args) as client:
-            try:
-                response: ParsedChatCompletion = await client.beta.chat.completions.parse(
+        # Determine which client to use based on configuration
+        if self._injected_client is not None:
+            # Use the injected client (caller manages lifecycle)
+            client_to_use = self._injected_client
+        else:
+            # Create a new client from client_args.
+            # We initialize an OpenAI context on every request so as to avoid connection sharing in the underlying
+            # httpx client. The asyncio event loop does not allow connections to be shared. For more details, please
+            # refer to https://github.com/encode/httpx/discussions/2959.
+            client_to_use = openai.AsyncOpenAI(**self.client_args)
+
+        try:
+            if hasattr(client_to_use, "beta"):
+                response: ParsedChatCompletion = await client_to_use.beta.chat.completions.parse(
                     model=self.get_config()["model_id"],
                     messages=self.format_request(prompt, system_prompt=system_prompt)["messages"],
                     response_format=output_model,
                 )
-            except openai.BadRequestError as e:
-                # Check if this is a context length exceeded error
-                if hasattr(e, "code") and e.code == "context_length_exceeded":
-                    logger.warning("OpenAI threw context window overflow error")
-                    raise ContextWindowOverflowException(str(e)) from e
-                # Re-raise other BadRequestError exceptions
-                raise
-            except openai.RateLimitError as e:
-                # All rate limit errors should be treated as throttling, not context overflow
-                # Rate limits (including TPM) require waiting/retrying, not context reduction
-                logger.warning("OpenAI threw rate limit error")
-                raise ModelThrottledException(str(e)) from e
-
-            parsed: T | None = None
-            # Find the first choice with tool_calls
-            if len(response.choices) > 1:
-                raise ValueError("Multiple choices found in the OpenAI response.")
+            else:
+                # Without the beta API we cannot request parsed output from this client.
+                raise ValueError("The provided client does not expose the beta structured output API.")
+        except openai.BadRequestError as e:
+            # Check if this is a context length exceeded error
+            if hasattr(e, "code") and e.code == "context_length_exceeded":
+                logger.warning("OpenAI threw context window overflow error")
+                raise ContextWindowOverflowException(str(e)) from e
+            # Re-raise other BadRequestError exceptions
+            raise
+        except openai.RateLimitError as e:
+            # All rate limit errors should be treated as throttling, not context overflow
+            # Rate limits (including TPM) require waiting/retrying, not context reduction
+            logger.warning("OpenAI threw rate limit error")
+            raise ModelThrottledException(str(e)) from e
+        else:
+            parsed: T | None = None
+            # Find the first choice with tool_calls
+            if len(response.choices) > 1:
+                raise ValueError("Multiple choices found in the OpenAI response.")
+
+            for choice in response.choices:
+                if isinstance(choice.message.parsed, output_model):
+                    parsed = choice.message.parsed
+                    break

-            for choice in response.choices:
-                if isinstance(choice.message.parsed, output_model):
-                    parsed = choice.message.parsed
-                    break
+            if parsed:
+                yield {"output": parsed}
+            else:
+                raise ValueError("No valid tool use or tool use input was found in the OpenAI response.")

-            if parsed:
-                yield {"output": parsed}
-            else:
-                raise ValueError("No valid tool use or tool use input was found in the OpenAI response.")
+        finally:
+            # Only close the client if we created it (not injected)
+            if self._injected_client is None and hasattr(client_to_use, "close"):
+                await client_to_use.close()
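
A structured-output usage sketch against the injected-client path; the full `structured_output` signature and message shape are not shown in this diff, so the parameter names and prompt format below are assumptions:

```python
import asyncio
import openai
from pydantic import BaseModel

from strands.models.openai import OpenAIModel  # assumed import path, as above

class WeatherReport(BaseModel):
    city: str
    temperature_celsius: float

async def main() -> None:
    client = openai.AsyncOpenAI(api_key="sk-placeholder")
    model = OpenAIModel(client=client, model_id="gpt-4o")
    try:
        # structured_output() forwards the Pydantic model as response_format to
        # client.beta.chat.completions.parse() and yields the parsed instance.
        async for event in model.structured_output(
            WeatherReport,
            prompt=[{"role": "user", "content": [{"text": "What's the weather in Paris?"}]}],
        ):
            if "output" in event:
                report = event["output"]
                print(report.city, report.temperature_celsius)
    finally:
        await client.close()  # injected client, so the caller closes it

asyncio.run(main())
```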