diff --git a/src/uipath/_cli/_evals/_configurable_factory.py b/src/uipath/_cli/_evals/_configurable_factory.py
new file mode 100644
index 000000000..6ae473dc1
--- /dev/null
+++ b/src/uipath/_cli/_evals/_configurable_factory.py
@@ -0,0 +1,167 @@
+"""Configurable runtime factory that supports model settings overrides."""
+
+import json
+import logging
+import os
+import tempfile
+from pathlib import Path
+
+from uipath.runtime import UiPathRuntimeFactoryProtocol, UiPathRuntimeProtocol
+
+from ._models._evaluation_set import EvaluationSetModelSettings
+
+logger = logging.getLogger(__name__)
+
+
+class ConfigurableRuntimeFactory:
+    """Wrapper factory that supports model settings overrides for evaluation runs.
+
+    This factory wraps an existing UiPathRuntimeFactoryProtocol implementation
+    and allows applying model settings overrides when creating runtimes.
+    """
+
+    def __init__(self, base_factory: UiPathRuntimeFactoryProtocol):
+        """Initialize with a base factory to wrap."""
+        self.base_factory = base_factory
+        self.model_settings_override: EvaluationSetModelSettings | None = None
+        self._temp_files: list[str] = []
+
+    def set_model_settings_override(
+        self, settings: EvaluationSetModelSettings | None
+    ) -> None:
+        """Set model settings to override when creating runtimes.
+
+        Args:
+            settings: The model settings to apply, or None to clear overrides
+        """
+        self.model_settings_override = settings
+
+    async def new_runtime(
+        self, entrypoint: str, runtime_id: str
+    ) -> UiPathRuntimeProtocol:
+        """Create a new runtime with optional model settings overrides.
+
+        If model settings override is configured, creates a temporary modified
+        entrypoint file with the overridden settings.
+
+        Args:
+            entrypoint: Path to the agent entrypoint file
+            runtime_id: Unique identifier for the runtime instance
+
+        Returns:
+            A new runtime instance with overrides applied if configured
+        """
+        # If no overrides, delegate directly to base factory
+        if not self.model_settings_override:
+            return await self.base_factory.new_runtime(entrypoint, runtime_id)
+
+        # Apply overrides by creating modified entrypoint
+        modified_entrypoint = self._apply_overrides(
+            entrypoint, self.model_settings_override
+        )
+        if modified_entrypoint:
+            # Track temp file for cleanup
+            self._temp_files.append(modified_entrypoint)
+            return await self.base_factory.new_runtime(modified_entrypoint, runtime_id)
+
+        # If override failed, fall back to original
+        return await self.base_factory.new_runtime(entrypoint, runtime_id)
+
+    def _apply_overrides(
+        self, entrypoint: str, settings: EvaluationSetModelSettings
+    ) -> str | None:
+        """Apply model settings overrides to an agent entrypoint.
+
+        Creates a temporary modified version of the entrypoint file with
+        the specified model settings overrides applied.
+
+        Args:
+            entrypoint: Path to the original entrypoint file
+            settings: Model settings to override
+
+        Returns:
+            Path to temporary modified entrypoint, or None if override not needed/failed
+        """
+        if (
+            settings.model == "same-as-agent"
+            and settings.temperature == "same-as-agent"
+        ):
+            logger.debug(
+                "Both model and temperature are 'same-as-agent', no override needed"
+            )
+            return None
+
+        entrypoint_path = Path(entrypoint)
+        if not entrypoint_path.exists():
+            logger.warning(f"Entrypoint file '{entrypoint_path}' not found")
+            return None
+
+        try:
+            with open(entrypoint_path, "r") as f:
+                agent_data = json.load(f)
+        except (json.JSONDecodeError, IOError) as e:
+            logger.error(f"Failed to load entrypoint file: {e}")
+            return None
+
+        original_settings = agent_data.get("settings", {})
+        modified_settings = original_settings.copy()
+
+        # Override model if not "same-as-agent"
+        if settings.model != "same-as-agent":
+            modified_settings["model"] = settings.model
+            logger.debug(
+                f"Overriding model: {original_settings.get('model')} -> {settings.model}"
+            )
+
+        # Override temperature if not "same-as-agent"
+        if settings.temperature not in ["same-as-agent", None]:
+            if isinstance(settings.temperature, (int, float)):
+                modified_settings["temperature"] = float(settings.temperature)
+            elif isinstance(settings.temperature, str):
+                try:
+                    modified_settings["temperature"] = float(settings.temperature)
+                except ValueError:
+                    logger.warning(
+                        f"Invalid temperature value: '{settings.temperature}'"
+                    )
+
+            if "temperature" in modified_settings:
+                logger.debug(
+                    f"Overriding temperature: {original_settings.get('temperature')} -> "
+                    f"{modified_settings['temperature']}"
+                )
+
+        if modified_settings == original_settings:
+            return None
+
+        agent_data["settings"] = modified_settings
+
+        # Create a temporary file with the modified agent definition
+        try:
+            temp_fd, temp_path = tempfile.mkstemp(
+                suffix=".json", prefix="agent_override_"
+            )
+            with os.fdopen(temp_fd, "w") as temp_file:
+                json.dump(agent_data, temp_file, indent=2)
+
+            logger.info(f"Created temporary entrypoint with overrides: {temp_path}")
+            return temp_path
+        except Exception as e:
+            logger.error(f"Failed to create temporary entrypoint file: {e}")
+            return None
+
+    async def dispose(self) -> None:
+        """Dispose resources and clean up temporary files."""
+        # Clean up any temporary files created
+        for temp_file in self._temp_files:
+            try:
+                os.unlink(temp_file)
+                logger.debug(f"Cleaned up temporary file: {temp_file}")
+            except Exception as e:
+                logger.warning(f"Failed to clean up temporary file {temp_file}: {e}")
+
+        self._temp_files.clear()
+
+        # Delegate disposal to base factory
+        if hasattr(self.base_factory, "dispose"):
+            await self.base_factory.dispose()
diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py
index 78a2fb6f5..5f6a2952a 100644
--- a/src/uipath/_cli/_evals/_models/_evaluation_set.py
+++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py
@@ -70,7 +70,7 @@ class ModelSettings(BaseModel):
     """Model Generation Parameters."""
 
     model: str = Field(..., alias="model")
-    temperature: float | None = Field(default=None, alias="temperature")
+    temperature: float | str | None = Field(default=None, alias="temperature")
     top_p: float | None = Field(default=None, alias="topP")
     top_k: int | None = Field(default=None, alias="topK")
     frequency_penalty: float | None = Field(default=None, alias="frequencyPenalty")
@@ -78,6 +78,12 @@
     max_tokens: int | None = Field(default=None, alias="maxTokens")
 
 
+class EvaluationSetModelSettings(ModelSettings):
+    """Model setting overrides within evaluation sets with ID."""
+
+    id: str = Field(..., alias="id")
+
+
 class LLMMockingStrategy(BaseMockingStrategy):
     type: Literal[MockingStrategyType.LLM] = MockingStrategyType.LLM
     prompt: str = Field(..., alias="prompt")
@@ -211,6 +217,9 @@ class EvaluationSet(BaseModel):
         default_factory=list, alias="evaluatorConfigs"
     )
     evaluations: list[EvaluationItem] = Field(default_factory=list)
+    model_settings: list[EvaluationSetModelSettings] = Field(
+        default_factory=list, alias="modelSettings"
+    )
 
     def extract_selected_evals(self, eval_ids) -> None:
         selected_evals: list[EvaluationItem] = []
@@ -239,7 +248,7 @@ class LegacyEvaluationSet(BaseModel):
     name: str
     batch_size: int = Field(10, alias="batchSize")
    timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
-    model_settings: list[dict[str, Any]] = Field(
+    model_settings: list[EvaluationSetModelSettings] = Field(
         default_factory=list, alias="modelSettings"
     )
     created_at: str = Field(alias="createdAt")
diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index e3b104735..1dccee744 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -56,6 +56,7 @@
 from ...eval.models.models import AgentExecution, EvalItemResult
 from .._utils._eval_set import EvalHelpers
 from .._utils._parallelization import execute_parallel
+from ._configurable_factory import ConfigurableRuntimeFactory
 from ._evaluator_factory import EvaluatorFactory
 from ._models._evaluation_set import (
     EvaluationItem,
@@ -184,6 +185,7 @@ class UiPathEvalContext:
     verbose: bool = False
     enable_mocker_cache: bool = False
     report_coverage: bool = False
+    model_settings_id: str = "default"
 
 
 class UiPathEvalRuntime:
@@ -197,7 +199,8 @@ def __init__(
         self,
         event_bus: EventBus,
     ):
         self.context: UiPathEvalContext = context
-        self.factory: UiPathRuntimeFactoryProtocol = factory
+        # Wrap the factory to support model settings overrides
+        self.factory = ConfigurableRuntimeFactory(factory)
         self.event_bus: EventBus = event_bus
         self.trace_manager: UiPathTraceManager = trace_manager
         self.span_exporter: ExecutionSpanExporter = ExecutionSpanExporter()
@@ -222,6 +225,10 @@ async def __aexit__(self, *args: Any) -> None:
             self.coverage.stop()
             self.coverage.report(include=["./*"], show_missing=True)
 
+        # Clean up any temporary files created by the factory
+        if hasattr(self.factory, "dispose"):
+            await self.factory.dispose()
+
     async def get_schema(self, runtime: UiPathRuntimeProtocol) -> UiPathRuntimeSchema:
         schema = await runtime.get_schema()
         if schema is None:
@@ -550,12 +557,61 @@ def _get_and_clear_execution_data(
 
         return spans, logs
 
+    async def _configure_model_settings_override(self) -> None:
+        """Configure the factory with model settings override if specified."""
+        # Skip if no model settings ID specified
+        if (
+            not self.context.model_settings_id
+            or self.context.model_settings_id == "default"
+        ):
+            return
+
+        # Load evaluation set to get model settings
+        evaluation_set, _ = EvalHelpers.load_eval_set(self.context.eval_set or "")
+        if (
+            not hasattr(evaluation_set, "model_settings")
+            or not evaluation_set.model_settings
+        ):
+            logger.warning("No model settings available in evaluation set")
+            return
+
+        # Find the specified model settings
+        target_model_settings = next(
+            (
+                ms
+                for ms in evaluation_set.model_settings
+                if ms.id == self.context.model_settings_id
+            ),
+            None,
+        )
+
+        if not target_model_settings:
+            logger.warning(
+                f"Model settings ID '{self.context.model_settings_id}' not found in evaluation set"
+            )
+            return
+
+        logger.info(
+            f"Configuring model settings override: id='{target_model_settings.id}', "
+            f"model='{target_model_settings.model}', temperature='{target_model_settings.temperature}'"
+        )
+
+        # Configure the factory with the override settings
+        self.factory.set_model_settings_override(target_model_settings)
+
     async def execute_runtime(
         self,
         eval_item: EvaluationItem,
         execution_id: str,
         runtime: UiPathRuntimeProtocol,
     ) -> UiPathEvalRunExecutionOutput:
+        # Apply model settings override if specified
+        await self._configure_model_settings_override()
+
+        runtime = await self.factory.new_runtime(
+            entrypoint=self.context.entrypoint or "",
+            runtime_id=execution_id,
+        )
         log_handler = self._setup_execution_logging(execution_id)
         attributes = {
             "evalId": eval_item.id,
diff --git a/src/uipath/_cli/_utils/_eval_set.py b/src/uipath/_cli/_utils/_eval_set.py
index ffb352be0..3bd7dffd4 100644
--- a/src/uipath/_cli/_utils/_eval_set.py
+++ b/src/uipath/_cli/_utils/_eval_set.py
@@ -149,6 +149,7 @@ def migrate_evaluation_item(
                 migrate_evaluation_item(evaluation, eval_set.evaluator_refs)
                 for evaluation in eval_set.evaluations
             ],
+            model_settings=eval_set.model_settings,
         )
     except ValidationError as e:
        raise ValueError(
diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py
index 04b42fef9..736d82ae8 100644
--- a/src/uipath/_cli/cli_eval.py
+++ b/src/uipath/_cli/cli_eval.py
@@ -92,6 +92,12 @@ def setup_reporting_prereq(no_report: bool) -> bool:
     default=False,
     help="Report evaluation coverage",
 )
+@click.option(
+    "--model-settings-id",
+    type=str,
+    default="default",
+    help="Model settings ID from evaluation set to override agent settings (default: 'default')",
+)
 def eval(
     entrypoint: str | None,
     eval_set: str | None,
@@ -102,6 +108,7 @@ def eval(
     output_file: str | None,
     enable_mocker_cache: bool,
     report_coverage: bool,
+    model_settings_id: str,
 ) -> None:
     """Run an evaluation set against the agent.
 
@@ -114,6 +121,7 @@ def eval(
         no_report: Do not report the evaluation results
         enable_mocker_cache: Enable caching for LLM mocker responses
         report_coverage: Report evaluation coverage
+        model_settings_id: Model settings ID to override agent settings
     """
 
     should_register_progress_reporter = setup_reporting_prereq(no_report)
@@ -148,6 +156,7 @@ def eval(
     eval_context.eval_set = resolved_eval_set_path
     eval_context.eval_ids = eval_ids
     eval_context.report_coverage = report_coverage
+    eval_context.model_settings_id = model_settings_id
 
     try:
diff --git a/tests/cli/eval/test_configurable_factory.py b/tests/cli/eval/test_configurable_factory.py
new file mode 100644
index 000000000..690ef90aa
--- /dev/null
+++ b/tests/cli/eval/test_configurable_factory.py
@@ -0,0 +1,183 @@
+"""Tests for ConfigurableRuntimeFactory."""
+
+import json
+import tempfile
+from pathlib import Path
+from unittest.mock import AsyncMock, Mock
+
+import pytest
+
+from uipath._cli._evals._configurable_factory import ConfigurableRuntimeFactory
+from uipath._cli._evals._models._evaluation_set import EvaluationSetModelSettings
+
+
+@pytest.mark.asyncio
+async def test_configurable_factory_no_override():
+    """Test factory without any overrides."""
+    mock_base_factory = AsyncMock()
+    mock_runtime = Mock()
+    mock_base_factory.new_runtime.return_value = mock_runtime
+
+    factory = ConfigurableRuntimeFactory(mock_base_factory)
+
+    result = await factory.new_runtime("test.json", "test-id")
+
+    assert result == mock_runtime
+    mock_base_factory.new_runtime.assert_called_once_with("test.json", "test-id")
+
+
+@pytest.mark.asyncio
+async def test_configurable_factory_with_model_override():
+    """Test factory with model override."""
+    # Create a temporary agent.json file
+    test_agent = {"settings": {"model": "gpt-4", "temperature": 0.7}}
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(test_agent, f)
+        temp_path = f.name
+
+    try:
+        mock_base_factory = AsyncMock()
+        mock_runtime = Mock()
+        mock_base_factory.new_runtime.return_value = mock_runtime
+
+        factory = ConfigurableRuntimeFactory(mock_base_factory)
+
+        # Set model override
+        settings = EvaluationSetModelSettings(
+            id="test-settings", model="gpt-3.5-turbo", temperature="same-as-agent"
+        )
+        factory.set_model_settings_override(settings)
+
+        result = await factory.new_runtime(temp_path, "test-id")
+
+        assert result == mock_runtime
+        # Should have been called with a modified temp file
+        call_args = mock_base_factory.new_runtime.call_args
+        assert call_args[0][0] != temp_path  # Different path (temp file)
+        assert call_args[0][1] == "test-id"
+
+        # Verify the temp file has correct content
+        with open(call_args[0][0]) as f:
+            modified_data = json.load(f)
+            assert modified_data["settings"]["model"] == "gpt-3.5-turbo"
+            assert modified_data["settings"]["temperature"] == 0.7  # Unchanged
+
+    finally:
+        Path(temp_path).unlink(missing_ok=True)
+        # Clean up temp files created by factory
+        await factory.dispose()
+
+
+@pytest.mark.asyncio
+async def test_configurable_factory_same_as_agent():
+    """Test factory when both settings are 'same-as-agent'."""
+    # Create a temporary agent.json file
+    test_agent = {"settings": {"model": "gpt-4", "temperature": 0.7}}
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(test_agent, f)
+        temp_path = f.name
+
+    try:
+        mock_base_factory = AsyncMock()
+        mock_runtime = Mock()
+        mock_base_factory.new_runtime.return_value = mock_runtime
+
+        factory = ConfigurableRuntimeFactory(mock_base_factory)
+
+        # Set "same-as-agent" for both
+        settings = EvaluationSetModelSettings(
+            id="test-settings", model="same-as-agent", temperature="same-as-agent"
+        )
+        factory.set_model_settings_override(settings)
+
+        result = await factory.new_runtime(temp_path, "test-id")
+
+        assert result == mock_runtime
+        # Should use original path (no override)
+        mock_base_factory.new_runtime.assert_called_once_with(temp_path, "test-id")
+
+    finally:
+        Path(temp_path).unlink(missing_ok=True)
+
+
+@pytest.mark.asyncio
+async def test_configurable_factory_temperature_override():
+    """Test factory with temperature override."""
+    # Create a temporary agent.json file
+    test_agent = {"settings": {"model": "gpt-4", "temperature": 0.7}}
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(test_agent, f)
+        temp_path = f.name
+
+    try:
+        mock_base_factory = AsyncMock()
+        mock_runtime = Mock()
+        mock_base_factory.new_runtime.return_value = mock_runtime
+
+        factory = ConfigurableRuntimeFactory(mock_base_factory)
+
+        # Set temperature override
+        settings = EvaluationSetModelSettings(
+            id="test-settings", model="same-as-agent", temperature=0.2
+        )
+        factory.set_model_settings_override(settings)
+
+        result = await factory.new_runtime(temp_path, "test-id")
+
+        assert result == mock_runtime
+        # Should have been called with a modified temp file
+        call_args = mock_base_factory.new_runtime.call_args
+        assert call_args[0][0] != temp_path  # Different path (temp file)
+
+        # Verify the temp file has correct content
+        with open(call_args[0][0]) as f:
+            modified_data = json.load(f)
+            assert modified_data["settings"]["model"] == "gpt-4"  # Unchanged
+            assert modified_data["settings"]["temperature"] == 0.2  # Changed
+
+    finally:
+        Path(temp_path).unlink(missing_ok=True)
+        await factory.dispose()
+
+
+@pytest.mark.asyncio
+async def test_configurable_factory_cleanup():
+    """Test that temporary files are cleaned up."""
+    test_agent = {"settings": {"model": "gpt-4", "temperature": 0.7}}
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(test_agent, f)
+        temp_path = f.name
+
+    try:
+        mock_base_factory = AsyncMock()
+        mock_runtime = Mock()
+        mock_base_factory.new_runtime.return_value = mock_runtime
+
+        factory = ConfigurableRuntimeFactory(mock_base_factory)
+
+        settings = EvaluationSetModelSettings(
+            id="test-settings", model="gpt-3.5-turbo", temperature=0.5
+        )
+        factory.set_model_settings_override(settings)
+
+        await factory.new_runtime(temp_path, "test-id")
+
+        # Get the temp file created
+        call_args = mock_base_factory.new_runtime.call_args
+        temp_file_created = call_args[0][0]
+
+        # Temp file should exist
+        assert Path(temp_file_created).exists()
+
+        # Clean up
+        await factory.dispose()
+
+        # Temp file should be deleted
+        assert not Path(temp_file_created).exists()
+
+    finally:
+        Path(temp_path).unlink(missing_ok=True)
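
Usage sketch (illustrative, not part of the patch): with these changes an evaluation set can declare named model settings under "modelSettings" and a run can select one by id through the new --model-settings-id flag. The fragment below uses a hypothetical settings id and assumes the command is exposed as `uipath eval`; combine it with the existing eval-set selection options. A temperature of "same-as-agent" keeps the agent's own value, mirroring the sentinel handled in ConfigurableRuntimeFactory._apply_overrides.

    "modelSettings": [
        {
            "id": "low-temp",
            "model": "same-as-agent",
            "temperature": 0.2
        }
    ]

    uipath eval --model-settings-id low-temp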