diff --git a/contributing/samples/gepa/experiment.py b/contributing/samples/gepa/experiment.py
index 2f5d03a772..f68b349d9c 100644
--- a/contributing/samples/gepa/experiment.py
+++ b/contributing/samples/gepa/experiment.py
@@ -43,7 +43,6 @@
 from tau_bench.types import EnvRunResult
 from tau_bench.types import RunConfig
 import tau_bench_agent as tau_bench_agent_lib
-
 import utils
diff --git a/contributing/samples/gepa/run_experiment.py b/contributing/samples/gepa/run_experiment.py
index cfd850b3a3..1bc4ee58c8 100644
--- a/contributing/samples/gepa/run_experiment.py
+++ b/contributing/samples/gepa/run_experiment.py
@@ -25,7 +25,6 @@
 from absl import flags
 import experiment
 from google.genai import types
-
 import utils

 _OUTPUT_DIR = flags.DEFINE_string(
diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index 66f4dbe455..4106315a6c 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -561,11 +561,22 @@ def wrapper(*args, **kwargs):
     help="Optional. Whether to print detailed results on console or not.",
 )
 @eval_options()
+@click.option(
+    "--print_detailed_results_on_success",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help=(
+        "Optional. Whether to print detailed results on console or not even if"
+        " the evaluation passed."
+    ),
+)
 def cli_eval(
     agent_module_file_path: str,
     eval_set_file_path_or_id: list[str],
     config_file_path: str,
     print_detailed_results: bool,
+    print_detailed_results_on_success: bool,
     eval_storage_uri: Optional[str] = None,
     log_level: str = "INFO",
 ):
@@ -780,6 +791,11 @@ def cli_eval(
   if print_detailed_results:
     for eval_result in eval_results:
       eval_result: EvalCaseResult
+
+      if (
+          eval_result.final_eval_status != EvalStatus.PASSED
+          or print_detailed_results_on_success
+      ):
         click.echo(
             "********************************************************************"
         )
diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 514681bfa9..dae5f7af0d 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -113,6 +113,7 @@ async def evaluate_eval_set(
       num_runs: int = NUM_RUNS,
       agent_name: Optional[str] = None,
       print_detailed_results: bool = True,
+      print_detailed_results_on_success: bool = False,
   ):
     """Evaluates an agent using the given EvalSet.
@@ -178,6 +179,7 @@ async def evaluate_eval_set(
     failures_per_eval_case = AgentEvaluator._process_metrics_and_get_failures(
         eval_metric_results=eval_metric_results,
         print_detailed_results=print_detailed_results,
+        print_detailed_results_on_success=print_detailed_results_on_success,
         agent_module=agent_name,
     )
@@ -200,6 +202,7 @@ async def evaluate(
       agent_name: Optional[str] = None,
       initial_session_file: Optional[str] = None,
       print_detailed_results: bool = True,
+      print_detailed_results_on_success: bool = False,
   ):
     """Evaluates an Agent given eval data.
@@ -245,6 +248,7 @@ async def evaluate(
         num_runs=num_runs,
         agent_name=agent_name,
         print_detailed_results=print_detailed_results,
+        print_detailed_results_on_success=print_detailed_results_on_success,
     )

   @staticmethod
@@ -648,6 +652,7 @@ def _get_eval_metric_results_with_invocation(
   def _process_metrics_and_get_failures(
       eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]],
       print_detailed_results: bool,
+      print_detailed_results_on_success: bool,
       agent_module: str,
   ) -> list[str]:
     """Returns a list of failures based on the score for each invocation."""
@@ -678,17 +683,25 @@ def _process_metrics_and_get_failures(
       # Gather all the failures.
       if overall_eval_status != EvalStatus.PASSED:
-        if print_detailed_results:
-          AgentEvaluator._print_details(
-              eval_metric_result_with_invocations=eval_metric_results_with_invocations,
-              overall_eval_status=overall_eval_status,
-              overall_score=overall_score,
-              metric_name=metric_name,
-              threshold=threshold,
-          )
         failures.append(
             f"{metric_name} for {agent_module} Failed. Expected {threshold},"
             f" but got {overall_score}."
         )
+      should_print = print_detailed_results and (
+          overall_eval_status != EvalStatus.PASSED
+          or print_detailed_results_on_success
+      )
+
+      if should_print:
+        AgentEvaluator._print_details(
+            eval_metric_result_with_invocations=(
+                eval_metric_results_with_invocations
+            ),
+            overall_eval_status=overall_eval_status,
+            overall_score=overall_score,
+            metric_name=metric_name,
+            threshold=threshold,
+        )
+
     return failures