From 7b332e8725e47efa485c551a6c56d3e114510d14 Mon Sep 17 00:00:00 2001
From: Alan
Date: Wed, 10 Dec 2025 16:55:34 -0600
Subject: [PATCH 1/5] Add Agent Evaluations print results on success flag

---
 src/google/adk/evaluation/agent_evaluator.py | 29 ++++++++++++++------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 514681bfa9..74a18758e1 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -113,6 +113,7 @@ async def evaluate_eval_set(
       num_runs: int = NUM_RUNS,
       agent_name: Optional[str] = None,
       print_detailed_results: bool = True,
+      print_detailed_results_on_success: bool = False,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -178,6 +179,7 @@ async def evaluate_eval_set(
       failures_per_eval_case = AgentEvaluator._process_metrics_and_get_failures(
           eval_metric_results=eval_metric_results,
           print_detailed_results=print_detailed_results,
+          print_detailed_results_on_success=print_detailed_results_on_success,
          agent_module=agent_name,
       )
 
@@ -200,6 +202,7 @@ async def evaluate(
       agent_name: Optional[str] = None,
       initial_session_file: Optional[str] = None,
       print_detailed_results: bool = True,
+      print_detailed_results_on_success: bool = False,
   ):
     """Evaluates an Agent given eval data.
 
@@ -245,6 +248,7 @@ async def evaluate(
           num_runs=num_runs,
           agent_name=agent_name,
           print_detailed_results=print_detailed_results,
+          print_detailed_results_on_success=print_detailed_results_on_success,
       )
 
   @staticmethod
@@ -648,6 +652,7 @@ def _get_eval_metric_results_with_invocation(
   def _process_metrics_and_get_failures(
       eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]],
       print_detailed_results: bool,
+      print_detailed_results_on_success: bool,
       agent_module: str,
   ) -> list[str]:
     """Returns a list of failures based on the score for each invocation."""
@@ -678,17 +683,25 @@ def _process_metrics_and_get_failures(
 
       # Gather all the failures.
       if overall_eval_status != EvalStatus.PASSED:
-        if print_detailed_results:
-          AgentEvaluator._print_details(
-              eval_metric_result_with_invocations=eval_metric_results_with_invocations,
-              overall_eval_status=overall_eval_status,
-              overall_score=overall_score,
-              metric_name=metric_name,
-              threshold=threshold,
-          )
+        should_print = print_detailed_results
         failures.append(
             f"{metric_name} for {agent_module} Failed. Expected {threshold},"
             f" but got {overall_score}."
         )
+      else:
+        should_print = (
+            print_detailed_results and print_detailed_results_on_success
+        )
+
+      if should_print:
+        AgentEvaluator._print_details(
+            eval_metric_result_with_invocations=(
+                eval_metric_results_with_invocations
+            ),
+            overall_eval_status=overall_eval_status,
+            overall_score=overall_score,
+            metric_name=metric_name,
+            threshold=threshold,
+        )
 
     return failures
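
Note (usage sketch, not part of the diff): a minimal example of opting in to
the new behavior through AgentEvaluator.evaluate. The module path and eval-set
file below are placeholders, and the agent_module / eval_dataset_file_path_or_dir
parameter names come from the existing signature rather than from this patch.

import asyncio

from google.adk.evaluation.agent_evaluator import AgentEvaluator


async def run_eval() -> None:
  # Placeholder module path and eval-set file; substitute real ones.
  await AgentEvaluator.evaluate(
      agent_module="my_agents.weather_agent",
      eval_dataset_file_path_or_dir="my_agents/weather_agent/evalset.test.json",
      num_runs=2,
      # New flag from this patch: also print per-invocation details when
      # every metric passes, instead of only on failure.
      print_detailed_results_on_success=True,
  )


if __name__ == "__main__":
  asyncio.run(run_eval())
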
From 25ec8a6478197418d70eb9d212b7d5ee14f909df Mon Sep 17 00:00:00 2001
From: Alan
Date: Wed, 10 Dec 2025 17:03:03 -0600
Subject: [PATCH 2/5] Add CLI option

---
 src/google/adk/cli/cli_tools_click.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index 66f4dbe455..82eeda6833 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -561,11 +561,22 @@ def wrapper(*args, **kwargs):
     help="Optional. Whether to print detailed results on console or not.",
 )
 @eval_options()
+@click.option(
+    "--print_detailed_results_on_success",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help=(
+        "Optional. Whether to print detailed results on console even if"
+        " the evaluation passed."
+    ),
+)
 def cli_eval(
     agent_module_file_path: str,
     eval_set_file_path_or_id: list[str],
     config_file_path: str,
     print_detailed_results: bool,
+    print_detailed_results_on_success: bool,
     eval_storage_uri: Optional[str] = None,
     log_level: str = "INFO",
 ):
@@ -780,10 +791,17 @@ def cli_eval(
   if print_detailed_results:
     for eval_result in eval_results:
       eval_result: EvalCaseResult
-      click.echo(
-          "********************************************************************"
+
+      should_print = (
+          eval_result.final_eval_status != EvalStatus.PASSED
+          or print_detailed_results_on_success
       )
-      pretty_print_eval_result(eval_result)
+
+      if should_print:
+        click.echo(
+            "********************************************************************"
+        )
+        pretty_print_eval_result(eval_result)
 
 
 @main.group("eval_set")
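
Note (usage sketch, not part of the diff): the new flag only has an effect when
detailed printing is enabled at all, because the per-case loop above is still
guarded by --print_detailed_results. A hypothetical invocation, with placeholder
paths for the agent module and eval set:

  adk eval path/to/agent_module.py path/to/evalset.test.json \
      --print_detailed_results \
      --print_detailed_results_on_success
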
From 5a58e88d162fbdda8e6a8fb83fdadbe30a2389e2 Mon Sep 17 00:00:00 2001
From: Alan
Date: Wed, 10 Dec 2025 17:18:31 -0600
Subject: [PATCH 3/5] Refactored per Gemini suggestions.

---
 src/google/adk/cli/cli_tools_click.py        |  7 +------
 src/google/adk/evaluation/agent_evaluator.py | 10 +++++-----
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index 82eeda6833..4db9c368dc 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -792,12 +792,7 @@ def cli_eval(
     for eval_result in eval_results:
       eval_result: EvalCaseResult
-      should_print = (
-          eval_result.final_eval_status != EvalStatus.PASSED
-          or print_detailed_results_on_success
-      )
-
-      if should_print:
+      if eval_result.final_eval_status != EvalStatus.PASSED or print_detailed_results_on_success:
         click.echo(
             "********************************************************************"
         )
         pretty_print_eval_result(eval_result)
diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 74a18758e1..dae5f7af0d 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -683,15 +683,15 @@ def _process_metrics_and_get_failures(
 
       # Gather all the failures.
       if overall_eval_status != EvalStatus.PASSED:
-        should_print = print_detailed_results
         failures.append(
             f"{metric_name} for {agent_module} Failed. Expected {threshold},"
             f" but got {overall_score}."
         )
-      else:
-        should_print = (
-            print_detailed_results and print_detailed_results_on_success
-        )
+
+      should_print = print_detailed_results and (
+          overall_eval_status != EvalStatus.PASSED
+          or print_detailed_results_on_success
+      )
 
       if should_print:
         AgentEvaluator._print_details(

From 852da24cdc33b0a7f66639835fd6d7567c22c3f0 Mon Sep 17 00:00:00 2001
From: Alan
Date: Thu, 11 Dec 2025 19:48:38 -0600
Subject: [PATCH 4/5] Format update from autoformat tool

---
 src/google/adk/cli/cli_tools_click.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index 4db9c368dc..4106315a6c 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -791,12 +791,15 @@ def cli_eval(
   if print_detailed_results:
     for eval_result in eval_results:
       eval_result: EvalCaseResult
-
-      if eval_result.final_eval_status != EvalStatus.PASSED or print_detailed_results_on_success:
-        click.echo(
-            "********************************************************************"
-        )
-        pretty_print_eval_result(eval_result)
+
+      if (
+          eval_result.final_eval_status != EvalStatus.PASSED
+          or print_detailed_results_on_success
+      ):
+        click.echo(
+            "********************************************************************"
+        )
+        pretty_print_eval_result(eval_result)
 
 
 @main.group("eval_set")

From 34a4ec728f7e88e96f9bbcb2dd3fc15432e18d35 Mon Sep 17 00:00:00 2001
From: Alan
Date: Thu, 11 Dec 2025 19:53:59 -0600
Subject: [PATCH 5/5] Format changes from autoformat.sh

---
 contributing/samples/gepa/experiment.py     | 1 -
 contributing/samples/gepa/run_experiment.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/contributing/samples/gepa/experiment.py b/contributing/samples/gepa/experiment.py
index 2f5d03a772..f68b349d9c 100644
--- a/contributing/samples/gepa/experiment.py
+++ b/contributing/samples/gepa/experiment.py
@@ -43,7 +43,6 @@
 from tau_bench.types import EnvRunResult
 from tau_bench.types import RunConfig
 import tau_bench_agent as tau_bench_agent_lib
-
 import utils
 
 
diff --git a/contributing/samples/gepa/run_experiment.py b/contributing/samples/gepa/run_experiment.py
index cfd850b3a3..1bc4ee58c8 100644
--- a/contributing/samples/gepa/run_experiment.py
+++ b/contributing/samples/gepa/run_experiment.py
@@ -25,7 +25,6 @@
 from absl import flags
 import experiment
 from google.genai import types
-
 import utils
 
 _OUTPUT_DIR = flags.DEFINE_string(
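
Note (reference sketch, not part of the diff): after patches 3 and 4, the
evaluator and the CLI apply the same decision rule for when to print details.
A self-contained restatement of that rule, using a stand-in enum in place of
the real EvalStatus from google.adk.evaluation:

from enum import Enum


class EvalStatus(Enum):
  # Stand-in for the ADK enum; only the PASSED / not-PASSED distinction
  # matters for this sketch.
  PASSED = 1
  FAILED = 2


def should_print_details(
    print_detailed_results: bool,
    print_detailed_results_on_success: bool,
    overall_eval_status: EvalStatus,
) -> bool:
  # Details are printed only when detailed printing is enabled, and then
  # either the case failed or the caller opted in to printing on success.
  return print_detailed_results and (
      overall_eval_status != EvalStatus.PASSED
      or print_detailed_results_on_success
  )


# Failures keep printing as before; passing cases print only with the new flag.
assert should_print_details(True, False, EvalStatus.FAILED)
assert not should_print_details(True, False, EvalStatus.PASSED)
assert should_print_details(True, True, EvalStatus.PASSED)
assert not should_print_details(False, True, EvalStatus.FAILED)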