1 change: 0 additions & 1 deletion contributing/samples/gepa/experiment.py
@@ -43,7 +43,6 @@
from tau_bench.types import EnvRunResult
from tau_bench.types import RunConfig
import tau_bench_agent as tau_bench_agent_lib

import utils


1 change: 0 additions & 1 deletion contributing/samples/gepa/run_experiment.py
@@ -25,7 +25,6 @@
from absl import flags
import experiment
from google.genai import types

import utils

_OUTPUT_DIR = flags.DEFINE_string(
16 changes: 16 additions & 0 deletions src/google/adk/cli/cli_tools_click.py
@@ -561,11 +561,22 @@ def wrapper(*args, **kwargs):
help="Optional. Whether to print detailed results on console or not.",
)
@eval_options()
@click.option(
"--print_detailed_results_on_success",
is_flag=True,
show_default=True,
default=False,
help=(
"Optional. Whether to print detailed results on console or not even if"
" the evaluation passed."
),
)
def cli_eval(
agent_module_file_path: str,
eval_set_file_path_or_id: list[str],
config_file_path: str,
print_detailed_results: bool,
print_detailed_results_on_success: bool,
eval_storage_uri: Optional[str] = None,
log_level: str = "INFO",
):
@@ -780,6 +791,11 @@ def cli_eval(
if print_detailed_results:
for eval_result in eval_results:
eval_result: EvalCaseResult

if (
eval_result.final_eval_status != EvalStatus.PASSED
or print_detailed_results_on_success
):
click.echo(
"********************************************************************"
)
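Taken together, the `cli_tools_click.py` changes mean per-case details are echoed only for eval cases that did not pass, unless the new `--print_detailed_results_on_success` flag is also given. A minimal sketch of exercising the command through click's test runner — the registration of `cli_eval` as the `eval` command on the module's `main` group, the positional argument order, and the file paths are assumptions for illustration, not shown in this diff:

```python
# Sketch only: group/command registration and argument order are assumed.
from click.testing import CliRunner

from google.adk.cli import cli_tools_click

runner = CliRunner()
result = runner.invoke(
    cli_tools_click.main,  # assumed: the module's top-level click group
    [
        "eval",
        "path/to/agent",                  # agent_module_file_path (placeholder)
        "path/to/set.evalset.json",       # eval_set_file_path_or_id (placeholder)
        "--print_detailed_results",
        # New flag from this PR: also echo details for cases that PASSED.
        "--print_detailed_results_on_success",
    ],
)
print(result.output)
```

Because the new option defaults to `False`, invocations that omit it keep the existing behavior of printing details only for failed cases.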
29 changes: 21 additions & 8 deletions src/google/adk/evaluation/agent_evaluator.py
@@ -113,6 +113,7 @@ async def evaluate_eval_set(
num_runs: int = NUM_RUNS,
agent_name: Optional[str] = None,
print_detailed_results: bool = True,
print_detailed_results_on_success: bool = False,
):
"""Evaluates an agent using the given EvalSet.

@@ -178,6 +179,7 @@
failures_per_eval_case = AgentEvaluator._process_metrics_and_get_failures(
eval_metric_results=eval_metric_results,
print_detailed_results=print_detailed_results,
print_detailed_results_on_success=print_detailed_results_on_success,
agent_module=agent_name,
)

@@ -200,6 +202,7 @@ async def evaluate(
agent_name: Optional[str] = None,
initial_session_file: Optional[str] = None,
print_detailed_results: bool = True,
print_detailed_results_on_success: bool = False,
):
"""Evaluates an Agent given eval data.

Expand Down Expand Up @@ -245,6 +248,7 @@ async def evaluate(
num_runs=num_runs,
agent_name=agent_name,
print_detailed_results=print_detailed_results,
print_detailed_results_on_success=print_detailed_results_on_success,
)

@staticmethod
@@ -648,6 +652,7 @@ def _get_eval_metric_results_with_invocation(
def _process_metrics_and_get_failures(
eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]],
print_detailed_results: bool,
print_detailed_results_on_success: bool,
agent_module: str,
) -> list[str]:
"""Returns a list of failures based on the score for each invocation."""
Expand Down Expand Up @@ -678,17 +683,25 @@ def _process_metrics_and_get_failures(

# Gather all the failures.
if overall_eval_status != EvalStatus.PASSED:
if print_detailed_results:
AgentEvaluator._print_details(
eval_metric_result_with_invocations=eval_metric_results_with_invocations,
overall_eval_status=overall_eval_status,
overall_score=overall_score,
metric_name=metric_name,
threshold=threshold,
)
failures.append(
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
f" but got {overall_score}."
)

should_print = print_detailed_results and (
overall_eval_status != EvalStatus.PASSED
or print_detailed_results_on_success
)

if should_print:
AgentEvaluator._print_details(
eval_metric_result_with_invocations=(
eval_metric_results_with_invocations
),
overall_eval_status=overall_eval_status,
overall_score=overall_score,
metric_name=metric_name,
threshold=threshold,
)

return failures
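For programmatic use, the same switch is threaded through `AgentEvaluator.evaluate` and `evaluate_eval_set`, defaulting to `False` so existing callers keep the current behavior; details are printed only when `print_detailed_results` is set and either the metric did not pass or the on-success flag is set. A minimal sketch of a test that opts in — every keyword other than the two `print_detailed_results*` parameters (the `agent_module` and `eval_dataset_file_path_or_dir` names, the fixture path, and the async test setup) is an assumption for illustration:

```python
# Sketch under assumptions: only the print_detailed_results* keywords
# are taken from this diff; the other parameter names are assumed.
import pytest

from google.adk.evaluation.agent_evaluator import AgentEvaluator


@pytest.mark.asyncio
async def test_eval_prints_details_even_on_success():
  await AgentEvaluator.evaluate(
      agent_module="my_agent",  # assumed parameter name and module
      eval_dataset_file_path_or_dir="fixture/my_agent/simple.test.json",  # assumed
      print_detailed_results=True,
      # New in this PR: print per-metric details even when the eval PASSED.
      print_detailed_results_on_success=True,
  )
```

Note that the failure list returned by `_process_metrics_and_get_failures` is unaffected by either printing flag; a non-passing metric is still recorded as a failure whether or not its details are printed.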