diff --git a/contributing/samples/gepa/experiment.py b/contributing/samples/gepa/experiment.py
index f3751206a8..2710c3894c 100644
--- a/contributing/samples/gepa/experiment.py
+++ b/contributing/samples/gepa/experiment.py
@@ -43,7 +43,6 @@
 from tau_bench.types import EnvRunResult
 from tau_bench.types import RunConfig
 import tau_bench_agent as tau_bench_agent_lib
-
 import utils
 
 
diff --git a/contributing/samples/gepa/run_experiment.py b/contributing/samples/gepa/run_experiment.py
index d857da9635..e31db15788 100644
--- a/contributing/samples/gepa/run_experiment.py
+++ b/contributing/samples/gepa/run_experiment.py
@@ -25,7 +25,6 @@
 from absl import flags
 import experiment
 from google.genai import types
-
 import utils
 
 _OUTPUT_DIR = flags.DEFINE_string(
diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index f8fb6795aa..fafd1d28c6 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -280,7 +280,7 @@ def convert_events_to_eval_invocations(
     invocations = []
     for invocation_id, events in events_by_invocation_id.items():
       final_response = None
-      final_event = None
+      final_event: Optional[Event] = None
       user_content = Content(parts=[])
       invocation_timestamp = 0
       app_details = None
@@ -315,7 +315,9 @@ def convert_events_to_eval_invocations(
       invocation_events = [
           InvocationEvent(author=e.author, content=e.content)
           for e in events_to_add
-          if e is not final_event
+          if final_event is None
+          or e is not final_event
+          or e.get_function_calls()
       ]
       invocations.append(
           Invocation(
diff --git a/src/google/adk/flows/llm_flows/functions.py b/src/google/adk/flows/llm_flows/functions.py
index eda8474c01..f9393c4986 100644
--- a/src/google/adk/flows/llm_flows/functions.py
+++ b/src/google/adk/flows/llm_flows/functions.py
@@ -68,6 +68,13 @@
 _TOOL_THREAD_POOLS: dict[int, ThreadPoolExecutor] = {}
 _TOOL_THREAD_POOL_LOCK = threading.Lock()
 
+# Sentinel object used to distinguish a FunctionTool that legitimately returns
+# None from a non-FunctionTool sync tool that skips thread-pool execution.
+# Using None as a sentinel would cause tools whose underlying function has no
+# explicit return statement (implicit None) to fall through to the async
+# fallback path and execute a second time.
+_SYNC_TOOL_RESULT_UNSET = object()
+
 
 def _is_live_request_queue_annotation(param: inspect.Parameter) -> bool:
   """Check whether a parameter is annotated as LiveRequestQueue.
@@ -159,13 +166,14 @@ def run_sync_tool():
        }
        return tool.func(**args_to_call)
      else:
-        # For other sync tool types, we can't easily run them in thread pool
-        return None
+        # For other sync tool types, we can't easily run them in thread pool.
+        # Return the sentinel so the caller knows to fall back to run_async.
+        return _SYNC_TOOL_RESULT_UNSET
 
    result = await loop.run_in_executor(
        executor, lambda: ctx.run(run_sync_tool)
    )
-    if result is not _SYNC_TOOL_RESULT_UNSET:
-    if result is not None:
+    if result is not _SYNC_TOOL_RESULT_UNSET:
      return result
  else:
    # For async tools, run them in a new event loop in a background thread.
diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py
index a4aa8691fd..122d3b7c97 100644
--- a/tests/unittests/evaluation/test_evaluation_generator.py
+++ b/tests/unittests/evaluation/test_evaluation_generator.py
@@ -16,12 +16,14 @@
 from google.adk.evaluation.app_details import AgentDetails
 from google.adk.evaluation.app_details import AppDetails
+from google.adk.evaluation.eval_case import get_all_tool_calls
 from google.adk.evaluation.evaluation_generator import EvaluationGenerator
 from google.adk.evaluation.request_intercepter_plugin import _RequestIntercepterPlugin
 from google.adk.evaluation.simulation.user_simulator import NextUserMessage
 from google.adk.evaluation.simulation.user_simulator import Status as UserSimulatorStatus
 from google.adk.evaluation.simulation.user_simulator import UserSimulator
 from google.adk.events.event import Event
+from google.adk.events.event_actions import EventActions
 from google.adk.models.llm_request import LlmRequest
 from google.genai import types
 import pytest
@@ -479,3 +481,48 @@ async def mock_generate_inferences_side_effect(
   mock_generate_inferences.assert_called_once()
   called_with_content = mock_generate_inferences.call_args.args[3]
   assert called_with_content.parts[0].text == "message 1"
+
+
+def test_convert_events_preserves_tool_calls_when_skip_summarization():
+  """Regression test for #5410.
+
+  When an event has skip_summarization=True, is_final_response() returns True
+  even if the event contains function calls. Previously such an event was
+  treated as final_event and excluded from invocation_events, causing
+  get_all_tool_calls() to return an empty list and tool_trajectory_avg_score
+  to always be 0.0 despite matching tool calls.
+ """ + events = [ + Event( + invocation_id="inv1", + author="user", + content=types.Content( + parts=[types.Part(text="run a query")], role="user" + ), + timestamp=1000.0, + ), + Event( + invocation_id="inv1", + author="agent", + content=types.Content( + parts=[ + types.Part( + function_call=types.FunctionCall( + id="call_01", + name="execute_sql", + args={"project_id": "my-proj", "query": "SELECT 1"}, + ) + ) + ] + ), + actions=EventActions(skip_summarization=True), + ), + ] + + invocations = EvaluationGenerator.convert_events_to_eval_invocations(events) + assert len(invocations) == 1 + + tool_calls = get_all_tool_calls(invocations[0].intermediate_data) + assert len(tool_calls) == 1 + assert tool_calls[0].name == "execute_sql" + assert tool_calls[0].args == {"project_id": "my-proj", "query": "SELECT 1"} diff --git a/tests/unittests/evaluation/test_trajectory_evaluator.py b/tests/unittests/evaluation/test_trajectory_evaluator.py index 0fa3fa5a73..8a3dae02a7 100644 --- a/tests/unittests/evaluation/test_trajectory_evaluator.py +++ b/tests/unittests/evaluation/test_trajectory_evaluator.py @@ -16,6 +16,8 @@ from google.adk.evaluation.eval_case import IntermediateData from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_case import InvocationEvent +from google.adk.evaluation.eval_case import InvocationEvents from google.adk.evaluation.eval_metrics import EvalMetric from google.adk.evaluation.eval_metrics import PrebuiltMetrics from google.adk.evaluation.eval_metrics import ToolTrajectoryCriterion @@ -462,3 +464,62 @@ def test_evaluate_invocations_no_invocations(evaluator: TrajectoryEvaluator): assert result.overall_score is None assert result.overall_eval_status == EvalStatus.NOT_EVALUATED assert not result.per_invocation_results + + +def _make_invocation_events( + *tool_calls: genai_types.FunctionCall, +) -> Invocation: + """Returns an Invocation using InvocationEvents intermediate_data format.""" + return Invocation( + user_content=_USER_CONTENT, + intermediate_data=InvocationEvents( + invocation_events=[ + InvocationEvent( + author="agent", + content=genai_types.Content( + parts=[genai_types.Part(function_call=tc)] + ), + ) + for tc in tool_calls + ] + ), + ) + + +def test_evaluate_invocations_invocation_events_format_exact_match( + evaluator: TrajectoryEvaluator, +): + """InvocationEvents intermediate_data format should score 1.0 on exact match. + + Regression test for #5410: tool_trajectory_avg_score returned 0.0 even when + tool name and args were identical because function-call events with + skip_summarization=True were incorrectly excluded from invocation_events. 
+ """ + tool_call = genai_types.FunctionCall( + id="toolu_01", name="execute_sql", args={"query": "SELECT 1"} + ) + expected_tool_call = genai_types.FunctionCall( + name="execute_sql", args={"query": "SELECT 1"} + ) + actual = _make_invocation_events(tool_call) + expected = _make_invocation_events(expected_tool_call) + + result = evaluator.evaluate_invocations([actual], [expected]) + assert result.overall_score == 1.0 + assert result.overall_eval_status == EvalStatus.PASSED + + +def test_evaluate_invocations_invocation_events_format_mismatch( + evaluator: TrajectoryEvaluator, +): + """InvocationEvents format should score 0.0 when tool calls differ.""" + actual = _make_invocation_events( + genai_types.FunctionCall(name="tool_a", args={"x": "1"}) + ) + expected = _make_invocation_events( + genai_types.FunctionCall(name="tool_b", args={"x": "1"}) + ) + + result = evaluator.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + assert result.overall_eval_status == EvalStatus.FAILED diff --git a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py index e134a555c1..f129d7c3c3 100644 --- a/tests/unittests/flows/llm_flows/test_functions_thread_pool.py +++ b/tests/unittests/flows/llm_flows/test_functions_thread_pool.py @@ -277,6 +277,56 @@ def blocking_sleep() -> dict: event_loop_ticks >= 5 ), f'Event loop should have ticked at least 5 times, got {event_loop_ticks}' + @pytest.mark.asyncio + @pytest.mark.parametrize( + 'return_value,use_implicit_return', + [ + (None, True), # implicit None (no return statement) + (None, False), # explicit `return None` + (0, False), # falsy int + ('', False), # falsy str + ({}, False), # falsy dict + (False, False), # falsy bool + ], + ) + async def test_sync_tool_falsy_return_executes_exactly_once( + self, return_value, use_implicit_return + ): + """FunctionTools returning None or other falsy values must execute exactly once. + + Regression test for https://github.com/google/adk-python/issues/5284. + Previously, a None return was mistaken for the internal sentinel used to + signal 'non-FunctionTool, fall back to run_async', causing a second + invocation. The fix uses an identity-based sentinel so that None and other + falsy values (0, '', {}, False) are treated as valid results. + """ + call_count = 0 + + def sync_func(): + nonlocal call_count + call_count += 1 + if not use_implicit_return: + return return_value + # implicit None — no return statement + + tool = FunctionTool(sync_func) + model = testing_utils.MockModel.create(responses=[]) + agent = Agent(name='test_agent', model=model, tools=[tool]) + invocation_context = await testing_utils.create_invocation_context( + agent=agent, user_content='' + ) + tool_context = ToolContext( + invocation_context=invocation_context, + function_call_id='test_id', + ) + + result = await _call_tool_in_thread_pool(tool, {}, tool_context) + + assert result == return_value + assert ( + call_count == 1 + ), f'Tool function executed {call_count} time(s); expected exactly 1.' + @pytest.mark.asyncio async def test_sync_tool_exception_propagates(self): """Test that exceptions from sync tools propagate correctly."""