Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion src/google/adk/evaluation/final_response_match_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@

from __future__ import annotations

import re
from typing import Optional
import unicodedata

from google.genai import types as genai_types
from typing_extensions import override
Expand Down Expand Up @@ -109,6 +111,63 @@ def _get_eval_status(score: float, threshold: float):
return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED


class _UnicodeTokenizer:
"""A tokenizer that handles Unicode text for non-Latin scripts.

The default rouge_scorer tokenizer only works with ASCII characters,
returning empty token lists for non-Latin scripts like Thai, Chinese,
Arabic, etc. This tokenizer uses Unicode-aware regex to properly
tokenize text in any script.
"""

def tokenize(self, text: str) -> list[str]:
"""Tokenizes text using Unicode-aware word boundaries.

Args:
text: The text to tokenize.

Returns:
A list of tokens (words) from the text.
"""
return re.findall(r"\w+", text, re.UNICODE)


def _is_latin_script(text: str) -> bool:
"""Checks if text is primarily Latin script.

This is used to determine whether to apply English-specific stemming.
Latin script includes English, Portuguese, Spanish, French, German, etc.
Non-Latin scripts include Thai, Chinese, Arabic, Japanese, Korean, etc.

Args:
text: The text to analyze.

Returns:
True if the text is primarily Latin script, False otherwise.
"""
if not text:
return True

latin_chars = 0
letter_chars = 0

for char in text:
# Check if character is a letter (category starts with 'L')
if unicodedata.category(char).startswith("L"):
letter_chars += 1
# Check if it's a Latin character by looking at its Unicode name
char_name = unicodedata.name(char, "")
if "LATIN" in char_name:
latin_chars += 1

# If no letters found, default to Latin (likely punctuation/numbers only)
if letter_chars == 0:
return True

# Consider text as Latin if more than 50% of letters are Latin
return latin_chars / letter_chars > 0.5


def _calculate_rouge_1_scores(candidate: str, reference: str):
"""Calculates the ROUGE-1 score between a candidate and reference text.

Expand All @@ -120,14 +179,32 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
candidate.
- F-measure: The harmonic mean of precision and recall.

Stemming is only applied for Latin script text (English, Portuguese, etc.)
since the Porter stemmer only works correctly for English. For non-Latin
scripts (Thai, Chinese, Arabic, etc.), stemming is disabled to ensure
accurate matching.

Args:
candidate: The generated text to be evaluated.
reference: The ground-truth text to compare against.

Returns:
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
"""
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
# Check if both texts are Latin script
is_latin = _is_latin_script(candidate) and _is_latin_script(reference)

# For Latin scripts (English, Portuguese, etc.): use default tokenizer with
# stemmer. For non-Latin scripts (Thai, Chinese, Arabic, etc.): use custom
# Unicode tokenizer without stemmer, since:
# 1. Porter stemmer only works for English
# 2. Default tokenizer doesn't handle Unicode characters properly
if is_latin:
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
else:
scorer = rouge_scorer.RougeScorer(
["rouge1"], use_stemmer=False, tokenizer=_UnicodeTokenizer()
)

# The score method returns a dictionary where keys are the ROUGE types
# and values are Score objects (tuples) with precision, recall, and fmeasure.
Expand Down
274 changes: 274 additions & 0 deletions tests/unittests/evaluation/test_final_response_match_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
from google.adk.evaluation.final_response_match_v1 import _is_latin_script
from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
from google.genai import types as genai_types
import pytest
Expand Down Expand Up @@ -147,3 +148,276 @@ def test_get_metric_info():
assert metric_info.metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
assert metric_info.metric_value_info.interval.min_value == 0.0
assert metric_info.metric_value_info.interval.max_value == 1.0


# Tests for _is_latin_script function
class TestIsLatinScript:
"""Tests for the _is_latin_script helper function."""

def test_empty_string(self):
"""Empty string should default to Latin."""
assert _is_latin_script("") is True

def test_english_text(self):
"""English text should be detected as Latin."""
assert _is_latin_script("Hello world") is True
assert _is_latin_script("The quick brown fox") is True

def test_portuguese_text(self):
"""Portuguese with accents should be detected as Latin."""
assert _is_latin_script("Olá, como você está?") is True
assert _is_latin_script("São Paulo é uma cidade") is True

def test_french_text(self):
"""French with accents should be detected as Latin."""
assert _is_latin_script("Bonjour, comment allez-vous?") is True
assert _is_latin_script("français café résumé") is True

def test_german_text(self):
"""German with umlauts should be detected as Latin."""
assert _is_latin_script("Guten Tag, wie geht es Ihnen?") is True
assert _is_latin_script("Größe Übung Äpfel") is True

def test_thai_text(self):
"""Thai text should be detected as non-Latin."""
assert _is_latin_script("สวัสดี") is False
assert _is_latin_script("สวัสดีครับ") is False

def test_chinese_text(self):
"""Chinese text should be detected as non-Latin."""
assert _is_latin_script("你好") is False
assert _is_latin_script("中文测试") is False

def test_arabic_text(self):
"""Arabic text should be detected as non-Latin."""
assert _is_latin_script("مرحبا") is False
assert _is_latin_script("اللغة العربية") is False

def test_japanese_text(self):
"""Japanese text should be detected as non-Latin."""
assert _is_latin_script("こんにちは") is False
assert _is_latin_script("日本語テスト") is False

def test_korean_text(self):
"""Korean text should be detected as non-Latin."""
assert _is_latin_script("안녕하세요") is False
assert _is_latin_script("한국어 테스트") is False

def test_numbers_only(self):
"""Numbers only should default to Latin."""
assert _is_latin_script("12345") is True

def test_punctuation_only(self):
"""Punctuation only should default to Latin."""
assert _is_latin_script("!@#$%") is True

def test_mixed_latin_dominant(self):
"""Mixed text with Latin dominant should be Latin."""
assert _is_latin_script("Hello 你好 world test") is True

def test_mixed_non_latin_dominant(self):
"""Mixed text with non-Latin dominant should be non-Latin."""
assert _is_latin_script("你好世界 Hi") is False


# Tests for non-English language ROUGE scoring
class TestNonEnglishRougeScoring:
"""Tests for ROUGE scoring with non-English languages (Issue #3111).

These tests verify that the fix for non-English languages works correctly.
The key issue was that Porter stemmer only works for English, causing
match failures for other languages.
"""

# === Thai Language Tests (Original Issue #3111) ===

def test_thai_greeting_identical(self):
"""Thai: Identical greeting should have perfect score."""
# This is the exact case from Issue #3111
candidate = "สวัสดี"
reference = "สวัสดี"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 1.0

def test_thai_sentence_with_overlap(self):
"""Thai: Sentences with common words should show partial match."""
# "Hello, how are you today?" vs "Hello, how is the weather?"
candidate = "สวัสดี คุณ สบายดี ไหม วันนี้"
reference = "สวัสดี คุณ อากาศ เป็น อย่างไร"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "สวัสดี" and "คุณ" (2 out of 5 words each)
assert rouge_1_score.fmeasure > 0
assert rouge_1_score.fmeasure < 1.0
Comment on lines +249 to +250
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertions > 0 and < 1.0 are correct but not very precise. We can calculate the exact expected F-measure to make this test stronger. Given the candidate and reference texts, the F-measure should be exactly 0.4.

Calculation:

  • Candidate tokens: 5
  • Reference tokens: 5
  • Common tokens: 2
  • Precision = 2/5 = 0.4
  • Recall = 2/5 = 0.4
  • F-measure = 2 * (0.4 * 0.4) / (0.4 + 0.4) = 0.4
Suggested change
assert rouge_1_score.fmeasure > 0
assert rouge_1_score.fmeasure < 1.0
assert rouge_1_score.fmeasure == pytest.approx(0.4)


def test_thai_polite_particle_variation(self):
"""Thai: Same meaning with polite particle should show high match."""
# "Hello" vs "Hello (polite)"
candidate = "สวัสดี ครับ"
reference = "สวัสดี ค่ะ"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "สวัสดี" (1 out of 2 words)
assert rouge_1_score.fmeasure == pytest.approx(0.5, rel=0.1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion uses a relative tolerance rel=0.1, which is unnecessarily loose as the expected F-measure is exactly 0.5. Using pytest.approx(0.5) without a tolerance is more precise and makes the test stricter.

Suggested change
assert rouge_1_score.fmeasure == pytest.approx(0.5, rel=0.1)
assert rouge_1_score.fmeasure == pytest.approx(0.5)


# === Chinese Language Tests ===

def test_chinese_greeting_identical(self):
"""Chinese: Identical greeting should have perfect score."""
candidate = "你好世界"
reference = "你好世界"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 1.0

def test_chinese_sentence_with_overlap(self):
"""Chinese: Sentences with common words should show partial match."""
# Space-separated for tokenization
candidate = "今天 天气 很好" # "Today's weather is good"
reference = "今天 我 很 开心" # "Today I am happy"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "今天" and "很"
assert rouge_1_score.fmeasure > 0
assert rouge_1_score.fmeasure < 1.0
Comment on lines +273 to +278
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The comment on line 276, Should match "今天" and "很", is inconsistent with the candidate string "今天 天气 很好". The default tokenizer will treat "很好" as a single token, so only "今天" will be matched.

To align with the comment's intent and create a stronger test, I suggest splitting "很好" into "很 好" in the candidate string. This will result in an F-measure of 0.5.

Suggested change
candidate = "今天 天气 很好" # "Today's weather is good"
reference = "今天 我 很 开心" # "Today I am happy"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "今天" and "很"
assert rouge_1_score.fmeasure > 0
assert rouge_1_score.fmeasure < 1.0
candidate = "今天 天气 很 好" # "Today's weather is very good"
reference = "今天 我 很 开心" # "Today I am happy"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "今天" and "很"
assert rouge_1_score.fmeasure == pytest.approx(0.5)


def test_chinese_different_sentences(self):
"""Chinese: Completely different sentences should have zero score."""
candidate = "苹果 橙子 香蕉" # "Apple orange banana"
reference = "汽车 飞机 火车" # "Car airplane train"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 0

# === Arabic Language Tests ===

def test_arabic_greeting_identical(self):
"""Arabic: Identical greeting should have perfect score."""
candidate = "مرحبا بالعالم"
reference = "مرحبا بالعالم"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 1.0

def test_arabic_sentence_with_overlap(self):
"""Arabic: Sentences with common words should show partial match."""
candidate = "أنا أحب القراءة والكتابة" # "I love reading and writing"
reference = "أنا أحب السفر والموسيقى" # "I love travel and music"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "أنا" and "أحب"
assert rouge_1_score.fmeasure > 0
assert rouge_1_score.fmeasure < 1.0

# === Japanese Language Tests ===

def test_japanese_greeting_identical(self):
"""Japanese: Identical greeting should have perfect score."""
candidate = "こんにちは"
reference = "こんにちは"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 1.0

def test_japanese_sentence_with_overlap(self):
"""Japanese: Sentences with common words should show partial match."""
candidate = "今日 は 天気 が いい です" # "Today the weather is good"
reference = "今日 は 仕事 が 忙しい です" # "Today work is busy"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "今日", "は", "が", "です"
assert rouge_1_score.fmeasure > 0.5
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion > 0.5 is correct but could be more precise. The expected F-measure for this case is exactly 2/3. Using pytest.approx(2 / 3) will make the test more robust against future changes.

Suggested change
assert rouge_1_score.fmeasure > 0.5
assert rouge_1_score.fmeasure == pytest.approx(2 / 3)


# === Korean Language Tests ===

def test_korean_greeting_identical(self):
"""Korean: Identical greeting should have perfect score."""
candidate = "안녕하세요"
reference = "안녕하세요"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 1.0

def test_korean_sentence_with_overlap(self):
"""Korean: Sentences with common words should show partial match."""
candidate = "오늘 날씨가 좋습니다" # "Today's weather is good"
reference = "오늘 기분이 좋습니다" # "Today my mood is good"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "오늘" and "좋습니다"
assert rouge_1_score.fmeasure > 0
assert rouge_1_score.fmeasure < 1.0
Comment on lines +337 to +338
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertions > 0 and < 1.0 are correct but are not very specific. The expected F-measure can be calculated precisely as 2/3 for this test case. Using a more precise assertion makes the test stronger.

Calculation:

  • Candidate tokens: 3 (오늘, 날씨가, 좋습니다)
  • Reference tokens: 3 (오늘, 기분이, 좋습니다)
  • Common tokens: 2 (오늘, 좋습니다)
  • Precision = 2/3, Recall = 2/3
  • F-measure = 2/3
Suggested change
assert rouge_1_score.fmeasure > 0
assert rouge_1_score.fmeasure < 1.0
assert rouge_1_score.fmeasure == pytest.approx(2 / 3)


# === European Languages (Latin script with accents) ===

def test_portuguese_sentence_identical(self):
"""Portuguese: Identical sentence with accents should match perfectly."""
candidate = "Olá, como você está hoje?"
reference = "Olá, como você está hoje?"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 1.0

def test_portuguese_sentence_with_overlap(self):
"""Portuguese: Sentences with common words should show partial match."""
candidate = "Eu gosto de programação e música"
reference = "Eu gosto de viajar e cozinhar"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "Eu", "gosto", "de", "e"
assert rouge_1_score.fmeasure > 0.5

def test_french_sentence_with_accents(self):
"""French: Accented characters should match correctly."""
candidate = "Où est la bibliothèque s'il vous plaît?"
reference = "Où est la gare s'il vous plaît?"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match most words except "bibliothèque" vs "gare"
assert rouge_1_score.fmeasure > 0.7

def test_german_sentence_with_umlauts(self):
"""German: Umlauts should be handled correctly."""
candidate = "Ich möchte ein Brötchen und Käse"
reference = "Ich möchte ein Brötchen und Wurst"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match everything except "Käse" vs "Wurst"
assert rouge_1_score.fmeasure > 0.8

def test_spanish_sentence_with_accents(self):
"""Spanish: Accented characters should match correctly."""
candidate = "¿Cómo estás? Estoy muy bien gracias"
reference = "¿Cómo estás? Estoy cansado hoy"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "¿Cómo", "estás?", "Estoy"
assert rouge_1_score.fmeasure > 0.4

# === English Stemming Verification ===

def test_english_stemming_running_vs_run(self):
"""English: Stemming should normalize 'running' to 'run'."""
candidate = "I am running fast"
reference = "I am run fast"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# With stemming: "running" -> "run", perfect match
assert rouge_1_score.fmeasure == 1.0

def test_english_stemming_multiple_forms(self):
"""English: Multiple word forms should match via stemming."""
candidate = "The dogs are running and jumping happily"
reference = "The dog is run and jump happy"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Stemming normalizes: dogs->dog, running->run, jumping->jump, happily->happi
# Should have high overlap
assert rouge_1_score.fmeasure > 0.7

def test_english_preserves_exact_matching(self):
"""English: Exact matches should still work perfectly."""
candidate = "The quick brown fox jumps over the lazy dog"
reference = "The quick brown fox jumps over the lazy dog"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
assert rouge_1_score.fmeasure == 1.0

# === Mixed Script Edge Cases ===

def test_mixed_english_chinese(self):
"""Mixed: English and Chinese in same text."""
candidate = "Hello 世界 welcome to Python"
reference = "Hello 世界 welcome to Java"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "Hello", "世界", "welcome", "to" (4 out of 5)
assert rouge_1_score.fmeasure > 0.7

def test_mixed_with_numbers(self):
"""Mixed: Text with numbers should work correctly."""
candidate = "订单号 12345 已确认"
reference = "订单号 12345 已发货"
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
# Should match "订单号" and "12345"
assert rouge_1_score.fmeasure > 0.5
Loading