From fcf7dc453123d680ffa6f46039747a3b771b55cd Mon Sep 17 00:00:00 2001 From: AhrendsW Date: Mon, 15 Dec 2025 21:28:31 -0300 Subject: [PATCH 1/3] fix(eval): Support non-English languages in response_match_score The ROUGE-1 evaluation was returning score 0 for non-English languages (Thai, Chinese, Arabic, etc.) because the Porter stemmer only works for English text. This fix: - Adds _is_latin_script() function to detect text script using unicodedata - Disables stemmer for non-Latin scripts while preserving it for English - Adds comprehensive tests for Thai, Chinese, Arabic, Japanese, Korean, Portuguese, French, German, and Spanish Fixes #3111 --- .../adk/evaluation/final_response_match_v1.py | 47 ++- .../test_final_response_match_v1.py | 274 ++++++++++++++++++ 2 files changed, 320 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 06a6440882..9366d0016e 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -14,6 +14,7 @@ from __future__ import annotations +import unicodedata from typing import Optional from google.genai import types as genai_types @@ -109,6 +110,42 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED +def _is_latin_script(text: str) -> bool: + """Checks if text is primarily Latin script. + + This is used to determine whether to apply English-specific stemming. + Latin script includes English, Portuguese, Spanish, French, German, etc. + Non-Latin scripts include Thai, Chinese, Arabic, Japanese, Korean, etc. + + Args: + text: The text to analyze. + + Returns: + True if the text is primarily Latin script, False otherwise. 
+ """ + if not text: + return True + + latin_chars = 0 + letter_chars = 0 + + for char in text: + # Check if character is a letter (category starts with 'L') + if unicodedata.category(char).startswith("L"): + letter_chars += 1 + # Check if it's a Latin character by looking at its Unicode name + char_name = unicodedata.name(char, "") + if "LATIN" in char_name: + latin_chars += 1 + + # If no letters found, default to Latin (likely punctuation/numbers only) + if letter_chars == 0: + return True + + # Consider text as Latin if more than 50% of letters are Latin + return latin_chars / letter_chars > 0.5 + + def _calculate_rouge_1_scores(candidate: str, reference: str): """Calculates the ROUGE-1 score between a candidate and reference text. @@ -120,6 +157,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): candidate. - F-measure: The harmonic mean of precision and recall. + Stemming is only applied for Latin script text (English, Portuguese, etc.) + since the Porter stemmer only works correctly for English. For non-Latin + scripts (Thai, Chinese, Arabic, etc.), stemming is disabled to ensure + accurate matching. + Args: candidate: The generated text to be evaluated. reference: The ground-truth text to compare against. @@ -127,7 +169,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): Returns: A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + # Use stemmer only for Latin script text (English, Portuguese, Spanish, etc.) + # Porter stemmer doesn't work for non-Latin scripts (Thai, Chinese, Arabic) + use_stemmer = _is_latin_script(candidate) and _is_latin_script(reference) + scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure. 
diff --git a/tests/unittests/evaluation/test_final_response_match_v1.py b/tests/unittests/evaluation/test_final_response_match_v1.py index d5fe0464f8..ceedf28fa4 100644 --- a/tests/unittests/evaluation/test_final_response_match_v1.py +++ b/tests/unittests/evaluation/test_final_response_match_v1.py @@ -19,6 +19,7 @@ from google.adk.evaluation.eval_metrics import PrebuiltMetrics from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores +from google.adk.evaluation.final_response_match_v1 import _is_latin_script from google.adk.evaluation.final_response_match_v1 import RougeEvaluator from google.genai import types as genai_types import pytest @@ -147,3 +148,276 @@ def test_get_metric_info(): assert metric_info.metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value assert metric_info.metric_value_info.interval.min_value == 0.0 assert metric_info.metric_value_info.interval.max_value == 1.0 + + +# Tests for _is_latin_script function +class TestIsLatinScript: + """Tests for the _is_latin_script helper function.""" + + def test_empty_string(self): + """Empty string should default to Latin.""" + assert _is_latin_script("") is True + + def test_english_text(self): + """English text should be detected as Latin.""" + assert _is_latin_script("Hello world") is True + assert _is_latin_script("The quick brown fox") is True + + def test_portuguese_text(self): + """Portuguese with accents should be detected as Latin.""" + assert _is_latin_script("Olá, como você está?") is True + assert _is_latin_script("São Paulo é uma cidade") is True + + def test_french_text(self): + """French with accents should be detected as Latin.""" + assert _is_latin_script("Bonjour, comment allez-vous?") is True + assert _is_latin_script("français café résumé") is True + + def test_german_text(self): + """German with umlauts should be detected as Latin.""" + assert _is_latin_script("Guten Tag, wie geht es Ihnen?") is True + 
assert _is_latin_script("Größe Übung Äpfel") is True + + def test_thai_text(self): + """Thai text should be detected as non-Latin.""" + assert _is_latin_script("สวัสดี") is False + assert _is_latin_script("สวัสดีครับ") is False + + def test_chinese_text(self): + """Chinese text should be detected as non-Latin.""" + assert _is_latin_script("你好") is False + assert _is_latin_script("中文测试") is False + + def test_arabic_text(self): + """Arabic text should be detected as non-Latin.""" + assert _is_latin_script("مرحبا") is False + assert _is_latin_script("اللغة العربية") is False + + def test_japanese_text(self): + """Japanese text should be detected as non-Latin.""" + assert _is_latin_script("こんにちは") is False + assert _is_latin_script("日本語テスト") is False + + def test_korean_text(self): + """Korean text should be detected as non-Latin.""" + assert _is_latin_script("안녕하세요") is False + assert _is_latin_script("한국어 테스트") is False + + def test_numbers_only(self): + """Numbers only should default to Latin.""" + assert _is_latin_script("12345") is True + + def test_punctuation_only(self): + """Punctuation only should default to Latin.""" + assert _is_latin_script("!@#$%") is True + + def test_mixed_latin_dominant(self): + """Mixed text with Latin dominant should be Latin.""" + assert _is_latin_script("Hello 你好 world test") is True + + def test_mixed_non_latin_dominant(self): + """Mixed text with non-Latin dominant should be non-Latin.""" + assert _is_latin_script("你好世界 Hi") is False + + +# Tests for non-English language ROUGE scoring +class TestNonEnglishRougeScoring: + """Tests for ROUGE scoring with non-English languages (Issue #3111). + + These tests verify that the fix for non-English languages works correctly. + The key issue was that Porter stemmer only works for English, causing + match failures for other languages. 
+ """ + + # === Thai Language Tests (Original Issue #3111) === + + def test_thai_greeting_identical(self): + """Thai: Identical greeting should have perfect score.""" + # This is the exact case from Issue #3111 + candidate = "สวัสดี" + reference = "สวัสดี" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.fmeasure == 1.0 + + def test_thai_sentence_with_overlap(self): + """Thai: Sentences with common words should show partial match.""" + # "Hello, how are you today?" vs "Hello, how is the weather?" + candidate = "สวัสดี คุณ สบายดี ไหม วันนี้" + reference = "สวัสดี คุณ อากาศ เป็น อย่างไร" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "สวัสดี" and "คุณ" (2 out of 5 words each) + assert rouge_1_score.fmeasure > 0 + assert rouge_1_score.fmeasure < 1.0 + + def test_thai_polite_particle_variation(self): + """Thai: Same meaning with polite particle should show high match.""" + # "Hello" vs "Hello (polite)" + candidate = "สวัสดี ครับ" + reference = "สวัสดี ค่ะ" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "สวัสดี" (1 out of 2 words) + assert rouge_1_score.fmeasure == pytest.approx(0.5, rel=0.1) + + # === Chinese Language Tests === + + def test_chinese_greeting_identical(self): + """Chinese: Identical greeting should have perfect score.""" + candidate = "你好世界" + reference = "你好世界" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.fmeasure == 1.0 + + def test_chinese_sentence_with_overlap(self): + """Chinese: Sentences with common words should show partial match.""" + # Space-separated for tokenization + candidate = "今天 天气 很好" # "Today's weather is good" + reference = "今天 我 很 开心" # "Today I am happy" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "今天" and "很" + assert rouge_1_score.fmeasure > 0 + assert rouge_1_score.fmeasure < 1.0 + + def test_chinese_different_sentences(self): + 
"""Chinese: Completely different sentences should have zero score.""" + candidate = "苹果 橙子 香蕉" # "Apple orange banana" + reference = "汽车 飞机 火车" # "Car airplane train" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.fmeasure == 0 + + # === Arabic Language Tests === + + def test_arabic_greeting_identical(self): + """Arabic: Identical greeting should have perfect score.""" + candidate = "مرحبا بالعالم" + reference = "مرحبا بالعالم" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.fmeasure == 1.0 + + def test_arabic_sentence_with_overlap(self): + """Arabic: Sentences with common words should show partial match.""" + candidate = "أنا أحب القراءة والكتابة" # "I love reading and writing" + reference = "أنا أحب السفر والموسيقى" # "I love travel and music" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "أنا" and "أحب" + assert rouge_1_score.fmeasure > 0 + assert rouge_1_score.fmeasure < 1.0 + + # === Japanese Language Tests === + + def test_japanese_greeting_identical(self): + """Japanese: Identical greeting should have perfect score.""" + candidate = "こんにちは" + reference = "こんにちは" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.fmeasure == 1.0 + + def test_japanese_sentence_with_overlap(self): + """Japanese: Sentences with common words should show partial match.""" + candidate = "今日 は 天気 が いい です" # "Today the weather is good" + reference = "今日 は 仕事 が 忙しい です" # "Today work is busy" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "今日", "は", "が", "です" + assert rouge_1_score.fmeasure > 0.5 + + # === Korean Language Tests === + + def test_korean_greeting_identical(self): + """Korean: Identical greeting should have perfect score.""" + candidate = "안녕하세요" + reference = "안녕하세요" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.fmeasure == 1.0 + + def 
test_korean_sentence_with_overlap(self): + """Korean: Sentences with common words should show partial match.""" + candidate = "오늘 날씨가 좋습니다" # "Today's weather is good" + reference = "오늘 기분이 좋습니다" # "Today my mood is good" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "오늘" and "좋습니다" + assert rouge_1_score.fmeasure > 0 + assert rouge_1_score.fmeasure < 1.0 + + # === European Languages (Latin script with accents) === + + def test_portuguese_sentence_identical(self): + """Portuguese: Identical sentence with accents should match perfectly.""" + candidate = "Olá, como você está hoje?" + reference = "Olá, como você está hoje?" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + assert rouge_1_score.fmeasure == 1.0 + + def test_portuguese_sentence_with_overlap(self): + """Portuguese: Sentences with common words should show partial match.""" + candidate = "Eu gosto de programação e música" + reference = "Eu gosto de viajar e cozinhar" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "Eu", "gosto", "de", "e" + assert rouge_1_score.fmeasure > 0.5 + + def test_french_sentence_with_accents(self): + """French: Accented characters should match correctly.""" + candidate = "Où est la bibliothèque s'il vous plaît?" + reference = "Où est la gare s'il vous plaît?" 
+ rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match most words except "bibliothèque" vs "gare" + assert rouge_1_score.fmeasure > 0.7 + + def test_german_sentence_with_umlauts(self): + """German: Umlauts should be handled correctly.""" + candidate = "Ich möchte ein Brötchen und Käse" + reference = "Ich möchte ein Brötchen und Wurst" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match everything except "Käse" vs "Wurst" + assert rouge_1_score.fmeasure > 0.8 + + def test_spanish_sentence_with_accents(self): + """Spanish: Accented characters should match correctly.""" + candidate = "¿Cómo estás? Estoy muy bien gracias" + reference = "¿Cómo estás? Estoy cansado hoy" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "¿Cómo", "estás?", "Estoy" + assert rouge_1_score.fmeasure > 0.4 + + # === English Stemming Verification === + + def test_english_stemming_running_vs_run(self): + """English: Stemming should normalize 'running' to 'run'.""" + candidate = "I am running fast" + reference = "I am run fast" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # With stemming: "running" -> "run", perfect match + assert rouge_1_score.fmeasure == 1.0 + + def test_english_stemming_multiple_forms(self): + """English: Multiple word forms should match via stemming.""" + candidate = "The dogs are running and jumping happily" + reference = "The dog is run and jump happy" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Stemming normalizes: dogs->dog, running->run, jumping->jump, happily->happi + # Should have high overlap + assert rouge_1_score.fmeasure > 0.7 + + def test_english_preserves_exact_matching(self): + """English: Exact matches should still work perfectly.""" + candidate = "The quick brown fox jumps over the lazy dog" + reference = "The quick brown fox jumps over the lazy dog" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + 
assert rouge_1_score.fmeasure == 1.0 + + # === Mixed Script Edge Cases === + + def test_mixed_english_chinese(self): + """Mixed: English and Chinese in same text.""" + candidate = "Hello 世界 welcome to Python" + reference = "Hello 世界 welcome to Java" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "Hello", "世界", "welcome", "to" (4 out of 5) + assert rouge_1_score.fmeasure > 0.7 + + def test_mixed_with_numbers(self): + """Mixed: Text with numbers should work correctly.""" + candidate = "订单号 12345 已确认" + reference = "订单号 12345 已发货" + rouge_1_score = _calculate_rouge_1_scores(candidate, reference) + # Should match "订单号" and "12345" + assert rouge_1_score.fmeasure > 0.5 From 7c03a9e0cf84dae62abb6ca3935a5c77f1c63501 Mon Sep 17 00:00:00 2001 From: AhrendsW Date: Wed, 17 Dec 2025 17:01:26 -0300 Subject: [PATCH 2/3] fix(eval): Add Unicode tokenizer for non-Latin script support The default rouge_scorer tokenizer only handles ASCII characters, returning empty token lists for non-Latin scripts (Thai, Chinese, Arabic, Japanese, Korean). This caused ROUGE scores of 0.0 even for identical strings. 
Changes:
- Add _UnicodeTokenizer class that tokenizes by Unicode character category
  (runs of letters, combining marks and numbers), lowercasing the text the
  same way the default tokenizer does
- Use custom tokenizer for non-Latin scripts
- Fix import order per isort requirements
---
 .../adk/evaluation/final_response_match_v1.py | 59 +++++++++++++++++--
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 9366d0016e..ddb4e53849 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,8 +14,8 @@
 
 from __future__ import annotations
 
-import unicodedata
 from typing import Optional
+import unicodedata
 
 from google.genai import types as genai_types
 from typing_extensions import override
@@ -110,6 +110,45 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+class _UnicodeTokenizer:
+  """A tokenizer that handles Unicode text for non-Latin scripts.
+
+  The default rouge_scorer tokenizer only works with ASCII characters,
+  returning empty token lists for non-Latin scripts like Thai, Chinese,
+  Arabic, etc. This tokenizer instead groups consecutive Unicode letters,
+  combining marks and numbers into tokens and treats every other character
+  as a separator.
+
+  Combining marks are kept on purpose: a word-character regex does not match
+  them, so it would split Thai or Devanagari words at every vowel sign or
+  tone mark and drop those marks from the tokens.
+  """
+
+  def tokenize(self, text: str) -> list[str]:
+    """Splits text into lowercased word tokens.
+
+    Args:
+      text: The text to tokenize.
+
+    Returns:
+      A list of tokens, each a maximal run of characters whose Unicode
+      general category is a letter (L*), mark (M*) or number (N*).
+    """
+    tokens = []
+    current = []
+    # Lowercase first, as the default tokenizer does, so Latin words embedded
+    # in non-Latin text still match regardless of case.
+    for char in text.lower():
+      if unicodedata.category(char)[0] in "LMN":
+        current.append(char)
+      elif current:
+        tokens.append("".join(current))
+        current = []
+    if current:
+      tokens.append("".join(current))
+    return tokens
+
+
 def _is_latin_script(text: str) -> bool:
   """Checks if text is primarily Latin script.
 
@@ -169,10 +208,20 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
     A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  # Use stemmer only for Latin script text (English, Portuguese, Spanish, etc.)
-  # Porter stemmer doesn't work for non-Latin scripts (Thai, Chinese, Arabic)
-  use_stemmer = _is_latin_script(candidate) and _is_latin_script(reference)
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer)
+  # Check if both texts are Latin script
+  is_latin = _is_latin_script(candidate) and _is_latin_script(reference)
+
+  # For Latin scripts (English, Portuguese, etc.): use default tokenizer with
+  # stemmer. For non-Latin scripts (Thai, Chinese, Arabic, etc.): use custom
+  # Unicode tokenizer without stemmer, since:
+  # 1. Porter stemmer only works for English
+  # 2. Default tokenizer doesn't handle Unicode characters properly
+  if is_latin:
+    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  else:
+    scorer = rouge_scorer.RougeScorer(
+        ["rouge1"], use_stemmer=False, tokenizer=_UnicodeTokenizer()
+    )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.

From 144ec44c80a87b8728265dc7b03e73a2421c235e Mon Sep 17 00:00:00 2001
From: AhrendsW
Date: Wed, 17 Dec 2025 21:08:06 -0300
Subject: [PATCH 3/3] chore: trigger CI re-run