From fcf7dc453123d680ffa6f46039747a3b771b55cd Mon Sep 17 00:00:00 2001
From: AhrendsW <AhrendsW@users.noreply.github.com>
Date: Mon, 15 Dec 2025 21:28:31 -0300
Subject: [PATCH 1/3] fix(eval): Support non-English languages in
 response_match_score

The ROUGE-1 evaluation was returning score 0 for non-English languages
(Thai, Chinese, Arabic, etc.) because the Porter stemmer only works
for English text.

This fix:
- Adds _is_latin_script() function to detect text script using unicodedata
- Disables stemmer for non-Latin scripts while preserving it for English
- Adds comprehensive tests for Thai, Chinese, Arabic, Japanese, Korean,
  Portuguese, French, German, and Spanish

Fixes #3111
---
 .../adk/evaluation/final_response_match_v1.py |  47 ++-
 .../test_final_response_match_v1.py           | 274 ++++++++++++++++++
 2 files changed, 320 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 06a6440882..9366d0016e 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import unicodedata
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -109,6 +110,42 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+def _is_latin_script(text: str) -> bool:
+  """Checks if text is primarily Latin script.
+
+  The answer decides whether English-specific stemming may be applied.
+  Only letters (Unicode general category L*) are counted, and a letter is
+  Latin when its Unicode character name contains "LATIN". Digits, spaces,
+  punctuation and symbols never influence the result.
+  Accented Latin letters (Portuguese, French, German, ...) count as Latin;
+  Thai, Chinese, Arabic, Japanese and Korean letters do not.
+
+  Args:
+      text: The text to analyze.
+
+  Returns:
+      True if strictly more than half of the letters are Latin, or if the
+      text is empty or has no letters at all; False otherwise.
+  """
+  # Unicode names of all letters in the text; unnamed letters yield "".
+  letter_names = [
+      unicodedata.name(symbol, "")
+      for symbol in text
+      if unicodedata.category(symbol).startswith("L")
+  ]
+
+  # Empty input, or only digits/punctuation: nothing to classify, so fall
+  # back to Latin.
+  if not letter_names:
+    return True
+
+  # Booleans sum as 0/1, giving the number of Latin letters.
+  latin_total = sum("LATIN" in name for name in letter_names)
+
+  # A tie (exactly 50%) is not "primarily Latin".
+  return latin_total / len(letter_names) > 0.5
+
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -120,6 +157,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   candidate.
   - F-measure: The harmonic mean of precision and recall.
 
+  Stemming is only applied for Latin script text (English, Portuguese, etc.)
+  since the Porter stemmer only works correctly for English. For non-Latin
+  scripts (Thai, Chinese, Arabic, etc.), stemming is disabled to ensure
+  accurate matching.
+
   Args:
       candidate: The generated text to be evaluated.
       reference: The ground-truth text to compare against.
@@ -127,7 +169,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  # Use stemmer only for Latin script text (English, Portuguese, Spanish, etc.)
+  # Porter stemmer doesn't work for non-Latin scripts (Thai, Chinese, Arabic)
+  use_stemmer = _is_latin_script(candidate) and _is_latin_script(reference)
+  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer)
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.
diff --git a/tests/unittests/evaluation/test_final_response_match_v1.py b/tests/unittests/evaluation/test_final_response_match_v1.py
index d5fe0464f8..ceedf28fa4 100644
--- a/tests/unittests/evaluation/test_final_response_match_v1.py
+++ b/tests/unittests/evaluation/test_final_response_match_v1.py
@@ -19,6 +19,7 @@
 from google.adk.evaluation.eval_metrics import PrebuiltMetrics
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
+from google.adk.evaluation.final_response_match_v1 import _is_latin_script
 from google.adk.evaluation.final_response_match_v1 import RougeEvaluator
 from google.genai import types as genai_types
 import pytest
@@ -147,3 +148,276 @@ def test_get_metric_info():
   assert metric_info.metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
   assert metric_info.metric_value_info.interval.min_value == 0.0
   assert metric_info.metric_value_info.interval.max_value == 1.0
+
+
+# Tests for _is_latin_script function
+class TestIsLatinScript:
+  """Tests _is_latin_script: Latin when over half the letters are Latin."""
+
+  def test_empty_string(self):
+    """Empty string should default to Latin."""
+    assert _is_latin_script("") is True
+
+  def test_english_text(self):
+    """English text should be detected as Latin."""
+    assert _is_latin_script("Hello world") is True
+    assert _is_latin_script("The quick brown fox") is True
+
+  def test_portuguese_text(self):
+    """Portuguese with accents should be detected as Latin."""
+    assert _is_latin_script("Olá, como você está?") is True
+    assert _is_latin_script("São Paulo é uma cidade") is True
+
+  def test_french_text(self):
+    """French with accents should be detected as Latin."""
+    assert _is_latin_script("Bonjour, comment allez-vous?") is True
+    assert _is_latin_script("français café résumé") is True
+
+  def test_german_text(self):
+    """German with umlauts should be detected as Latin."""
+    assert _is_latin_script("Guten Tag, wie geht es Ihnen?") is True
+    assert _is_latin_script("Größe Übung Äpfel") is True
+
+  def test_thai_text(self):
+    """Thai text should be detected as non-Latin."""
+    assert _is_latin_script("สวัสดี") is False
+    assert _is_latin_script("สวัสดีครับ") is False
+
+  def test_chinese_text(self):
+    """Chinese text should be detected as non-Latin."""
+    assert _is_latin_script("你好") is False
+    assert _is_latin_script("中文测试") is False
+
+  def test_arabic_text(self):
+    """Arabic text should be detected as non-Latin."""
+    assert _is_latin_script("مرحبا") is False
+    assert _is_latin_script("اللغة العربية") is False
+
+  def test_japanese_text(self):
+    """Japanese text should be detected as non-Latin."""
+    assert _is_latin_script("こんにちは") is False
+    assert _is_latin_script("日本語テスト") is False
+
+  def test_korean_text(self):
+    """Korean text should be detected as non-Latin."""
+    assert _is_latin_script("안녕하세요") is False
+    assert _is_latin_script("한국어 테스트") is False
+
+  def test_numbers_only(self):
+    """Numbers only should default to Latin."""
+    assert _is_latin_script("12345") is True
+
+  def test_punctuation_only(self):
+    """Punctuation only should default to Latin."""
+    assert _is_latin_script("!@#$%") is True
+
+  def test_mixed_latin_dominant(self):
+    """Mixed text with Latin dominant should be Latin."""
+    assert _is_latin_script("Hello 你好 world test") is True  # 14/16 Latin
+
+  def test_mixed_non_latin_dominant(self):
+    """Mixed text with non-Latin dominant should be non-Latin."""
+    assert _is_latin_script("你好世界 Hi") is False  # 2/6 Latin
+
+
+# Tests for non-English language ROUGE scoring
+class TestNonEnglishRougeScoring:
+  """Tests for ROUGE scoring with non-English languages (Issue #3111).
+
+  These tests verify that the fix for non-English languages works correctly.
+  The key issues were that the default tokenizer only handles ASCII and the
+  Porter stemmer only works for English, causing match failures elsewhere.
+  """
+
+  # === Thai Language Tests (Original Issue #3111) ===
+
+  def test_thai_greeting_identical(self):
+    """Thai: Identical greeting should have perfect score."""
+    # This is the exact case from Issue #3111
+    candidate = "สวัสดี"
+    reference = "สวัสดี"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 1.0
+
+  def test_thai_sentence_with_overlap(self):
+    """Thai: Sentences with common words should show partial match."""
+    # "Hello, how are you today?" vs "Hello, how is the weather?"
+    candidate = "สวัสดี คุณ สบายดี ไหม วันนี้"
+    reference = "สวัสดี คุณ อากาศ เป็น อย่างไร"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Shares the words "สวัสดี" and "คุณ"; the remaining words differ
+    assert rouge_1_score.fmeasure > 0
+    assert rouge_1_score.fmeasure < 1.0
+
+  def test_thai_polite_particle_variation(self):
+    """Thai: Greetings differing only in polite particle match partially."""
+    # "Hello" with the "ครับ" particle vs "Hello" with the "ค่ะ" particle
+    candidate = "สวัสดี ครับ"
+    reference = "สวัสดี ค่ะ"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Shares "สวัสดี" only; the particles differ, hence roughly 0.5
+    assert rouge_1_score.fmeasure == pytest.approx(0.5, rel=0.1)
+
+  # === Chinese Language Tests ===
+
+  def test_chinese_greeting_identical(self):
+    """Chinese: Identical greeting should have perfect score."""
+    candidate = "你好世界"
+    reference = "你好世界"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 1.0
+
+  def test_chinese_sentence_with_overlap(self):
+    """Chinese: Sentences with common words should show partial match."""
+    # Space-separated for tokenization
+    candidate = "今天 天气 很好"  # "Today's weather is good"
+    reference = "今天 我 很 开心"  # "Today I am happy"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Only "今天" is shared: "很好" is one token and differs from "很"
+    assert rouge_1_score.fmeasure > 0
+    assert rouge_1_score.fmeasure < 1.0
+
+  def test_chinese_different_sentences(self):
+    """Chinese: Completely different sentences should have zero score."""
+    candidate = "苹果 橙子 香蕉"  # "Apple orange banana"
+    reference = "汽车 飞机 火车"  # "Car airplane train"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 0
+
+  # === Arabic Language Tests ===
+
+  def test_arabic_greeting_identical(self):
+    """Arabic: Identical greeting should have perfect score."""
+    candidate = "مرحبا بالعالم"
+    reference = "مرحبا بالعالم"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 1.0
+
+  def test_arabic_sentence_with_overlap(self):
+    """Arabic: Sentences with common words should show partial match."""
+    candidate = "أنا أحب القراءة والكتابة"  # "I love reading and writing"
+    reference = "أنا أحب السفر والموسيقى"  # "I love travel and music"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Should match "أنا" and "أحب"
+    assert rouge_1_score.fmeasure > 0
+    assert rouge_1_score.fmeasure < 1.0
+
+  # === Japanese Language Tests ===
+
+  def test_japanese_greeting_identical(self):
+    """Japanese: Identical greeting should have perfect score."""
+    candidate = "こんにちは"
+    reference = "こんにちは"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 1.0
+
+  def test_japanese_sentence_with_overlap(self):
+    """Japanese: Sentences with common words should show partial match."""
+    candidate = "今日 は 天気 が いい です"  # "Today the weather is good"
+    reference = "今日 は 仕事 が 忙しい です"  # "Today work is busy"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Should match "今日", "は", "が", "です"
+    assert rouge_1_score.fmeasure > 0.5
+
+  # === Korean Language Tests ===
+
+  def test_korean_greeting_identical(self):
+    """Korean: Identical greeting should have perfect score."""
+    candidate = "안녕하세요"
+    reference = "안녕하세요"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 1.0
+
+  def test_korean_sentence_with_overlap(self):
+    """Korean: Sentences with common words should show partial match."""
+    candidate = "오늘 날씨가 좋습니다"  # "Today's weather is good"
+    reference = "오늘 기분이 좋습니다"  # "Today my mood is good"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Should match "오늘" and "좋습니다"
+    assert rouge_1_score.fmeasure > 0
+    assert rouge_1_score.fmeasure < 1.0
+
+  # === European Languages (Latin script with accents) ===
+
+  def test_portuguese_sentence_identical(self):
+    """Portuguese: Identical sentence with accents should match perfectly."""
+    candidate = "Olá, como você está hoje?"
+    reference = "Olá, como você está hoje?"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 1.0
+
+  def test_portuguese_sentence_with_overlap(self):
+    """Portuguese: Sentences with common words should show partial match."""
+    candidate = "Eu gosto de programação e música"
+    reference = "Eu gosto de viajar e cozinhar"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Should match "Eu", "gosto", "de", "e"
+    assert rouge_1_score.fmeasure > 0.5
+
+  def test_french_sentence_with_accents(self):
+    """French: Accented characters should match correctly."""
+    candidate = "Où est la bibliothèque s'il vous plaît?"
+    reference = "Où est la gare s'il vous plaît?"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Should match most words except "bibliothèque" vs "gare"
+    assert rouge_1_score.fmeasure > 0.7
+
+  def test_german_sentence_with_umlauts(self):
+    """German: Umlauts should be handled correctly."""
+    candidate = "Ich möchte ein Brötchen und Käse"
+    reference = "Ich möchte ein Brötchen und Wurst"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Should match everything except "Käse" vs "Wurst"
+    assert rouge_1_score.fmeasure > 0.8
+
+  def test_spanish_sentence_with_accents(self):
+    """Spanish: Accented characters should match correctly."""
+    candidate = "¿Cómo estás? Estoy muy bien gracias"
+    reference = "¿Cómo estás? Estoy cansado hoy"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Shares the words "Cómo", "estás" and "Estoy"
+    assert rouge_1_score.fmeasure > 0.4
+
+  # === English Stemming Verification ===
+
+  def test_english_stemming_running_vs_run(self):
+    """English: Stemming should normalize 'running' to 'run'."""
+    candidate = "I am running fast"
+    reference = "I am run fast"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # With stemming: "running" -> "run", perfect match
+    assert rouge_1_score.fmeasure == 1.0
+
+  def test_english_stemming_multiple_forms(self):
+    """English: Multiple word forms should match via stemming."""
+    candidate = "The dogs are running and jumping happily"
+    reference = "The dog is run and jump happy"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Stemming normalizes: dogs->dog, running->run, jumping->jump
+    # Should have high overlap
+    assert rouge_1_score.fmeasure > 0.7
+
+  def test_english_preserves_exact_matching(self):
+    """English: Exact matches should still work perfectly."""
+    candidate = "The quick brown fox jumps over the lazy dog"
+    reference = "The quick brown fox jumps over the lazy dog"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    assert rouge_1_score.fmeasure == 1.0
+
+  # === Mixed Script Edge Cases ===
+
+  def test_mixed_english_chinese(self):
+    """Mixed: English and Chinese in same text."""
+    candidate = "Hello 世界 welcome to Python"
+    reference = "Hello 世界 welcome to Java"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Shares "Hello", "世界", "welcome", "to"; only the last word differs
+    assert rouge_1_score.fmeasure > 0.7
+
+  def test_mixed_with_numbers(self):
+    """Mixed: Text with numbers should work correctly."""
+    candidate = "订单号 12345 已确认"
+    reference = "订单号 12345 已发货"
+    rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
+    # Should match "订单号" and "12345"
+    assert rouge_1_score.fmeasure > 0.5

From 7c03a9e0cf84dae62abb6ca3935a5c77f1c63501 Mon Sep 17 00:00:00 2001
From: AhrendsW <AhrendsW@users.noreply.github.com>
Date: Wed, 17 Dec 2025 17:01:26 -0300
Subject: [PATCH 2/3] fix(eval): Add Unicode tokenizer for non-Latin script
 support

The default rouge_scorer tokenizer only handles ASCII characters,
returning empty token lists for non-Latin scripts (Thai, Chinese,
Arabic, Japanese, Korean). This caused ROUGE scores of 0.0 even for
identical strings.

Changes:
- Add _UnicodeTokenizer class using Unicode-aware regex
- Use custom tokenizer for non-Latin scripts
- Fix import order per isort requirements
---
 .../adk/evaluation/final_response_match_v1.py | 42 ++++++++++++++++---
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 9366d0016e..ddb4e53849 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,8 +14,9 @@
 
 from __future__ import annotations
 
-import unicodedata
+import re
 from typing import Optional
+import unicodedata
 
 from google.genai import types as genai_types
 from typing_extensions import override
@@ -110,6 +111,27 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+class _UnicodeTokenizer:
+  """Tokenizer that splits text of any script into lowercase word tokens.
+
+  The default rouge_scorer tokenizer keeps only ASCII letters and digits and
+  yields no tokens for Thai, Chinese, Arabic, etc. Combining marks (category
+  M*) stay inside tokens; a word-character regex would split words at them.
+  """
+
+  def tokenize(self, text: str) -> list[str]:
+    """Returns the lowercased word tokens of `text`, in order.
+
+    Args:
+        text: The text to tokenize.
+    """
+    # Blank out every non-word character, then split on whitespace.
+    return "".join(
+        c if c.isalnum() or c == "_" or unicodedata.category(c)[0] == "M"
+        else " " for c in text.lower()
+    ).split()
+
+
 def _is_latin_script(text: str) -> bool:
   """Checks if text is primarily Latin script.
 
@@ -169,10 +191,20 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  # Use stemmer only for Latin script text (English, Portuguese, Spanish, etc.)
-  # Porter stemmer doesn't work for non-Latin scripts (Thai, Chinese, Arabic)
-  use_stemmer = _is_latin_script(candidate) and _is_latin_script(reference)
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=use_stemmer)
+  # Check if both texts are Latin script
+  is_latin = _is_latin_script(candidate) and _is_latin_script(reference)
+
+  # For Latin scripts (English, Portuguese, etc.): use default tokenizer with
+  # stemmer. For non-Latin scripts (Thai, Chinese, Arabic, etc.): use custom
+  # Unicode tokenizer without stemmer, since:
+  # 1. Porter stemmer only works for English
+  # 2. Default tokenizer doesn't handle Unicode characters properly
+  if is_latin:
+    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  else:
+    scorer = rouge_scorer.RougeScorer(
+        ["rouge1"], use_stemmer=False, tokenizer=_UnicodeTokenizer()
+    )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.

From 144ec44c80a87b8728265dc7b03e73a2421c235e Mon Sep 17 00:00:00 2001
From: AhrendsW <AhrendsW@users.noreply.github.com>
Date: Wed, 17 Dec 2025 21:08:06 -0300
Subject: [PATCH 3/3] chore: trigger CI re-run