From 0f079de4e76fe2c1783d33c5ec85eb0cc732a7c0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 14:56:44 +0300 Subject: [PATCH 01/92] Delete unused LLM cache module --- README_demo_qa.md | 15 ++ examples/demo_qa/chat_repl.py | 92 +---------- examples/demo_qa/cli.py | 190 +++++++++++++++++----- examples/demo_qa/runner.py | 296 ++++++++++++++++++++++++++++++++++ 4 files changed, 469 insertions(+), 124 deletions(-) create mode 100644 examples/demo_qa/runner.py diff --git a/README_demo_qa.md b/README_demo_qa.md index 86c9b0d..006da3f 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -52,6 +52,21 @@ python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.y Флаг `--enable-semantic` строит семантический индекс, если передана модель эмбеддингов. +## Batch + +Запустить пакетный прогон вопросов из `cases.jsonl` (по одному JSON на строку, поля `id`, `question`, опционально `expected`/`expected_regex`/`expected_contains` и `skip`): + +```bash +python -m examples.demo_qa.cli batch \ + --data demo_data \ + --schema demo_data/schema.yaml \ + --cases cases.jsonl \ + --out results.jsonl +``` + +* Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). +* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов. +* Флаги `--fail-on (error|mismatch|any)`, `--max-fails` и `--fail-fast` управляют остановкой и кодом выхода (0/1/2). ## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 10f8b74..678a78b 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -1,92 +1,16 @@ from __future__ import annotations import datetime -import json import sys import uuid -from dataclasses import dataclass from pathlib import Path -from typing import Callable, Dict, Optional, Sequence +from typing import Optional, Sequence import readline - -from fetchgraph.core import create_generic_agent -from fetchgraph.core.models import TaskProfile -from fetchgraph.utils import set_run_id +import json from .provider_factory import build_provider - - -@dataclass -class RunArtifacts: - run_id: str - run_dir: Path - plan: str | None = None - context: Dict[str, object] | None = None - answer: str | None = None - error: str | None = None - - -def build_agent(llm, provider) -> Callable[[str, str, Path], RunArtifacts]: - def saver(feature_name: str, parsed: object) -> None: - # Placeholder to satisfy BaseGraphAgent.saver; artifacts captured elsewhere. 
- return None - - task_profile = TaskProfile( - task_name="Demo QA", - goal="Answer analytics questions over the demo dataset", - output_format="Plain text answer", - focus_hints=[ - "Prefer aggregates", - "Use concise answers", - ], - ) - - agent = create_generic_agent( - llm_invoke=llm, - providers={provider.name: provider}, - saver=saver, - task_profile=task_profile, - ) - - def run_question(question: str, run_id: str, run_dir: Path) -> RunArtifacts: - set_run_id(run_id) - artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir) - plan = agent._plan(question) # type: ignore[attr-defined] - artifacts.plan = json.dumps(plan.model_dump(), ensure_ascii=False, indent=2) - try: - ctx = agent._fetch(question, plan) # type: ignore[attr-defined] - artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} - except Exception as exc: # pragma: no cover - demo fallback - artifacts.error = str(exc) - artifacts.context = {"error": str(exc)} - ctx = None - draft = agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] - parsed = agent.domain_parser(draft) - artifacts.answer = str(parsed) - return artifacts - - return run_question - - -def _save_text(path: Path, content: str) -> None: - path.write_text(content, encoding="utf-8") - - -def _save_json(path: Path, payload: object) -> None: - path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") - - -def _save_artifacts(artifacts: RunArtifacts) -> None: - artifacts.run_dir.mkdir(parents=True, exist_ok=True) - if artifacts.plan is not None: - _save_text(artifacts.run_dir / "plan.json", artifacts.plan) - if artifacts.context is not None: - _save_json(artifacts.run_dir / "context.json", artifacts.context) - if artifacts.answer is not None: - _save_text(artifacts.run_dir / "answer.txt", artifacts.answer) - if artifacts.error is not None: - _save_text(artifacts.run_dir / "error.txt", artifacts.error) +from .runner import RunArtifacts, build_agent, save_artifacts def _maybe_add_history(entry: str) -> None: @@ -173,18 +97,18 @@ def start_repl( artifacts: RunArtifacts | None = None try: - artifacts = runner(line, run_id, run_dir) + artifacts = runner.run_question(line, run_id, run_dir) last_artifacts = artifacts - _save_artifacts(artifacts) + save_artifacts(artifacts) if plan_debug_mode in {"on", "once"} and artifacts.plan: print("--- PLAN ---") - print(artifacts.plan) + print(json.dumps(artifacts.plan, ensure_ascii=False, indent=2)) print(artifacts.answer or "") except Exception as exc: # pragma: no cover - REPL resilience - error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir) + error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts - _save_artifacts(error_artifacts) + save_artifacts(error_artifacts) print(f"Error during run {run_id}: {exc}", file=sys.stderr) finally: if plan_debug_mode == "once": diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 5c02c4c..f5763de 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -1,8 +1,11 @@ from __future__ import annotations import argparse +import datetime +import json import sys from pathlib import Path +from typing import Iterable ROOT = Path(__file__).resolve().parents[2] SRC = ROOT / "src" @@ -13,9 +16,128 @@ from .data_gen import generate_and_save from .llm.factory import build_llm from .logging_config import configure_logging +from .provider_factory import build_provider 
+from .runner import RunResult, build_agent, format_status_line, load_cases, run_one, summarize from .settings import load_settings +def write_results(out_path: Path, results: Iterable[RunResult]) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open("w", encoding="utf-8") as f: + for res in results: + f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") + + +def write_summary(out_path: Path, summary: dict) -> Path: + summary_path = out_path.with_name("summary.json") + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return summary_path + + +def is_failure(status: str, fail_on: str) -> bool: + if fail_on == "error": + return status == "error" + if fail_on == "mismatch": + return status in {"error", "mismatch"} + return status in {"error", "mismatch", "skipped"} + + +def handle_chat(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + + log_dir = args.log_dir or args.data / ".runs" / "logs" + log_file = configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + llm_settings = settings.llm + llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" + diagnostics = [ + f"LLM endpoint: {llm_endpoint}", + f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", + f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", + f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " + f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", + ] + if args.enable_semantic: + diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") + else: + diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") + + llm = build_llm(settings) + + start_repl( + args.data, + args.schema, + llm, + enable_semantic=args.enable_semantic, + log_file=log_file, + diagnostics=diagnostics, + ) + return 0 + + +def handle_batch(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + try: + cases = load_cases(args.cases) + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + + artifacts_dir = args.artifacts_dir + if artifacts_dir is None: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + artifacts_dir = args.data / ".runs" / f"batch_{timestamp}" + artifacts_dir.mkdir(parents=True, exist_ok=True) + + log_dir = args.log_dir or args.data / ".runs" / "logs" + configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + results: list[RunResult] = [] + failures = 0 + for case in cases: + result = run_one(case, runner, artifacts_dir) + results.append(result) + print(format_status_line(result)) + if is_failure(result.status, args.fail_on): + failures += 1 + if args.fail_fast or (args.max_fails and failures >= args.max_fails): + break + + write_results(args.out, results) + summary = summarize(results) + summary_path = write_summary(args.out, 
summary) + print(f"Summary: {json.dumps(summary, ensure_ascii=False)}") + print(f"Results written to: {args.out}") + print(f"Summary written to: {summary_path}") + + failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on)) + return 1 if failure_count else 0 + + def main() -> None: parser = argparse.ArgumentParser(description="Demo QA harness for fetchgraph") sub = parser.add_subparsers(dest="command", required=True) @@ -36,53 +158,41 @@ def main() -> None: chat_p.add_argument("--log-stderr", action="store_true", help="Also stream logs to stderr") chat_p.add_argument("--log-jsonl", action="store_true", help="Write logs as JSONL") + batch_p = sub.add_parser("batch", help="Run a batch of questions from a JSONL file") + batch_p.add_argument("--data", type=Path, required=True) + batch_p.add_argument("--schema", type=Path, required=True) + batch_p.add_argument("--config", type=Path, default=None, help="Path to demo_qa.toml") + batch_p.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") + batch_p.add_argument("--out", type=Path, required=True, help="Path to results jsonl") + batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") + batch_p.add_argument("--enable-semantic", action="store_true") + batch_p.add_argument("--log-level", default="INFO", help="Logging level (INFO, DEBUG, etc.)") + batch_p.add_argument("--log-dir", type=Path, default=None, help="Directory for log files") + batch_p.add_argument("--log-stderr", action="store_true", help="Also stream logs to stderr") + batch_p.add_argument("--log-jsonl", action="store_true", help="Write logs as JSONL") + batch_p.add_argument("--max-fails", type=int, default=None, help="Maximum allowed failures before stopping") + batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") + batch_p.add_argument( + "--fail-on", + choices=["error", "mismatch", "any"], + default="mismatch", + help="Which statuses should cause a failing exit code", + ) + args = parser.parse_args() if args.command == "gen": generate_and_save(args.out, rows=args.rows, seed=args.seed, enable_semantic=args.enable_semantic) print(f"Generated data in {args.out}") - return + raise SystemExit(0) if args.command == "chat": - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - raise SystemExit(f"Configuration error: {exc}") - - log_dir = args.log_dir or args.data / ".runs" / "logs" - log_file = configure_logging( - level=args.log_level, - log_dir=log_dir, - to_stderr=args.log_stderr, - jsonl=args.log_jsonl, - run_id=None, - ) - - llm_settings = settings.llm - llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" - diagnostics = [ - f"LLM endpoint: {llm_endpoint}", - f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", - f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", - f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " - f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", - ] - if args.enable_semantic: - diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") - else: - diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") - - llm = build_llm(settings) - - start_repl( - args.data, - args.schema, - llm, - enable_semantic=args.enable_semantic, - log_file=log_file, - 
diagnostics=diagnostics, - ) - return + code = handle_chat(args) + elif args.command == "batch": + code = handle_batch(args) + else: + code = 0 + raise SystemExit(code) if __name__ == "__main__": diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py new file mode 100644 index 0000000..2fc17af --- /dev/null +++ b/examples/demo_qa/runner.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import json +import re +import statistics +import time +import uuid +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, Iterable, List + +from fetchgraph.core import create_generic_agent +from fetchgraph.core.models import TaskProfile +from fetchgraph.utils import set_run_id + + +@dataclass +class RunTimings: + plan_s: float | None = None + fetch_s: float | None = None + synth_s: float | None = None + total_s: float | None = None + + +@dataclass +class ExpectedCheck: + mode: str + expected: str + passed: bool + detail: str | None = None + + +@dataclass +class RunArtifacts: + run_id: str + run_dir: Path + question: str + plan: Dict[str, object] | None = None + context: Dict[str, object] | None = None + answer: str | None = None + raw_synth: str | None = None + error: str | None = None + timings: RunTimings = field(default_factory=RunTimings) + + +@dataclass +class RunResult: + id: str + question: str + status: str + answer: str | None + error: str | None + plan_path: str | None + artifacts_dir: str + timings: RunTimings + expected_check: ExpectedCheck | None = None + + def to_json(self) -> Dict[str, object]: + payload: Dict[str, object] = { + "id": self.id, + "question": self.question, + "status": self.status, + "answer": self.answer, + "error": self.error, + "plan_path": self.plan_path, + "artifacts_dir": self.artifacts_dir, + "timings": self.timings.__dict__, + } + if self.expected_check: + payload["expected_check"] = self.expected_check.__dict__ + return payload + + +@dataclass +class Case: + id: str + question: str + expected: str | None = None + expected_regex: str | None = None + expected_contains: str | None = None + tags: List[str] = field(default_factory=list) + skip: bool = False + + +class AgentRunner: + def __init__(self, llm, provider) -> None: + def saver(feature_name: str, parsed: object) -> None: + # Placeholder to satisfy BaseGraphAgent.saver; artifacts captured elsewhere. 
+ return None + + task_profile = TaskProfile( + task_name="Demo QA", + goal="Answer analytics questions over the demo dataset", + output_format="Plain text answer", + focus_hints=[ + "Prefer aggregates", + "Use concise answers", + ], + ) + + self.agent = create_generic_agent( + llm_invoke=llm, + providers={provider.name: provider}, + saver=saver, + task_profile=task_profile, + ) + + def run_question(self, question: str, run_id: str, run_dir: Path) -> RunArtifacts: + set_run_id(run_id) + artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question) + + started = time.perf_counter() + try: + plan_started = time.perf_counter() + plan = self.agent._plan(question) # type: ignore[attr-defined] + artifacts.timings.plan_s = time.perf_counter() - plan_started + artifacts.plan = plan.model_dump() + + fetch_started = time.perf_counter() + ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] + artifacts.timings.fetch_s = time.perf_counter() - fetch_started + artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} + + synth_started = time.perf_counter() + draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] + artifacts.timings.synth_s = time.perf_counter() - synth_started + artifacts.raw_synth = str(draft) + parsed = self.agent.domain_parser(draft) + artifacts.answer = str(parsed) + except Exception as exc: # pragma: no cover - demo fallback + artifacts.error = str(exc) + finally: + artifacts.timings.total_s = time.perf_counter() - started + + return artifacts + + +def build_agent(llm, provider) -> AgentRunner: + return AgentRunner(llm, provider) + + +def _save_text(path: Path, content: str) -> None: + path.write_text(content, encoding="utf-8") + + +def _save_json(path: Path, payload: object) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def save_artifacts(artifacts: RunArtifacts) -> None: + artifacts.run_dir.mkdir(parents=True, exist_ok=True) + if artifacts.plan is not None: + _save_json(artifacts.run_dir / "plan.json", artifacts.plan) + if artifacts.context is not None: + _save_json(artifacts.run_dir / "context.json", artifacts.context) + if artifacts.answer is not None: + _save_text(artifacts.run_dir / "answer.txt", artifacts.answer) + if artifacts.raw_synth is not None: + _save_text(artifacts.run_dir / "raw_synth.txt", artifacts.raw_synth) + if artifacts.error is not None: + _save_text(artifacts.run_dir / "error.txt", artifacts.error) + + +def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: + if answer is None: + return ExpectedCheck(mode="none", expected="", passed=False, detail="no answer") + if case.expected is not None: + passed = answer.strip() == case.expected.strip() + detail = None if passed else f"expected={case.expected!r}, got={answer!r}" + return ExpectedCheck(mode="exact", expected=case.expected, passed=passed, detail=detail) + if case.expected_regex is not None: + pattern = re.compile(case.expected_regex) + passed = bool(pattern.search(answer)) + detail = None if passed else f"regex {case.expected_regex!r} not found" + return ExpectedCheck(mode="regex", expected=case.expected_regex, passed=passed, detail=detail) + if case.expected_contains is not None: + passed = case.expected_contains in answer + detail = None if passed else f"expected to contain {case.expected_contains!r}" + return ExpectedCheck(mode="contains", expected=case.expected_contains, passed=passed, detail=detail) + return None + + +def run_one(case: Case, runner: 
AgentRunner, artifacts_root: Path) -> RunResult: + run_id = uuid.uuid4().hex[:8] + run_dir = artifacts_root / f"{case.id}_{run_id}" + if case.skip: + run_dir.mkdir(parents=True, exist_ok=True) + _save_text(run_dir / "skipped.txt", "Skipped by request") + return RunResult( + id=case.id, + question=case.question, + status="skipped", + answer=None, + error=None, + plan_path=None, + artifacts_dir=str(run_dir), + timings=RunTimings(), + expected_check=None, + ) + artifacts = runner.run_question(case.question, run_id, run_dir) + save_artifacts(artifacts) + + expected_check = _match_expected(case, artifacts.answer) + status = "ok" + if artifacts.error: + status = "error" + elif expected_check and not expected_check.passed: + status = "mismatch" + + plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None + result = RunResult( + id=case.id, + question=case.question, + status=status, + answer=artifacts.answer, + error=artifacts.error, + plan_path=plan_path, + artifacts_dir=str(run_dir), + timings=artifacts.timings, + expected_check=expected_check, + ) + return result + + +def summarize(results: Iterable[RunResult]) -> Dict[str, object]: + totals = {"ok": 0, "error": 0, "mismatch": 0, "skipped": 0} + total_times: List[float] = [] + for res in results: + totals[res.status] = totals.get(res.status, 0) + 1 + if res.timings.total_s is not None: + total_times.append(res.timings.total_s) + + summary: Dict[str, object] = { + "total": sum(totals.values()), + **totals, + } + if total_times: + summary["avg_total_s"] = statistics.fmean(total_times) + summary["median_total_s"] = statistics.median(total_times) + else: + summary["avg_total_s"] = None + summary["median_total_s"] = None + return summary + + +def load_cases(path: Path) -> List[Case]: + if not path.exists(): + raise FileNotFoundError(f"Cases file not found: {path}") + cases: List[Case] = [] + with path.open("r", encoding="utf-8") as f: + for lineno, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + payload = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc + if "id" not in payload or "question" not in payload: + raise ValueError(f"Case on line {lineno} missing required fields 'id' and 'question'") + case = Case( + id=str(payload["id"]), + question=str(payload["question"]), + expected=payload.get("expected"), + expected_regex=payload.get("expected_regex"), + expected_contains=payload.get("expected_contains"), + tags=list(payload.get("tags", []) or []), + skip=bool(payload.get("skip", False)), + ) + cases.append(case) + return cases + + +def format_status_line(result: RunResult) -> str: + timing = f"{result.timings.total_s:.2f}s" if result.timings.total_s is not None else "n/a" + if result.status == "ok": + return f"OK {result.id} {timing}" + if result.status == "skipped": + return f"SKIP {result.id}" + reason = result.error or (result.expected_check.detail if result.expected_check else "") + return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" + + +__all__ = [ + "AgentRunner", + "Case", + "ExpectedCheck", + "RunArtifacts", + "RunResult", + "build_agent", + "format_status_line", + "load_cases", + "run_one", + "save_artifacts", + "summarize", +] From ff05569dc6b200fdaaf30968c5a9ad42bb0f0ec8 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 15:47:28 +0300 Subject: [PATCH 02/92] Fix batch logging path and update CLI 
defaults --- README_demo_qa.md | 5 +- examples/demo_qa/cli.py | 192 +++++++++++++++++++++--- examples/demo_qa/runner.py | 273 ++++++++++++++++++++++++++++++----- tests/test_demo_qa_runner.py | 111 ++++++++++++++ 4 files changed, 526 insertions(+), 55 deletions(-) create mode 100644 tests/test_demo_qa_runner.py diff --git a/README_demo_qa.md b/README_demo_qa.md index 006da3f..c654c01 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -65,8 +65,9 @@ python -m examples.demo_qa.cli batch \ ``` * Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). -* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов. -* Флаги `--fail-on (error|mismatch|any)`, `--max-fails` и `--fail-fast` управляют остановкой и кодом выхода (0/1/2). +* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. +* Флаги `--fail-on (error|mismatch|any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to` и `--only-failed-from` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). +* Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. ## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index f5763de..b49e632 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -2,10 +2,12 @@ import argparse import datetime +import hashlib import json import sys +import uuid from pathlib import Path -from typing import Iterable +from typing import Iterable, Mapping, Optional ROOT = Path(__file__).resolve().parents[2] SRC = ROOT / "src" @@ -17,7 +19,16 @@ from .llm.factory import build_llm from .logging_config import configure_logging from .provider_factory import build_provider -from .runner import RunResult, build_agent, format_status_line, load_cases, run_one, summarize +from .runner import ( + RunResult, + build_agent, + compare_results, + format_status_line, + load_cases, + load_results, + run_one, + summarize, +) from .settings import load_settings @@ -34,12 +45,32 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path -def is_failure(status: str, fail_on: str) -> bool: +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + failure_statuses = {"error", "mismatch", "failed"} if fail_on == "error": - return status == "error" - if fail_on == "mismatch": - return status in {"error", "mismatch"} - return status in {"error", "mismatch", "skipped"} + failure_statuses = {"error"} + elif fail_on == "mismatch": + failure_statuses = {"error", "mismatch", "failed"} + else: + failure_statuses = {"error", "mismatch", "failed", "unchecked"} + if require_assert and status == "unchecked": + return True + return status in failure_statuses + + +def _hash_file(path: Path) -> str: + data = path.read_bytes() + return hashlib.sha256(data).hexdigest() + + +def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: + llm_settings = settings.llm + return { + "base_url": llm_settings.base_url or "https://api.openai.com/v1", + "plan_model": llm_settings.plan_model, + "synth_model": llm_settings.synth_model, + "cases_hash": _hash_file(cases_path), + } def handle_chat(args) -> int: @@ -86,6 +117,9 @@ def handle_chat(args) -> int: def handle_batch(args) -> int: + 
started_at = datetime.datetime.utcnow() + run_id = uuid.uuid4().hex[:8] + try: settings = load_settings(config_path=args.config, data_dir=args.data) except Exception as exc: @@ -97,10 +131,42 @@ def handle_batch(args) -> int: print(f"Cases error: {exc}", file=sys.stderr) return 2 + baseline_for_filter: Optional[Mapping[str, RunResult]] = None + baseline_for_compare: Optional[Mapping[str, RunResult]] = None + + if args.only_failed_from: + try: + baseline_for_filter = load_results(args.only_failed_from) + except Exception as exc: + print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) + return 2 + + if args.compare_to: + try: + if args.only_failed_from and args.compare_to.resolve() == args.only_failed_from.resolve(): + baseline_for_compare = baseline_for_filter + else: + baseline_for_compare = load_results(args.compare_to) + except Exception as exc: + print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) + return 2 + + if baseline_for_filter: + bad_statuses = {"mismatch", "failed", "error"} + if args.require_assert: + bad_statuses.add("unchecked") + target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} + cases = [case for case in cases if case.id in target_ids] + artifacts_dir = args.artifacts_dir if artifacts_dir is None: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - artifacts_dir = args.data / ".runs" / f"batch_{timestamp}" + artifacts_dir = args.data / ".runs" + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + results_path = args.out or (run_folder / "results.jsonl") + artifacts_root = run_folder / "cases" + results_path.parent.mkdir(parents=True, exist_ok=True) + summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) log_dir = args.log_dir or args.data / ".runs" / "logs" @@ -119,23 +185,99 @@ def handle_batch(args) -> int: results: list[RunResult] = [] failures = 0 for case in cases: - result = run_one(case, runner, artifacts_dir) + result = run_one(case, runner, artifacts_root) results.append(result) - print(format_status_line(result)) - if is_failure(result.status, args.fail_on): + if not args.quiet: + print(format_status_line(result)) + if is_failure(result.status, args.fail_on, args.require_assert): failures += 1 if args.fail_fast or (args.max_fails and failures >= args.max_fails): break - write_results(args.out, results) - summary = summarize(results) - summary_path = write_summary(args.out, summary) - print(f"Summary: {json.dumps(summary, ensure_ascii=False)}") - print(f"Results written to: {args.out}") + write_results(results_path, results) + counts = summarize(results) + + results_by_id = {r.id: r for r in results} + diff_block: dict | None = None + baseline_path: Path | None = None + if baseline_for_compare: + baseline_path = args.compare_to or args.only_failed_from + diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) + if baseline_path: + diff["baseline_path"] = str(baseline_path) + diff_block = diff + + failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) + exit_code = 1 if failure_count else 0 + + ended_at = datetime.datetime.utcnow() + duration_ms = int((ended_at - started_at).total_seconds() * 1000) + summary = { + "run_id": run_id, + "started_at": started_at.isoformat() + "Z", + "ended_at": ended_at.isoformat() + "Z", + 
"duration_ms": duration_ms, + "counts": counts, + "exit_code": exit_code, + "config_fingerprint": build_config_fingerprint(settings, args.cases), + "results_path": str(results_path), + "require_assert": args.require_assert, + "fail_on": args.fail_on, + } + if diff_block: + summary["diff"] = diff_block + + summary_path = write_summary(results_path, summary) + + latest_path = run_folder.parent / "latest.txt" + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + + bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) + unchecked = counts.get("unchecked", 0) + if args.require_assert: + bad_count += unchecked + summary_line = ( + f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " + f"OK: {counts.get('ok', 0)} | BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" + ) + + if args.quiet: + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + return exit_code + + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + + failures_list: dict[str, RunResult] = {} + for res in results: + if is_failure(res.status, args.fail_on, args.require_assert) or ( + args.require_assert and res.status == "unchecked" + ): + failures_list[res.id] = res + if failures_list: + print(f"Failures (top {args.show_failures}):") + for res in list(failures_list.values())[: args.show_failures]: + reason = res.reason or res.error or "" + print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") + + print(f"Results written to: {results_path}") print(f"Summary written to: {summary_path}") - failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on)) - return 1 if failure_count else 0 + return exit_code def main() -> None: @@ -163,7 +305,7 @@ def main() -> None: batch_p.add_argument("--schema", type=Path, required=True) batch_p.add_argument("--config", type=Path, default=None, help="Path to demo_qa.toml") batch_p.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") - batch_p.add_argument("--out", type=Path, required=True, help="Path to results jsonl") + batch_p.add_argument("--out", type=Path, required=False, default=None, help="Path to results jsonl") batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") batch_p.add_argument("--enable-semantic", action="store_true") batch_p.add_argument("--log-level", default="INFO", help="Logging level (INFO, DEBUG, etc.)") @@ -178,6 +320,16 @@ def main() -> None: default="mismatch", help="Which statuses should cause a failing exit code", ) + batch_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures") + batch_p.add_argument("--compare-to", type=Path, default=None, help="Path to previous results.jsonl for diff") + batch_p.add_argument( + "--only-failed-from", + type=Path, + default=None, + help="Run only cases that failed/mismatched/errored in a previous results.jsonl", + ) + batch_p.add_argument("--quiet", 
action="store_true", help="Print only summary and exit code") + batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") args = parser.parse_args() diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 2fc17af..a6a9778 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -7,7 +7,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Mapping, Optional from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile @@ -48,11 +48,15 @@ class RunResult: id: str question: str status: str - answer: str | None - error: str | None - plan_path: str | None + checked: bool + reason: str | None + details: Dict[str, object] | None artifacts_dir: str - timings: RunTimings + duration_ms: int + answer: str | None = None + error: str | None = None + plan_path: str | None = None + timings: RunTimings | None = None expected_check: ExpectedCheck | None = None def to_json(self) -> Dict[str, object]: @@ -60,11 +64,15 @@ def to_json(self) -> Dict[str, object]: "id": self.id, "question": self.question, "status": self.status, + "checked": self.checked, + "reason": self.reason, + "details": self.details, + "artifacts_dir": self.artifacts_dir, + "duration_ms": self.duration_ms, "answer": self.answer, "error": self.error, "plan_path": self.plan_path, - "artifacts_dir": self.artifacts_dir, - "timings": self.timings.__dict__, + "timings": self.timings.__dict__ if self.timings else None, } if self.expected_check: payload["expected_check"] = self.expected_check.__dict__ @@ -81,6 +89,10 @@ class Case: tags: List[str] = field(default_factory=list) skip: bool = False + @property + def has_asserts(self) -> bool: + return any([self.expected, self.expected_regex, self.expected_contains]) + class AgentRunner: def __init__(self, llm, provider) -> None: @@ -161,9 +173,18 @@ def save_artifacts(artifacts: RunArtifacts) -> None: _save_text(artifacts.run_dir / "error.txt", artifacts.error) +def save_status(result: RunResult) -> None: + status_path = Path(result.artifacts_dir) / "status.json" + status_path.parent.mkdir(parents=True, exist_ok=True) + _save_json(status_path, result.to_json()) + + def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: + if not case.has_asserts: + return None + expected_value = case.expected or case.expected_regex or case.expected_contains or "" if answer is None: - return ExpectedCheck(mode="none", expected="", passed=False, detail="no answer") + return ExpectedCheck(mode="none", expected=expected_value, passed=False, detail="no answer") if case.expected is not None: passed = answer.strip() == case.expected.strip() detail = None if passed else f"expected={case.expected!r}, got={answer!r}" @@ -180,58 +201,91 @@ def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: return None +def _build_result( + case: Case, artifacts: RunArtifacts, run_dir: Path, expected_check: ExpectedCheck | None +) -> RunResult: + status = "unchecked" + reason: str | None = None + details: Dict[str, object] | None = None + + if artifacts.error: + status = "error" + reason = artifacts.error + details = {"error": artifacts.error} + elif expected_check: + status = "ok" if expected_check.passed else "mismatch" + reason = expected_check.detail + details = {"expected_check": expected_check.__dict__} + else: + reason = "no expectations provided" + details = {"note": 
"no expectations provided"} + + plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None + duration_ms = int((artifacts.timings.total_s or 0.0) * 1000) + return RunResult( + id=case.id, + question=case.question, + status=status, + checked=case.has_asserts, + reason=reason, + details=details, + artifacts_dir=str(run_dir), + duration_ms=duration_ms, + answer=artifacts.answer, + error=artifacts.error, + plan_path=plan_path, + timings=artifacts.timings, + expected_check=expected_check, + ) + + def run_one(case: Case, runner: AgentRunner, artifacts_root: Path) -> RunResult: run_id = uuid.uuid4().hex[:8] run_dir = artifacts_root / f"{case.id}_{run_id}" if case.skip: run_dir.mkdir(parents=True, exist_ok=True) _save_text(run_dir / "skipped.txt", "Skipped by request") - return RunResult( + result = RunResult( id=case.id, question=case.question, status="skipped", + checked=False, + reason="skipped", + details=None, + artifacts_dir=str(run_dir), + duration_ms=0, answer=None, error=None, plan_path=None, - artifacts_dir=str(run_dir), timings=RunTimings(), expected_check=None, ) + save_status(result) + return result + artifacts = runner.run_question(case.question, run_id, run_dir) save_artifacts(artifacts) expected_check = _match_expected(case, artifacts.answer) - status = "ok" - if artifacts.error: - status = "error" - elif expected_check and not expected_check.passed: - status = "mismatch" - - plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None - result = RunResult( - id=case.id, - question=case.question, - status=status, - answer=artifacts.answer, - error=artifacts.error, - plan_path=plan_path, - artifacts_dir=str(run_dir), - timings=artifacts.timings, - expected_check=expected_check, - ) + result = _build_result(case, artifacts, run_dir, expected_check) + save_status(result) return result def summarize(results: Iterable[RunResult]) -> Dict[str, object]: - totals = {"ok": 0, "error": 0, "mismatch": 0, "skipped": 0} + totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0} total_times: List[float] = [] + checked_total = 0 for res in results: totals[res.status] = totals.get(res.status, 0) + 1 - if res.timings.total_s is not None: - total_times.append(res.timings.total_s) + if res.duration_ms is not None: + total_times.append(res.duration_ms / 1000) + if res.checked and res.status in {"ok", "mismatch", "failed", "error"}: + checked_total += 1 summary: Dict[str, object] = { "total": sum(totals.values()), + "checked_total": checked_total, **totals, } if total_times: @@ -271,13 +325,162 @@ def load_cases(path: Path) -> List[Case]: return cases +def _build_timings(payload: Mapping[str, object] | None) -> RunTimings | None: + if not payload: + return None + return RunTimings( + plan_s=payload.get("plan_s"), # type: ignore[arg-type] + fetch_s=payload.get("fetch_s"), # type: ignore[arg-type] + synth_s=payload.get("synth_s"), # type: ignore[arg-type] + total_s=payload.get("total_s"), # type: ignore[arg-type] + ) + + +def _build_expected_check(payload: Mapping[str, object] | None) -> ExpectedCheck | None: + if not payload: + return None + return ExpectedCheck( + mode=str(payload.get("mode", "")), + expected=str(payload.get("expected", "")), + passed=bool(payload.get("passed", False)), + detail=payload.get("detail"), # type: ignore[arg-type] + ) + + +def _duration_from_payload(payload: Mapping[str, object]) -> int: + if "duration_ms" in payload and payload["duration_ms"] is not None: + try: + return int(payload["duration_ms"]) # 
type: ignore[arg-type] + except Exception: + pass + timings = payload.get("timings") + if isinstance(timings, Mapping) and timings.get("total_s") is not None: + try: + return int(float(timings["total_s"]) * 1000) # type: ignore[arg-type] + except Exception: + return 0 + return 0 + + +def _run_result_from_payload(payload: Mapping[str, object]) -> RunResult: + expected_check = _build_expected_check(payload.get("expected_check") if isinstance(payload, Mapping) else None) + timings = _build_timings(payload.get("timings") if isinstance(payload, Mapping) else None) + checked = bool(payload.get("checked", False)) + if expected_check and not checked: + checked = True + status = str(payload.get("status", "error")) + duration_ms = _duration_from_payload(payload) + reason = payload.get("reason") # type: ignore[arg-type] + details = payload.get("details") if isinstance(payload.get("details"), dict) else None + artifacts_dir = str(payload.get("artifacts_dir", "")) + if not artifacts_dir: + raise ValueError("artifacts_dir missing in result payload") + return RunResult( + id=str(payload.get("id", "")), + question=str(payload.get("question", "")), + status=status, + checked=checked, + reason=reason, + details=details, + artifacts_dir=artifacts_dir, + duration_ms=duration_ms, + answer=payload.get("answer"), # type: ignore[arg-type] + error=payload.get("error"), # type: ignore[arg-type] + plan_path=payload.get("plan_path"), # type: ignore[arg-type] + timings=timings, + expected_check=expected_check, + ) + + +def load_results(path: Path) -> Dict[str, RunResult]: + results: Dict[str, RunResult] = {} + if not path.exists(): + raise FileNotFoundError(f"Results file not found: {path}") + with path.open("r", encoding="utf-8") as f: + for lineno, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + payload = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid result JSON on line {lineno}: {exc}") from exc + result = _run_result_from_payload(payload) + results[result.id] = result + return results + + +def _bucket(status: str, checked: bool, require_assert: bool) -> str: + if status == "ok": + return "OK" + if status in {"mismatch", "failed", "error"}: + return "BAD" + if status == "unchecked": + return "BAD" if require_assert else "UNCHECKED" + return "NEUTRAL" + + +def compare_results( + baseline: Mapping[str, RunResult], + current: Mapping[str, RunResult], + *, + require_assert: bool, +) -> Dict[str, object]: + new_ok: List[str] = [] + regressed: List[str] = [] + still_ok: List[str] = [] + still_bad: List[str] = [] + new_unchecked: List[str] = [] + status_changes: Dict[str, Dict[str, str]] = {} + new_cases: List[str] = [] + + for case_id, res in current.items(): + base_res = baseline.get(case_id) + new_bucket = _bucket(res.status, res.checked, require_assert) + if base_res is None: + new_cases.append(case_id) + if new_bucket == "OK": + new_ok.append(case_id) + elif new_bucket == "BAD": + still_bad.append(case_id) + status_changes[case_id] = {"from": "new", "to": res.status} + continue + + base_bucket = _bucket(base_res.status, base_res.checked, require_assert) + if base_res.checked and res.status == "unchecked": + new_unchecked.append(case_id) + if base_bucket == "OK" and new_bucket in {"BAD", "UNCHECKED"}: + regressed.append(case_id) + elif base_bucket in {"BAD", "UNCHECKED"} and new_bucket == "OK": + new_ok.append(case_id) + elif base_bucket == "OK" and new_bucket == "OK": + still_ok.append(case_id) + elif base_bucket in {"BAD", "UNCHECKED"} and 
new_bucket in {"BAD", "UNCHECKED"}: + still_bad.append(case_id) + + if base_res.status != res.status: + status_changes[case_id] = {"from": base_res.status, "to": res.status} + + return { + "new_ok": new_ok, + "regressed": regressed, + "still_ok": still_ok, + "still_bad": still_bad, + "new_unchecked": new_unchecked, + "status_changes": status_changes, + "new_cases": new_cases, + } + + def format_status_line(result: RunResult) -> str: - timing = f"{result.timings.total_s:.2f}s" if result.timings.total_s is not None else "n/a" + timing = f"{result.duration_ms / 1000:.2f}s" if result.status == "ok": return f"OK {result.id} {timing}" if result.status == "skipped": return f"SKIP {result.id}" - reason = result.error or (result.expected_check.detail if result.expected_check else "") + if result.status == "unchecked": + return f"UNCHECKED {result.id} {timing}" + reason = result.reason or "" return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" @@ -288,9 +491,13 @@ def format_status_line(result: RunResult) -> str: "RunArtifacts", "RunResult", "build_agent", + "compare_results", "format_status_line", + "load_results", "load_cases", "run_one", "save_artifacts", + "save_status", "summarize", + "_match_expected", ] diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py new file mode 100644 index 0000000..6bdb95e --- /dev/null +++ b/tests/test_demo_qa_runner.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results + + +def test_match_expected_unchecked_when_no_expectations() -> None: + case = Case(id="c1", question="What is foo?") + assert _match_expected(case, "anything") is None + + +def test_match_expected_contains_pass_and_fail() -> None: + case = Case(id="c2", question="Q", expected_contains="bar") + + match = _match_expected(case, "value bar baz") + assert match is not None + assert match.passed is True + + mismatch = _match_expected(case, "value baz") + assert mismatch is not None + assert mismatch.passed is False + assert "bar" in (mismatch.detail or "") + + missing_answer = _match_expected(case, None) + assert missing_answer is not None + assert missing_answer.passed is False + assert missing_answer.detail == "no answer" + + +def test_compare_results_tracks_regressions_and_improvements() -> None: + baseline = { + "ok_to_bad": RunResult( + id="ok_to_bad", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/ok", + duration_ms=10, + ), + "err_to_ok": RunResult( + id="err_to_ok", + question="", + status="error", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/err", + duration_ms=10, + ), + "checked_to_unchecked": RunResult( + id="checked_to_unchecked", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/ok2", + duration_ms=10, + ), + } + + current = { + "ok_to_bad": RunResult( + id="ok_to_bad", + question="", + status="mismatch", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/ok", + duration_ms=10, + ), + "err_to_ok": RunResult( + id="err_to_ok", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/err", + duration_ms=10, + ), + "checked_to_unchecked": RunResult( + id="checked_to_unchecked", + question="", + status="unchecked", + checked=False, + reason=None, + details=None, + artifacts_dir="/tmp/ok2", + duration_ms=10, + ), + "new_ok": RunResult( + id="new_ok", + question="", + status="ok", + 
checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/new", + duration_ms=10, + ), + } + + diff = compare_results(baseline, current, require_assert=True) + + assert "ok_to_bad" in diff["regressed"] + assert "err_to_ok" in diff["new_ok"] + assert "checked_to_unchecked" in diff["new_unchecked"] + assert "new_ok" in diff["new_ok"] From 80b02390e1e9d74d453475f7da3613f4230c3d19 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 15:47:33 +0300 Subject: [PATCH 03/92] Add regression-friendly batch controls and case helpers --- README_demo_qa.md | 4 +- examples/demo_qa/cli.py | 149 ++++++++++++++++++++++++++++++++--- examples/demo_qa/runner.py | 47 ++++++----- tests/test_demo_qa_runner.py | 42 +++++++++- 4 files changed, 212 insertions(+), 30 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index c654c01..4036a23 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -66,8 +66,10 @@ python -m examples.demo_qa.cli batch \ * Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). * `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. -* Флаги `--fail-on (error|mismatch|any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to` и `--only-failed-from` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). +* Флаги `--fail-on (error|mismatch/unchecked/any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). * Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. +* Быстрый фокус на упавших: `--only-failed` возьмёт `runs/latest/results.jsonl`, `--show-artifacts` печатает пути, репро-команды выводятся для каждого FAIL. +* Команды уровня кейса: `demo_qa case run --cases ...` и `demo_qa case open --run runs/latest` для быстрого воспроизведения. 
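
A minimal sketch of the flow the bullets above describe, assuming the default paths from this patch; `<case_id>` is a placeholder for a real case id:

```bash
# First pass: without --out the results land under demo_data/.runs/runs/<timestamp>_cases/
python -m examples.demo_qa.cli batch \
  --data demo_data \
  --schema demo_data/schema.yaml \
  --cases cases.jsonl \
  --require-assert

# Focused re-run: only the cases that failed in the latest run,
# diffed against that same baseline automatically
python -m examples.demo_qa.cli batch \
  --data demo_data \
  --schema demo_data/schema.yaml \
  --cases cases.jsonl \
  --only-failed --show-artifacts

# Reproduce and inspect a single failing case
python -m examples.demo_qa.cli case run <case_id> --cases cases.jsonl --data demo_data --schema demo_data/schema.yaml
python -m examples.demo_qa.cli case open <case_id> --artifacts-dir demo_data/.runs
```
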
## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index b49e632..f2ae789 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -51,8 +51,10 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: failure_statuses = {"error"} elif fail_on == "mismatch": failure_statuses = {"error", "mismatch", "failed"} - else: + elif fail_on == "unchecked": failure_statuses = {"error", "mismatch", "failed", "unchecked"} + else: + failure_statuses = {"error", "mismatch", "failed", "unchecked", "skipped"} if require_assert and status == "unchecked": return True return status in failure_statuses @@ -73,6 +75,25 @@ def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object] } +def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: + latest_file = artifacts_dir / "runs" / "latest.txt" + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + +def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: + cases_dir = run_path / "cases" + if not cases_dir.exists(): + return None + matches = sorted(cases_dir.glob(f"{case_id}_*")) + if matches: + return matches[-1] + return None + + def handle_chat(args) -> int: try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -116,6 +137,71 @@ def handle_chat(args) -> int: return 0 +def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: + if path is not None: + return path + return _load_latest_run(artifacts_dir) + + +def handle_case_run(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + try: + cases = {c.id: c for c in load_cases(args.cases)} + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + if args.case_id not in cases: + print(f"Case {args.case_id} not found in {args.cases}", file=sys.stderr) + return 2 + + artifacts_dir = args.artifacts_dir or (args.data / ".runs") + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + artifacts_root = run_folder / "cases" + results_path = run_folder / "results.jsonl" + + log_dir = artifacts_dir / "logs" + configure_logging(level="INFO", log_dir=log_dir, to_stderr=True, jsonl=False, run_id=None) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) + write_results(results_path, [result]) + save_path = run_folder.parent / "latest.txt" + save_path.parent.mkdir(parents=True, exist_ok=True) + save_path.write_text(str(run_folder), encoding="utf-8") + + print(format_status_line(result)) + print(f"Artifacts: {result.artifacts_dir}") + return 0 + + +def handle_case_open(args) -> int: + artifacts_dir = args.artifacts_dir or Path(".") / ".runs" + run_path = _resolve_run_path(args.run, artifacts_dir) + if not run_path: + print("No run found. 
Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) + return 2 + case_dir = _find_case_artifact(run_path, args.case_id) + if not case_dir: + print(f"Case {args.case_id} not found under {run_path}", file=sys.stderr) + return 2 + print(f"Case {args.case_id} artifacts: {case_dir}") + plan = case_dir / "plan.json" + answer = case_dir / "answer.txt" + status = case_dir / "status.json" + for path in [plan, answer, status]: + if path.exists(): + print(f"- {path}") + return 0 + + def handle_batch(args) -> int: started_at = datetime.datetime.utcnow() run_id = uuid.uuid4().hex[:8] @@ -134,26 +220,34 @@ def handle_batch(args) -> int: baseline_for_filter: Optional[Mapping[str, RunResult]] = None baseline_for_compare: Optional[Mapping[str, RunResult]] = None - if args.only_failed_from: + baseline_filter_path = args.only_failed_from + if args.only_failed and not baseline_filter_path: + latest = _load_latest_run(args.artifacts_dir or args.data / ".runs") + if latest: + baseline_filter_path = latest / "results.jsonl" + if baseline_filter_path: try: - baseline_for_filter = load_results(args.only_failed_from) + baseline_for_filter = load_results(baseline_filter_path) except Exception as exc: print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) return 2 - if args.compare_to: + compare_path = args.compare_to + if compare_path is None and args.only_failed and baseline_filter_path: + compare_path = baseline_filter_path + if compare_path: try: - if args.only_failed_from and args.compare_to.resolve() == args.only_failed_from.resolve(): + if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): baseline_for_compare = baseline_for_filter else: - baseline_for_compare = load_results(args.compare_to) + baseline_for_compare = load_results(compare_path) except Exception as exc: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 if baseline_for_filter: bad_statuses = {"mismatch", "failed", "error"} - if args.require_assert: + if args.require_assert or args.fail_on == "unchecked": bad_statuses.add("unchecked") target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} cases = [case for case in cases if case.id in target_ids] @@ -185,7 +279,7 @@ def handle_batch(args) -> int: results: list[RunResult] = [] failures = 0 for case in cases: - result = run_one(case, runner, artifacts_root) + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) results.append(result) if not args.quiet: print(format_status_line(result)) @@ -239,7 +333,8 @@ def handle_batch(args) -> int: bad_count += unchecked summary_line = ( f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " - f"OK: {counts.get('ok', 0)} | BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" + f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked OK: {counts.get('unchecked_ok', 0)} | " + f"BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" ) if args.quiet: @@ -272,7 +367,14 @@ def handle_batch(args) -> int: print(f"Failures (top {args.show_failures}):") for res in list(failures_list.values())[: args.show_failures]: reason = res.reason or res.error or "" + repro = ( + f"demo_qa case run {res.id} --cases {args.cases} --data {args.data} " + f"--schema {args.schema}" + (" --plan-only" if args.plan_only else "") + ) print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") + if args.show_artifacts: + print(f" 
artifacts: {res.artifacts_dir}") + print(f" repro: {repro}") print(f"Results written to: {results_path}") print(f"Summary written to: {summary_path}") @@ -316,7 +418,7 @@ def main() -> None: batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") batch_p.add_argument( "--fail-on", - choices=["error", "mismatch", "any"], + choices=["error", "mismatch", "unchecked", "any"], default="mismatch", help="Which statuses should cause a failing exit code", ) @@ -328,8 +430,28 @@ def main() -> None: default=None, help="Run only cases that failed/mismatched/errored in a previous results.jsonl", ) + batch_p.add_argument("--only-failed", action="store_true", help="Use latest run for --only-failed-from automatically") + batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") + batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") + + case_p = sub.add_parser("case", help="Single-case utilities") + case_sub = case_p.add_subparsers(dest="case_command", required=True) + case_run = case_sub.add_parser("run", help="Run a single case by id") + case_run.add_argument("case_id") + case_run.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") + case_run.add_argument("--data", type=Path, required=True) + case_run.add_argument("--schema", type=Path, required=True) + case_run.add_argument("--config", type=Path, default=None) + case_run.add_argument("--enable-semantic", action="store_true") + case_run.add_argument("--artifacts-dir", type=Path, default=None) + case_run.add_argument("--plan-only", action="store_true") + + case_open = case_sub.add_parser("open", help="Show artifacts for a case in a run folder") + case_open.add_argument("case_id") + case_open.add_argument("--run", type=Path, default=None, help="Run folder (defaults to latest)") + case_open.add_argument("--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup") args = parser.parse_args() @@ -342,6 +464,13 @@ def main() -> None: code = handle_chat(args) elif args.command == "batch": code = handle_batch(args) + elif args.command == "case": + if args.case_command == "run": + code = handle_case_run(args) + elif args.case_command == "open": + code = handle_case_open(args) + else: + code = 1 else: code = 0 raise SystemExit(code) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index a6a9778..d9ad5f5 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -41,6 +41,7 @@ class RunArtifacts: raw_synth: str | None = None error: str | None = None timings: RunTimings = field(default_factory=RunTimings) + plan_only: bool = False @dataclass @@ -117,9 +118,9 @@ def saver(feature_name: str, parsed: object) -> None: task_profile=task_profile, ) - def run_question(self, question: str, run_id: str, run_dir: Path) -> RunArtifacts: + def run_question(self, question: str, run_id: str, run_dir: Path, *, plan_only: bool = False) -> RunArtifacts: set_run_id(run_id) - artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question) + artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question, plan_only=plan_only) started = time.perf_counter() try: @@ -128,17 +129,18 @@ def run_question(self, question: str, run_id: str, run_dir: Path) -> 
RunArtifact artifacts.timings.plan_s = time.perf_counter() - plan_started artifacts.plan = plan.model_dump() - fetch_started = time.perf_counter() - ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] - artifacts.timings.fetch_s = time.perf_counter() - fetch_started - artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} - - synth_started = time.perf_counter() - draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] - artifacts.timings.synth_s = time.perf_counter() - synth_started - artifacts.raw_synth = str(draft) - parsed = self.agent.domain_parser(draft) - artifacts.answer = str(parsed) + if not plan_only: + fetch_started = time.perf_counter() + ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] + artifacts.timings.fetch_s = time.perf_counter() - fetch_started + artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} + + synth_started = time.perf_counter() + draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] + artifacts.timings.synth_s = time.perf_counter() - synth_started + artifacts.raw_synth = str(draft) + parsed = self.agent.domain_parser(draft) + artifacts.answer = str(parsed) except Exception as exc: # pragma: no cover - demo fallback artifacts.error = str(exc) finally: @@ -217,8 +219,9 @@ def _build_result( reason = expected_check.detail details = {"expected_check": expected_check.__dict__} else: - reason = "no expectations provided" - details = {"note": "no expectations provided"} + status = "unchecked" + reason = "plan-only" if artifacts.plan_only else "no expectations provided" + details = {"note": reason} plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None duration_ms = int((artifacts.timings.total_s or 0.0) * 1000) @@ -239,7 +242,7 @@ def _build_result( ) -def run_one(case: Case, runner: AgentRunner, artifacts_root: Path) -> RunResult: +def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: bool = False) -> RunResult: run_id = uuid.uuid4().hex[:8] run_dir = artifacts_root / f"{case.id}_{run_id}" if case.skip: @@ -263,7 +266,7 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path) -> RunResult: save_status(result) return result - artifacts = runner.run_question(case.question, run_id, run_dir) + artifacts = runner.run_question(case.question, run_id, run_dir, plan_only=plan_only) save_artifacts(artifacts) expected_check = _match_expected(case, artifacts.answer) @@ -276,16 +279,24 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0} total_times: List[float] = [] checked_total = 0 + checked_ok = 0 + unchecked_ok = 0 for res in results: totals[res.status] = totals.get(res.status, 0) + 1 if res.duration_ms is not None: total_times.append(res.duration_ms / 1000) if res.checked and res.status in {"ok", "mismatch", "failed", "error"}: checked_total += 1 + if res.status == "ok" and res.checked: + checked_ok += 1 + if res.status == "unchecked": + unchecked_ok += 1 summary: Dict[str, object] = { "total": sum(totals.values()), "checked_total": checked_total, + "checked_ok": checked_ok, + "unchecked_ok": unchecked_ok, **totals, } if total_times: @@ -412,7 +423,7 @@ def load_results(path: Path) -> Dict[str, RunResult]: def _bucket(status: str, checked: bool, require_assert: bool) -> str: if status == "ok": - return "OK" + return "OK" if checked else "UNCHECKED" if status in 
{"mismatch", "failed", "error"}: return "BAD" if status == "unchecked": diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 6bdb95e..d83c562 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -1,6 +1,6 @@ from __future__ import annotations -from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results +from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results, summarize def test_match_expected_unchecked_when_no_expectations() -> None: @@ -109,3 +109,43 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: assert "err_to_ok" in diff["new_ok"] assert "checked_to_unchecked" in diff["new_unchecked"] assert "new_ok" in diff["new_ok"] + + +def test_summarize_counts_checked_and_unchecked() -> None: + results = [ + RunResult( + id="c1", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/a", + duration_ms=10, + ), + RunResult( + id="c2", + question="", + status="unchecked", + checked=False, + reason=None, + details=None, + artifacts_dir="/b", + duration_ms=5, + ), + RunResult( + id="c3", + question="", + status="mismatch", + checked=True, + reason=None, + details=None, + artifacts_dir="/c", + duration_ms=7, + ), + ] + + summary = summarize(results) + assert summary["checked_ok"] == 1 + assert summary["unchecked_ok"] == 1 # counts unchecked separately + assert summary["checked_total"] == 2 From f3268695e7267bf073063409951eaefac7b6b8ff Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 15:56:09 +0300 Subject: [PATCH 04/92] Refactor batch CLI into module and tighten regression semantics --- examples/demo_qa/batch.py | 411 +++++++++++++++++++++++++++++++++++ examples/demo_qa/cli.py | 398 ++------------------------------- examples/demo_qa/runner.py | 43 +++- tests/test_demo_qa_runner.py | 2 +- 4 files changed, 466 insertions(+), 388 deletions(-) create mode 100644 examples/demo_qa/batch.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py new file mode 100644 index 0000000..0f002a6 --- /dev/null +++ b/examples/demo_qa/batch.py @@ -0,0 +1,411 @@ +from __future__ import annotations + +import datetime +import hashlib +import json +import sys +import uuid +from pathlib import Path +from typing import Iterable, Mapping, Optional + +from .llm.factory import build_llm +from .logging_config import configure_logging +from .provider_factory import build_provider +from .runner import ( + Case, + RunResult, + build_agent, + compare_results, + format_status_line, + load_cases, + load_results, + run_one, + summarize, +) +from .settings import load_settings + + +def write_results(out_path: Path, results: Iterable[RunResult]) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open("w", encoding="utf-8") as f: + for res in results: + f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") + + +def write_summary(out_path: Path, summary: dict) -> Path: + summary_path = out_path.with_name("summary.json") + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return summary_path + + +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + bad = {"error", "failed", "mismatch"} + unchecked = {"unchecked", "plan_only"} + if require_assert: + bad |= unchecked + if fail_on == "error": + bad = {"error"} + elif fail_on == "mismatch": + bad = {"mismatch"} + elif fail_on == 
"unchecked": + bad |= unchecked + elif fail_on == "bad": + bad = {"error", "failed", "mismatch"} + if require_assert: + bad |= unchecked + elif fail_on == "any": + bad |= unchecked + elif fail_on == "skipped": + bad |= {"skipped"} + return status in bad + + +def _hash_file(path: Path) -> str: + data = path.read_bytes() + return hashlib.sha256(data).hexdigest() + + +def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: + llm_settings = settings.llm + return { + "base_url": llm_settings.base_url or "https://api.openai.com/v1", + "plan_model": llm_settings.plan_model, + "synth_model": llm_settings.synth_model, + "cases_hash": _hash_file(cases_path), + } + + +def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: + latest_file = artifacts_dir / "runs" / "latest.txt" + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + +def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: + cases_dir = run_path / "cases" + if not cases_dir.exists(): + return None + matches = sorted(cases_dir.glob(f"{case_id}_*")) + if matches: + return matches[-1] + return None + + +def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: + if path is not None: + return path + return _load_latest_run(artifacts_dir) + + +def handle_chat(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + + log_dir = args.log_dir or args.data / ".runs" / "logs" + log_file = configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + llm_settings = settings.llm + llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" + diagnostics = [ + f"LLM endpoint: {llm_endpoint}", + f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", + f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", + f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " + f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", + ] + if args.enable_semantic: + diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") + else: + diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") + + llm = build_llm(settings) + + from .chat_repl import start_repl + + start_repl( + args.data, + args.schema, + llm, + enable_semantic=args.enable_semantic, + log_file=log_file, + diagnostics=diagnostics, + ) + return 0 + + +def _select_cases_for_rerun( + cases: list[Case], + baseline_for_filter: Optional[Mapping[str, RunResult]], + *, + require_assert: bool, + fail_on: str, +) -> list[Case]: + if not baseline_for_filter: + return cases + bad_statuses = {"mismatch", "failed", "error"} + if require_assert or fail_on in {"unchecked", "any"}: + bad_statuses |= {"unchecked", "plan_only"} + target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} + return [case for case in cases if case.id in target_ids] + + +def handle_batch(args) -> int: + started_at = datetime.datetime.utcnow() + run_id = uuid.uuid4().hex[:8] + + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + 
try: + cases = load_cases(args.cases) + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + + baseline_for_filter: Optional[Mapping[str, RunResult]] = None + baseline_for_compare: Optional[Mapping[str, RunResult]] = None + + artifacts_dir = args.artifacts_dir + if artifacts_dir is None: + artifacts_dir = args.data / ".runs" + + baseline_filter_path = args.only_failed_from + if args.only_failed and not baseline_filter_path: + latest = _load_latest_run(artifacts_dir) + if latest: + baseline_filter_path = latest / "results.jsonl" + if baseline_filter_path: + try: + baseline_for_filter = load_results(baseline_filter_path) + except Exception as exc: + print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) + return 2 + + compare_path = args.compare_to + if compare_path is None and args.only_failed and baseline_filter_path: + compare_path = baseline_filter_path + if compare_path: + try: + if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): + baseline_for_compare = baseline_for_filter + else: + baseline_for_compare = load_results(compare_path) + except Exception as exc: + print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) + return 2 + + cases = _select_cases_for_rerun( + cases, baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on + ) + + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + results_path = args.out or (run_folder / "results.jsonl") + artifacts_root = run_folder / "cases" + results_path.parent.mkdir(parents=True, exist_ok=True) + summary_path = results_path.with_name("summary.json") + artifacts_dir.mkdir(parents=True, exist_ok=True) + + log_dir = args.log_dir or args.data / ".runs" / "logs" + configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + results: list[RunResult] = [] + failures = 0 + for case in cases: + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) + results.append(result) + if not args.quiet: + print(format_status_line(result)) + if is_failure(result.status, args.fail_on, args.require_assert): + failures += 1 + if args.fail_fast or (args.max_fails and failures >= args.max_fails): + break + + write_results(results_path, results) + counts = summarize(results) + + results_by_id = {r.id: r for r in results} + diff_block: dict | None = None + baseline_path: Path | None = None + if baseline_for_compare: + baseline_path = args.compare_to or baseline_filter_path + diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) + if baseline_path: + diff["baseline_path"] = str(baseline_path) + diff_block = diff + + failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) + exit_code = 1 if failure_count else 0 + + ended_at = datetime.datetime.utcnow() + duration_ms = int((ended_at - started_at).total_seconds() * 1000) + summary = { + "run_id": run_id, + "started_at": started_at.isoformat() + "Z", + "ended_at": ended_at.isoformat() + "Z", + "duration_ms": duration_ms, + "counts": counts, + "exit_code": exit_code, + "config_fingerprint": build_config_fingerprint(settings, args.cases), + "results_path": 
str(results_path), + "require_assert": args.require_assert, + "fail_on": args.fail_on, + } + if diff_block: + summary["diff"] = diff_block + + summary_path = write_summary(results_path, summary) + + latest_path = run_folder.parent / "latest.txt" + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + + bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) + unchecked = counts.get("unchecked", 0) + plan_only = counts.get("plan_only", 0) + if args.require_assert or args.fail_on in {"unchecked", "any"}: + bad_count += unchecked + plan_only + summary_line = ( + f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " + f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked(no-assert): {unchecked} | " + f"Plan-only: {plan_only} | BAD: {bad_count} | Skipped: {counts.get('skipped', 0)}" + ) + + if args.quiet: + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + return exit_code + + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + + failures_list: dict[str, RunResult] = {} + for res in results: + if is_failure(res.status, args.fail_on, args.require_assert): + failures_list[res.id] = res + if failures_list: + print(f"Failures (top {args.show_failures}):") + for res in list(failures_list.values())[: args.show_failures]: + reason = res.reason or res.error or "" + repro = ( + f"python -m examples.demo_qa.cli case run {res.id} --cases {args.cases} --data {args.data} " + f"--schema {args.schema}" + (" --plan-only" if args.plan_only else "") + ) + print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") + if args.show_artifacts: + print(f" artifacts: {res.artifacts_dir}") + print(f" repro: {repro}") + + print(f"Results written to: {results_path}") + print(f"Summary written to: {summary_path}") + + return exit_code + + +def handle_case_run(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + try: + cases = {c.id: c for c in load_cases(args.cases)} + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + if args.case_id not in cases: + print(f"Case {args.case_id} not found in {args.cases}", file=sys.stderr) + return 2 + + artifacts_dir = args.artifacts_dir or (args.data / ".runs") + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + artifacts_root = run_folder / "cases" + results_path = run_folder / "results.jsonl" + + log_dir = artifacts_dir / "logs" + configure_logging(level="INFO", log_dir=log_dir, to_stderr=True, jsonl=False, run_id=None) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) + write_results(results_path, [result]) + 
save_path = run_folder.parent / "latest.txt" + save_path.parent.mkdir(parents=True, exist_ok=True) + save_path.write_text(str(run_folder), encoding="utf-8") + + print(format_status_line(result)) + print(f"Artifacts: {result.artifacts_dir}") + return 0 + + +def handle_case_open(args) -> int: + artifacts_dir = args.artifacts_dir or (args.data / ".runs") + run_path = _resolve_run_path(args.run, artifacts_dir) + if not run_path: + print("No run found. Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) + return 2 + case_dir = _find_case_artifact(run_path, args.case_id) + if not case_dir: + print(f"Case {args.case_id} not found under {run_path}", file=sys.stderr) + return 2 + print(f"Case {args.case_id} artifacts: {case_dir}") + plan = case_dir / "plan.json" + answer = case_dir / "answer.txt" + status = case_dir / "status.json" + for path in [plan, answer, status]: + if path.exists(): + print(f"- {path}") + return 0 + + +__all__ = [ + "handle_batch", + "handle_case_open", + "handle_case_run", + "handle_chat", + "is_failure", + "write_results", + "write_summary", + "_load_latest_run", + "_find_case_artifact", + "build_config_fingerprint", +] diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index f2ae789..dec73ff 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -1,388 +1,26 @@ from __future__ import annotations import argparse -import datetime -import hashlib -import json import sys -import uuid from pathlib import Path -from typing import Iterable, Mapping, Optional ROOT = Path(__file__).resolve().parents[2] SRC = ROOT / "src" -if str(SRC) not in sys.path: - sys.path.insert(0, str(SRC)) -from .chat_repl import start_repl -from .data_gen import generate_and_save -from .llm.factory import build_llm -from .logging_config import configure_logging -from .provider_factory import build_provider -from .runner import ( - RunResult, - build_agent, - compare_results, - format_status_line, - load_cases, - load_results, - run_one, - summarize, -) -from .settings import load_settings +def ensure_repo_imports() -> None: + """Ensure local src/ is on sys.path for demo entrypoints.""" + if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) -def write_results(out_path: Path, results: Iterable[RunResult]) -> None: - out_path.parent.mkdir(parents=True, exist_ok=True) - with out_path.open("w", encoding="utf-8") as f: - for res in results: - f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") +ensure_repo_imports() -def write_summary(out_path: Path, summary: dict) -> Path: - summary_path = out_path.with_name("summary.json") - summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") - return summary_path +from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat # noqa: E402 +from .data_gen import generate_and_save # noqa: E402 -def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: - failure_statuses = {"error", "mismatch", "failed"} - if fail_on == "error": - failure_statuses = {"error"} - elif fail_on == "mismatch": - failure_statuses = {"error", "mismatch", "failed"} - elif fail_on == "unchecked": - failure_statuses = {"error", "mismatch", "failed", "unchecked"} - else: - failure_statuses = {"error", "mismatch", "failed", "unchecked", "skipped"} - if require_assert and status == "unchecked": - return True - return status in failure_statuses - - -def _hash_file(path: Path) -> str: - data = path.read_bytes() - return hashlib.sha256(data).hexdigest() - - -def 
build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: - llm_settings = settings.llm - return { - "base_url": llm_settings.base_url or "https://api.openai.com/v1", - "plan_model": llm_settings.plan_model, - "synth_model": llm_settings.synth_model, - "cases_hash": _hash_file(cases_path), - } - - -def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: - latest_file = artifacts_dir / "runs" / "latest.txt" - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - return None - - -def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: - cases_dir = run_path / "cases" - if not cases_dir.exists(): - return None - matches = sorted(cases_dir.glob(f"{case_id}_*")) - if matches: - return matches[-1] - return None - - -def handle_chat(args) -> int: - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - print(f"Configuration error: {exc}", file=sys.stderr) - return 2 - - log_dir = args.log_dir or args.data / ".runs" / "logs" - log_file = configure_logging( - level=args.log_level, - log_dir=log_dir, - to_stderr=args.log_stderr, - jsonl=args.log_jsonl, - run_id=None, - ) - - llm_settings = settings.llm - llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" - diagnostics = [ - f"LLM endpoint: {llm_endpoint}", - f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", - f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", - f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " - f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", - ] - if args.enable_semantic: - diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") - else: - diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") - - llm = build_llm(settings) - - start_repl( - args.data, - args.schema, - llm, - enable_semantic=args.enable_semantic, - log_file=log_file, - diagnostics=diagnostics, - ) - return 0 - - -def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: - if path is not None: - return path - return _load_latest_run(artifacts_dir) - - -def handle_case_run(args) -> int: - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - print(f"Configuration error: {exc}", file=sys.stderr) - return 2 - try: - cases = {c.id: c for c in load_cases(args.cases)} - except Exception as exc: - print(f"Cases error: {exc}", file=sys.stderr) - return 2 - if args.case_id not in cases: - print(f"Case {args.case_id} not found in {args.cases}", file=sys.stderr) - return 2 - - artifacts_dir = args.artifacts_dir or (args.data / ".runs") - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" - artifacts_root = run_folder / "cases" - results_path = run_folder / "results.jsonl" - - log_dir = artifacts_dir / "logs" - configure_logging(level="INFO", log_dir=log_dir, to_stderr=True, jsonl=False, run_id=None) - - provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) - llm = build_llm(settings) - runner = build_agent(llm, provider) - - result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) - write_results(results_path, [result]) - save_path = run_folder.parent / 
"latest.txt" - save_path.parent.mkdir(parents=True, exist_ok=True) - save_path.write_text(str(run_folder), encoding="utf-8") - - print(format_status_line(result)) - print(f"Artifacts: {result.artifacts_dir}") - return 0 - - -def handle_case_open(args) -> int: - artifacts_dir = args.artifacts_dir or Path(".") / ".runs" - run_path = _resolve_run_path(args.run, artifacts_dir) - if not run_path: - print("No run found. Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) - return 2 - case_dir = _find_case_artifact(run_path, args.case_id) - if not case_dir: - print(f"Case {args.case_id} not found under {run_path}", file=sys.stderr) - return 2 - print(f"Case {args.case_id} artifacts: {case_dir}") - plan = case_dir / "plan.json" - answer = case_dir / "answer.txt" - status = case_dir / "status.json" - for path in [plan, answer, status]: - if path.exists(): - print(f"- {path}") - return 0 - - -def handle_batch(args) -> int: - started_at = datetime.datetime.utcnow() - run_id = uuid.uuid4().hex[:8] - - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - print(f"Configuration error: {exc}", file=sys.stderr) - return 2 - try: - cases = load_cases(args.cases) - except Exception as exc: - print(f"Cases error: {exc}", file=sys.stderr) - return 2 - - baseline_for_filter: Optional[Mapping[str, RunResult]] = None - baseline_for_compare: Optional[Mapping[str, RunResult]] = None - - baseline_filter_path = args.only_failed_from - if args.only_failed and not baseline_filter_path: - latest = _load_latest_run(args.artifacts_dir or args.data / ".runs") - if latest: - baseline_filter_path = latest / "results.jsonl" - if baseline_filter_path: - try: - baseline_for_filter = load_results(baseline_filter_path) - except Exception as exc: - print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) - return 2 - - compare_path = args.compare_to - if compare_path is None and args.only_failed and baseline_filter_path: - compare_path = baseline_filter_path - if compare_path: - try: - if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): - baseline_for_compare = baseline_for_filter - else: - baseline_for_compare = load_results(compare_path) - except Exception as exc: - print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) - return 2 - - if baseline_for_filter: - bad_statuses = {"mismatch", "failed", "error"} - if args.require_assert or args.fail_on == "unchecked": - bad_statuses.add("unchecked") - target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} - cases = [case for case in cases if case.id in target_ids] - - artifacts_dir = args.artifacts_dir - if artifacts_dir is None: - artifacts_dir = args.data / ".runs" - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" - results_path = args.out or (run_folder / "results.jsonl") - artifacts_root = run_folder / "cases" - results_path.parent.mkdir(parents=True, exist_ok=True) - summary_path = results_path.with_name("summary.json") - artifacts_dir.mkdir(parents=True, exist_ok=True) - - log_dir = args.log_dir or args.data / ".runs" / "logs" - configure_logging( - level=args.log_level, - log_dir=log_dir, - to_stderr=args.log_stderr, - jsonl=args.log_jsonl, - run_id=None, - ) - - provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) - llm = build_llm(settings) - runner = 
build_agent(llm, provider) - - results: list[RunResult] = [] - failures = 0 - for case in cases: - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) - results.append(result) - if not args.quiet: - print(format_status_line(result)) - if is_failure(result.status, args.fail_on, args.require_assert): - failures += 1 - if args.fail_fast or (args.max_fails and failures >= args.max_fails): - break - - write_results(results_path, results) - counts = summarize(results) - - results_by_id = {r.id: r for r in results} - diff_block: dict | None = None - baseline_path: Path | None = None - if baseline_for_compare: - baseline_path = args.compare_to or args.only_failed_from - diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) - if baseline_path: - diff["baseline_path"] = str(baseline_path) - diff_block = diff - - failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) - exit_code = 1 if failure_count else 0 - - ended_at = datetime.datetime.utcnow() - duration_ms = int((ended_at - started_at).total_seconds() * 1000) - summary = { - "run_id": run_id, - "started_at": started_at.isoformat() + "Z", - "ended_at": ended_at.isoformat() + "Z", - "duration_ms": duration_ms, - "counts": counts, - "exit_code": exit_code, - "config_fingerprint": build_config_fingerprint(settings, args.cases), - "results_path": str(results_path), - "require_assert": args.require_assert, - "fail_on": args.fail_on, - } - if diff_block: - summary["diff"] = diff_block - - summary_path = write_summary(results_path, summary) - - latest_path = run_folder.parent / "latest.txt" - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - - bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) - unchecked = counts.get("unchecked", 0) - if args.require_assert: - bad_count += unchecked - summary_line = ( - f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " - f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked OK: {counts.get('unchecked_ok', 0)} | " - f"BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" - ) - - if args.quiet: - print(summary_line) - if diff_block: - print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" - ) - return exit_code - - print(summary_line) - if diff_block: - print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" - ) - - failures_list: dict[str, RunResult] = {} - for res in results: - if is_failure(res.status, args.fail_on, args.require_assert) or ( - args.require_assert and res.status == "unchecked" - ): - failures_list[res.id] = res - if failures_list: - print(f"Failures (top {args.show_failures}):") - for res in list(failures_list.values())[: args.show_failures]: - reason = res.reason or res.error or "" - repro = ( - f"demo_qa case run {res.id} --cases {args.cases} --data {args.data} " - f"--schema {args.schema}" + (" --plan-only" if args.plan_only else "") - ) - print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") - if args.show_artifacts: - 
print(f" artifacts: {res.artifacts_dir}") - print(f" repro: {repro}") - - print(f"Results written to: {results_path}") - print(f"Summary written to: {summary_path}") - - return exit_code - - -def main() -> None: +def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Demo QA harness for fetchgraph") sub = parser.add_subparsers(dest="command", required=True) @@ -418,8 +56,8 @@ def main() -> None: batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") batch_p.add_argument( "--fail-on", - choices=["error", "mismatch", "unchecked", "any"], - default="mismatch", + choices=["error", "mismatch", "bad", "unchecked", "any", "skipped"], + default="bad", help="Which statuses should cause a failing exit code", ) batch_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures") @@ -436,8 +74,10 @@ def main() -> None: batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") - case_p = sub.add_parser("case", help="Single-case utilities") - case_sub = case_p.add_subparsers(dest="case_command", required=True) + case_p = sub.add_subparsers(dest="case_command") + case_root = sub.add_parser("case", help="Single-case utilities") + case_sub = case_root.add_subparsers(dest="case_command", required=True) + case_run = case_sub.add_parser("run", help="Run a single case by id") case_run.add_argument("case_id") case_run.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") @@ -450,9 +90,17 @@ def main() -> None: case_open = case_sub.add_parser("open", help="Show artifacts for a case in a run folder") case_open.add_argument("case_id") + case_open.add_argument("--data", type=Path, required=True) case_open.add_argument("--run", type=Path, default=None, help="Run folder (defaults to latest)") - case_open.add_argument("--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup") + case_open.add_argument( + "--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup (default data/.runs)" + ) + return parser + + +def main() -> None: + parser = build_parser() args = parser.parse_args() if args.command == "gen": diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index d9ad5f5..14321a2 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -219,7 +219,7 @@ def _build_result( reason = expected_check.detail details = {"expected_check": expected_check.__dict__} else: - status = "unchecked" + status = "plan_only" if artifacts.plan_only else "unchecked" reason = "plan-only" if artifacts.plan_only else "no expectations provided" details = {"note": reason} @@ -276,11 +276,12 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: def summarize(results: Iterable[RunResult]) -> Dict[str, object]: - totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0} + totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0, "plan_only": 0} total_times: List[float] = [] checked_total = 0 checked_ok = 0 - unchecked_ok = 0 + unchecked_no_assert = 0 + plan_only = 0 for res in results: totals[res.status] = totals.get(res.status, 0) + 1 if res.duration_ms is not None: @@ -290,13 +291,16 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: if res.status == "ok" and res.checked: 
checked_ok += 1 if res.status == "unchecked": - unchecked_ok += 1 + unchecked_no_assert += 1 + if res.status == "plan_only": + plan_only += 1 summary: Dict[str, object] = { "total": sum(totals.values()), "checked_total": checked_total, "checked_ok": checked_ok, - "unchecked_ok": unchecked_ok, + "unchecked_no_assert": unchecked_no_assert, + "plan_only": plan_only, **totals, } if total_times: @@ -312,6 +316,7 @@ def load_cases(path: Path) -> List[Case]: if not path.exists(): raise FileNotFoundError(f"Cases file not found: {path}") cases: List[Case] = [] + seen_ids: set[str] = set() with path.open("r", encoding="utf-8") as f: for lineno, line in enumerate(f, start=1): line = line.strip() @@ -323,12 +328,26 @@ def load_cases(path: Path) -> List[Case]: raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc if "id" not in payload or "question" not in payload: raise ValueError(f"Case on line {lineno} missing required fields 'id' and 'question'") + case_id = str(payload["id"]) + if case_id in seen_ids: + raise ValueError(f"Duplicate case id {case_id!r} on line {lineno}") + seen_ids.add(case_id) + expected = payload.get("expected") + expected_regex = payload.get("expected_regex") + expected_contains = payload.get("expected_contains") + for field_name, val in [ + ("expected", expected), + ("expected_regex", expected_regex), + ("expected_contains", expected_contains), + ]: + if val is not None and str(val).strip() == "": + raise ValueError(f"{field_name} must not be empty on line {lineno}") case = Case( - id=str(payload["id"]), + id=case_id, question=str(payload["question"]), - expected=payload.get("expected"), - expected_regex=payload.get("expected_regex"), - expected_contains=payload.get("expected_contains"), + expected=expected, + expected_regex=expected_regex, + expected_contains=expected_contains, tags=list(payload.get("tags", []) or []), skip=bool(payload.get("skip", False)), ) @@ -426,7 +445,7 @@ def _bucket(status: str, checked: bool, require_assert: bool) -> str: return "OK" if checked else "UNCHECKED" if status in {"mismatch", "failed", "error"}: return "BAD" - if status == "unchecked": + if status in {"unchecked", "plan_only"}: return "BAD" if require_assert else "UNCHECKED" return "NEUTRAL" @@ -489,8 +508,8 @@ def format_status_line(result: RunResult) -> str: return f"OK {result.id} {timing}" if result.status == "skipped": return f"SKIP {result.id}" - if result.status == "unchecked": - return f"UNCHECKED {result.id} {timing}" + if result.status in {"unchecked", "plan_only"}: + return f"{result.status.upper()} {result.id} {timing}" reason = result.reason or "" return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index d83c562..337c6d2 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -147,5 +147,5 @@ def test_summarize_counts_checked_and_unchecked() -> None: summary = summarize(results) assert summary["checked_ok"] == 1 - assert summary["unchecked_ok"] == 1 # counts unchecked separately + assert summary["unchecked_no_assert"] == 1 # counts unchecked separately assert summary["checked_total"] == 2 From 84451eaca3ceb45258844b52b51bf7d610562a1a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:19:41 +0300 Subject: [PATCH 05/92] Fix case command parser construction --- examples/demo_qa/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/demo_qa/cli.py 
b/examples/demo_qa/cli.py index dec73ff..b1e8119 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -74,7 +74,6 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") - case_p = sub.add_subparsers(dest="case_command") case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) From b921779d511bfa24020c18400ea6740d63bed5ad Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:38:10 +0300 Subject: [PATCH 06/92] Honor custom result paths when rerunning failed cases --- examples/demo_qa/batch.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 0f002a6..094fcda 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -84,6 +84,15 @@ def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: return None +def _load_latest_results(artifacts_dir: Path) -> Optional[Path]: + latest_file = artifacts_dir / "runs" / "latest_results.txt" + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: cases_dir = run_path / "cases" if not cases_dir.exists(): @@ -185,9 +194,15 @@ def handle_batch(args) -> int: baseline_filter_path = args.only_failed_from if args.only_failed and not baseline_filter_path: - latest = _load_latest_run(artifacts_dir) - if latest: - baseline_filter_path = latest / "results.jsonl" + latest_results = _load_latest_results(artifacts_dir) + if latest_results: + baseline_filter_path = latest_results + else: + latest_run = _load_latest_run(artifacts_dir) + if latest_run: + candidate = latest_run / "results.jsonl" + if candidate.exists(): + baseline_filter_path = candidate if baseline_filter_path: try: baseline_for_filter = load_results(baseline_filter_path) @@ -281,8 +296,10 @@ def handle_batch(args) -> int: summary_path = write_summary(results_path, summary) latest_path = run_folder.parent / "latest.txt" + latest_results_path = run_folder.parent / "latest_results.txt" latest_path.parent.mkdir(parents=True, exist_ok=True) latest_path.write_text(str(run_folder), encoding="utf-8") + latest_results_path.write_text(str(results_path), encoding="utf-8") bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) From 1adb196d312bbfd3fcb0b04cf0c097d2de7209cf Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:50:34 +0300 Subject: [PATCH 07/92] Keep errors failing when using --fail-on mismatch --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 094fcda..e8f7b61 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -46,7 +46,7 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: if fail_on == "error": bad = {"error"} elif fail_on == "mismatch": - bad = {"mismatch"} + bad = {"error", "failed", "mismatch"} elif fail_on == "unchecked": bad |= unchecked elif fail_on == 
"bad": From 9be87f2c67be493aefee7f86499f4f7267dcaa1d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 17:09:17 +0300 Subject: [PATCH 08/92] Respect --require-assert when fail_on is mismatch/error --- examples/demo_qa/batch.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index e8f7b61..5667cce 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -39,10 +39,8 @@ def write_summary(out_path: Path, summary: dict) -> Path: def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: - bad = {"error", "failed", "mismatch"} unchecked = {"unchecked", "plan_only"} - if require_assert: - bad |= unchecked + bad = {"error", "failed", "mismatch"} if fail_on == "error": bad = {"error"} elif fail_on == "mismatch": @@ -51,12 +49,14 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: bad |= unchecked elif fail_on == "bad": bad = {"error", "failed", "mismatch"} - if require_assert: - bad |= unchecked elif fail_on == "any": bad |= unchecked elif fail_on == "skipped": bad |= {"skipped"} + + if require_assert: + bad |= unchecked + return status in bad From b96ec141d9909ba9d48b7d6bb5678246d1efb163 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 17:16:32 +0300 Subject: [PATCH 09/92] Keep plan-only runs from tripping expected checks --- examples/demo_qa/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 14321a2..0a980b9 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -269,7 +269,7 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: artifacts = runner.run_question(case.question, run_id, run_dir, plan_only=plan_only) save_artifacts(artifacts) - expected_check = _match_expected(case, artifacts.answer) + expected_check = None if plan_only else _match_expected(case, artifacts.answer) result = _build_result(case, artifacts, run_dir, expected_check) save_status(result) return result From acf0dc78ab7dc8705efa0a629202ef32cc1be33d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 17:28:33 +0300 Subject: [PATCH 10/92] Align rerun filtering with fail_on modes --- examples/demo_qa/batch.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 5667cce..656a0cf 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -164,7 +164,20 @@ def _select_cases_for_rerun( if not baseline_for_filter: return cases bad_statuses = {"mismatch", "failed", "error"} - if require_assert or fail_on in {"unchecked", "any"}: + if fail_on == "error": + bad_statuses = {"error"} + elif fail_on == "mismatch": + bad_statuses = {"mismatch", "failed", "error"} + elif fail_on == "unchecked": + bad_statuses |= {"unchecked", "plan_only"} + elif fail_on == "bad": + bad_statuses = {"mismatch", "failed", "error"} + elif fail_on == "any": + bad_statuses |= {"unchecked", "plan_only"} + elif fail_on == "skipped": + bad_statuses |= {"skipped"} + + if require_assert: bad_statuses |= {"unchecked", "plan_only"} target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} return [case for case in cases if 
case.id in target_ids] From 6f18d39840a27005f1459f512d55ba4be049bf0c Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:16:46 +0300 Subject: [PATCH 11/92] Fallback to summary results path for latest reruns --- examples/demo_qa/batch.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 656a0cf..dc5fbe0 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -90,6 +90,17 @@ def _load_latest_results(artifacts_dir: Path) -> Optional[Path]: content = latest_file.read_text(encoding="utf-8").strip() if content: return Path(content) + latest_run = _load_latest_run(artifacts_dir) + if latest_run: + summary_path = latest_run / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + results_path = summary.get("results_path") + if results_path: + return Path(results_path) + except Exception: + pass return None From b204997f7e8f1a0237d9636bc34d6ae248dd9b40 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:25:41 +0300 Subject: [PATCH 12/92] Add run registry metadata and stats command --- examples/demo_qa/batch.py | 157 ++++++++++++++++++++++++++++++++++++++ examples/demo_qa/cli.py | 11 ++- 2 files changed, 167 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index dc5fbe0..b6e0ac5 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -3,6 +3,8 @@ import datetime import hashlib import json +import platform +import subprocess import sys import uuid from pathlib import Path @@ -38,6 +40,15 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path +def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: + total = int(counts.get("total", 0) or 0) + skipped = int(counts.get("skipped", 0) or 0) + denom = total - skipped + if denom <= 0: + return None + return (counts.get("ok", 0) or 0) / denom + + def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: unchecked = {"unchecked", "plan_only"} bad = {"error", "failed", "mismatch"} @@ -75,6 +86,30 @@ def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object] } +def _fingerprint_dir(data_dir: Path) -> Mapping[str, object]: + files: list[dict] = [] + for path in sorted(data_dir.rglob("*")): + if path.is_file(): + stat = path.stat() + files.append( + { + "path": str(path.relative_to(data_dir)), + "size": stat.st_size, + "mtime": stat.st_mtime, + } + ) + digest = hashlib.sha256(json.dumps(files, sort_keys=True).encode("utf-8")).hexdigest() + return {"hash": digest, "files": files} + + +def _git_sha() -> Optional[str]: + try: + result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) + except Exception: + return None + return result.stdout.strip() or None + + def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: latest_file = artifacts_dir / "runs" / "latest.txt" if latest_file.exists(): @@ -258,6 +293,7 @@ def handle_batch(args) -> int: results_path.parent.mkdir(parents=True, exist_ok=True) summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) + history_path = args.history or (args.data / ".runs" / "history.jsonl") log_dir = args.log_dir or args.data / ".runs" / "logs" configure_logging( @@ -325,6 +361,61 @@ def 
handle_batch(args) -> int: latest_path.write_text(str(run_folder), encoding="utf-8") latest_results_path.write_text(str(results_path), encoding="utf-8") + config_hash = _hash_file(args.config) if args.config else None + schema_hash = _hash_file(args.schema) + cases_hash = _hash_file(args.cases) + data_fingerprint = _fingerprint_dir(args.data) + llm_settings = settings.llm + run_meta = { + "run_id": run_id, + "timestamp": started_at.isoformat() + "Z", + "cases_path": str(args.cases), + "cases_hash": cases_hash, + "config_path": str(args.config) if args.config else None, + "config_hash": config_hash, + "schema_path": str(args.schema), + "schema_hash": schema_hash, + "data_dir": str(args.data), + "data_fingerprint": data_fingerprint, + "llm": { + "plan_model": llm_settings.plan_model, + "synth_model": llm_settings.synth_model, + "plan_temperature": llm_settings.plan_temperature, + "synth_temperature": llm_settings.synth_temperature, + "base_url": llm_settings.base_url or "https://api.openai.com/v1", + }, + "enable_semantic": args.enable_semantic, + "embedding_model": None, + "git_sha": _git_sha(), + "python_version": sys.version, + "platform": platform.platform(), + "results_path": str(results_path), + "summary_path": str(summary_path), + "run_dir": str(run_folder), + } + (run_folder / "run_meta.json").write_text(json.dumps(run_meta, ensure_ascii=False, indent=2), encoding="utf-8") + + prate = _pass_rate(counts) + history_entry = { + "run_id": run_id, + "timestamp": started_at.isoformat() + "Z", + "config_hash": config_hash, + "schema_hash": schema_hash, + "cases_hash": cases_hash, + "ok": counts.get("ok", 0), + "mismatch": counts.get("mismatch", 0), + "error": counts.get("error", 0), + "skipped": counts.get("skipped", 0), + "pass_rate": prate, + "avg_total_s": counts.get("avg_total_s"), + "median_total_s": counts.get("median_total_s"), + "run_dir": str(run_folder), + "results_path": str(results_path), + } + history_path.parent.mkdir(parents=True, exist_ok=True) + with history_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(history_entry, ensure_ascii=False) + "\n") + bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) plan_only = counts.get("plan_only", 0) @@ -438,6 +529,72 @@ def handle_case_open(args) -> int: return 0 +def _load_history(history_path: Path) -> list[dict]: + if not history_path.exists(): + return [] + entries: list[dict] = [] + with history_path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + entries.append(json.loads(line)) + except json.JSONDecodeError: + continue + return entries + + +def _print_stats(entries: list[dict]) -> None: + if not entries: + print("No history entries found.") + return + header = f"{'run_id':<10} {'ok':>4} {'mis':>4} {'err':>4} {'skip':>5} {'pass%':>7} {'median_s':>10} {'Δpass':>8} {'Δmedian':>9}" + print(header) + prev = None + for entry in entries: + pass_rate = entry.get("pass_rate") + median = entry.get("median_total_s") + delta_pass = None + delta_median = None + if prev: + if pass_rate is not None and prev.get("pass_rate") is not None: + delta_pass = pass_rate - prev.get("pass_rate") + if median is not None and prev.get("median_total_s") is not None: + delta_median = median - prev.get("median_total_s") + pr_display = f"{pass_rate*100:.1f}%" if pass_rate is not None else "n/a" + median_display = f"{median:.2f}" if median is not None else "n/a" + dp = f"{delta_pass:+.1f}%" if delta_pass is not None 
else "n/a" + dm = f"{delta_median:+.2f}" if delta_median is not None else "n/a" + print( + f"{entry.get('run_id',''):<10} " + f"{entry.get('ok',0):>4} {entry.get('mismatch',0):>4} {entry.get('error',0):>4} {entry.get('skipped',0):>5} " + f"{pr_display:>7} {median_display:>10} {dp:>8} {dm:>9}" + ) + prev = entry + + +def handle_stats(args) -> int: + history_path: Optional[Path] = args.history + if history_path is None: + if not args.data: + print("Provide --data or --history to locate history.jsonl", file=sys.stderr) + return 2 + history_path = args.data / ".runs" / "history.jsonl" + entries = _load_history(history_path) + if args.group_by == "config_hash": + grouped: dict[str, list[dict]] = {} + for e in entries: + key = e.get("config_hash") or "unknown" + grouped.setdefault(key, []).append(e) + for key, vals in grouped.items(): + print(f"\nconfig_hash={key}") + _print_stats(vals[-args.last :]) + else: + _print_stats(entries[-args.last :]) + return 0 + + __all__ = [ "handle_batch", "handle_case_open", diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index b1e8119..fc6eaa8 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -16,7 +16,7 @@ def ensure_repo_imports() -> None: ensure_repo_imports() -from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat # noqa: E402 +from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat, handle_stats # noqa: E402 from .data_gen import generate_and_save # noqa: E402 @@ -73,6 +73,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") + batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) @@ -95,6 +96,12 @@ def build_parser() -> argparse.ArgumentParser: "--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup (default data/.runs)" ) + stats_p = sub.add_parser("stats", help="Show batch history stats") + stats_p.add_argument("--data", type=Path, default=None, help="Data dir to resolve default history path") + stats_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") + stats_p.add_argument("--last", type=int, default=10, help="How many recent runs to show") + stats_p.add_argument("--group-by", choices=["config_hash"], default=None, help="Group stats by config hash") + return parser @@ -118,6 +125,8 @@ def main() -> None: code = handle_case_open(args) else: code = 1 + elif args.command == "stats": + code = handle_stats(args) else: code = 0 raise SystemExit(code) From 557be567379271b00a885b4d54f735ff9cc48120 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:29:28 +0300 Subject: [PATCH 13/92] Add compare command with markdown and junit outputs --- examples/demo_qa/batch.py | 184 ++++++++++++++++++++++++++++++++++++++ examples/demo_qa/cli.py | 17 +++- 2 files changed, 200 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index b6e0ac5..8a812c1 100644 --- a/examples/demo_qa/batch.py 
+++ b/examples/demo_qa/batch.py @@ -40,6 +40,17 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path +def _median_duration(results: Mapping[str, RunResult]) -> Optional[float]: + durations = [res.duration_ms for res in results.values() if res.duration_ms is not None] + if not durations: + return None + durations.sort() + mid = len(durations) // 2 + if len(durations) % 2 == 1: + return durations[mid] / 1000 + return (durations[mid - 1] + durations[mid]) / 2000 + + def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: total = int(counts.get("total", 0) or 0) skipped = int(counts.get("skipped", 0) or 0) @@ -200,6 +211,166 @@ def handle_chat(args) -> int: return 0 +def _bad_statuses() -> set[str]: + return {"mismatch", "error", "failed"} + + +def _reason(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + if res.expected_check and res.expected_check.detail: + return res.expected_check.detail + return "" + + +def _artifact_links(res: RunResult) -> dict[str, str]: + links = {} + base = Path(res.artifacts_dir) + for name in ["plan.json", "answer.txt", "raw_synth.txt", "status.json"]: + path = base / name + if path.exists(): + links[name] = str(path) + return links + + +def compare_runs(base_path: Path, new_path: Path) -> dict[str, object]: + base = load_results(base_path) + new = load_results(new_path) + bad = _bad_statuses() + + new_fail: list[dict] = [] + fixed: list[dict] = [] + still_fail: list[dict] = [] + + for case_id, new_res in new.items(): + old_res = base.get(case_id) + if old_res is None: + continue + old_bad = old_res.status in bad + new_bad = new_res.status in bad + if not old_bad and new_bad: + new_fail.append( + { + "id": case_id, + "from": old_res.status, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + ) + elif old_bad and not new_bad: + fixed.append( + { + "id": case_id, + "from": old_res.status, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + ) + elif old_bad and new_bad: + still_fail.append( + { + "id": case_id, + "from": old_res.status, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + ) + + base_counts = summarize(base.values()) + new_counts = summarize(new.values()) + base_med = _median_duration(base) + new_med = _median_duration(new) + base_avg = base_counts.get("avg_total_s") + new_avg = new_counts.get("avg_total_s") + return { + "new_fail": new_fail, + "fixed": fixed, + "still_fail": still_fail, + "base_counts": base_counts, + "new_counts": new_counts, + "base_median": base_med, + "new_median": new_med, + "base_avg": base_avg, + "new_avg": new_avg, + } + + +def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str: + lines: list[str] = [] + base_counts = compare["base_counts"] # type: ignore[index] + new_counts = compare["new_counts"] # type: ignore[index] + lines.append("# Batch comparison report") + lines.append("") + lines.append("## Summary") + lines.append( + f"- Base OK: {base_counts.get('ok',0)}, Bad: {base_counts.get('mismatch',0)+base_counts.get('error',0)+base_counts.get('failed',0)}" + ) + lines.append( + f"- New OK: {new_counts.get('ok',0)}, Bad: {new_counts.get('mismatch',0)+new_counts.get('error',0)+new_counts.get('failed',0)}" + ) + base_med = compare.get("base_median") + new_med = compare.get("new_median") + if base_med is not None and new_med is not None: + lines.append(f"- Median total time: 
base {base_med:.2f}s → new {new_med:.2f}s (Δ {new_med - base_med:+.2f}s)") + lines.append("") + + def table(title: str, rows: list[dict]) -> None: + lines.append(f"## {title}") + if not rows: + lines.append("None") + lines.append("") + return + lines.append("| id | status | reason | artifacts |") + lines.append("|---|---|---|---|") + for row in rows: + artifacts = row.get("artifacts", {}) + links = ", ".join(f"[{k}]({v})" for k, v in artifacts.items()) + lines.append( + f"| {row['id']} | {row['from']} → {row['to']} | {row.get('reason','')} | {links or ''} |" + ) + lines.append("") + + table("New regressions", compare["new_fail"]) # type: ignore[arg-type] + table("Fixed", compare["fixed"]) # type: ignore[arg-type] + table("Still failing", compare["still_fail"]) # type: ignore[arg-type] + + content = "\n".join(lines) + if out_path: + out_path.write_text(content, encoding="utf-8") + return content + + +def write_junit(compare: dict[str, object], out_path: Path) -> None: + import xml.etree.ElementTree as ET + + suite = ET.Element("testsuite", name="demo_qa_compare") + bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] + fixed = compare["fixed"] # type: ignore[assignment] + cases = compare["new_counts"].get("total", 0) if isinstance(compare.get("new_counts"), dict) else 0 + suite.set("tests", str(cases)) + suite.set("failures", str(len(bad))) + suite.set("errors", "0") + + for row in bad: + tc = ET.SubElement(suite, "testcase", name=row["id"]) + msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" + failure = ET.SubElement(tc, "failure", message=msg) + artifacts = row.get("artifacts", {}) + if artifacts: + failure.text = "\n".join(f"{k}: {v}" for k, v in artifacts.items()) + + for row in fixed: + ET.SubElement(suite, "testcase", name=row["id"]) + + tree = ET.ElementTree(suite) + out_path.write_text(ET.tostring(suite, encoding="unicode"), encoding="utf-8") + + def _select_cases_for_rerun( cases: list[Case], baseline_for_filter: Optional[Mapping[str, RunResult]], @@ -595,6 +766,19 @@ def handle_stats(args) -> int: return 0 +def handle_compare(args) -> int: + if not args.base.exists() or not args.new.exists(): + print("Base or new results file not found.", file=sys.stderr) + return 2 + comparison = compare_runs(args.base, args.new) + report = render_markdown(comparison, args.out) + print(report) + if args.junit: + write_junit(comparison, args.junit) + print(f"JUnit written to {args.junit}") + return 0 + + __all__ = [ "handle_batch", "handle_case_open", diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index fc6eaa8..0c98f21 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -16,7 +16,14 @@ def ensure_repo_imports() -> None: ensure_repo_imports() -from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat, handle_stats # noqa: E402 +from .batch import ( + handle_batch, + handle_case_open, + handle_case_run, + handle_chat, + handle_compare, + handle_stats, +) # noqa: E402 from .data_gen import generate_and_save # noqa: E402 @@ -102,6 +109,12 @@ def build_parser() -> argparse.ArgumentParser: stats_p.add_argument("--last", type=int, default=10, help="How many recent runs to show") stats_p.add_argument("--group-by", choices=["config_hash"], default=None, help="Group stats by config hash") + compare_p = sub.add_parser("compare", help="Compare two batch result files") + compare_p.add_argument("--base", type=Path, required=True, help="Path to baseline results.jsonl") + compare_p.add_argument("--new", type=Path, 
required=True, help="Path to new results.jsonl") + compare_p.add_argument("--out", type=Path, default=None, help="Path to markdown report to write") + compare_p.add_argument("--junit", type=Path, default=None, help="Path to junit xml output") + return parser @@ -127,6 +140,8 @@ def main() -> None: code = 1 elif args.command == "stats": code = handle_stats(args) + elif args.command == "compare": + code = handle_compare(args) else: code = 0 raise SystemExit(code) From db65975ae286d7a1a4d0056add467dc451ce9b55 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:33:07 +0300 Subject: [PATCH 14/92] Add tags to results and summary by tag --- examples/demo_qa/runner.py | 32 ++++++++++++++++++++++++++++++++ tests/test_demo_qa_runner.py | 10 ++++++++++ 2 files changed, 42 insertions(+) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 0a980b9..77f6815 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -54,6 +54,7 @@ class RunResult: details: Dict[str, object] | None artifacts_dir: str duration_ms: int + tags: list[str] answer: str | None = None error: str | None = None plan_path: str | None = None @@ -70,6 +71,7 @@ def to_json(self) -> Dict[str, object]: "details": self.details, "artifacts_dir": self.artifacts_dir, "duration_ms": self.duration_ms, + "tags": self.tags, "answer": self.answer, "error": self.error, "plan_path": self.plan_path, @@ -234,6 +236,7 @@ def _build_result( details=details, artifacts_dir=str(run_dir), duration_ms=duration_ms, + tags=list(case.tags), answer=artifacts.answer, error=artifacts.error, plan_path=plan_path, @@ -282,6 +285,7 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: checked_ok = 0 unchecked_no_assert = 0 plan_only = 0 + per_tag: Dict[str, Dict[str, object]] = {} for res in results: totals[res.status] = totals.get(res.status, 0) + 1 if res.duration_ms is not None: @@ -294,6 +298,12 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: unchecked_no_assert += 1 if res.status == "plan_only": plan_only += 1 + for tag in res.tags: + bucket = per_tag.setdefault( + tag, {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0, "plan_only": 0} + ) + bucket[res.status] = bucket.get(res.status, 0) + 1 + bucket["total"] = bucket.get("total", 0) + 1 summary: Dict[str, object] = { "total": sum(totals.values()), @@ -301,6 +311,7 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: "checked_ok": checked_ok, "unchecked_no_assert": unchecked_no_assert, "plan_only": plan_only, + "summary_by_tag": per_tag, **totals, } if total_times: @@ -309,6 +320,26 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: else: summary["avg_total_s"] = None summary["median_total_s"] = None + + for tag, bucket in per_tag.items(): + times: List[float] = [] + # no per-tag timing collected; reuse overall average for simplicity + if times: + bucket["avg_total_s"] = statistics.fmean(times) + bucket["median_total_s"] = statistics.median(times) + else: + bucket["avg_total_s"] = None + bucket["median_total_s"] = None + total = bucket.get("total", 0) + checked_total_tag = (bucket.get("ok", 0) or 0) + (bucket.get("mismatch", 0) or 0) + ( + bucket.get("failed", 0) or 0 + ) + bucket["checked_total"] = checked_total_tag + non_skipped = total - (bucket.get("skipped", 0) or 0) + if non_skipped > 0: + bucket["pass_rate"] = (bucket.get("ok", 0) or 0) / non_skipped + else: + bucket["pass_rate"] = None 
return summary @@ -414,6 +445,7 @@ def _run_result_from_payload(payload: Mapping[str, object]) -> RunResult: details=details, artifacts_dir=artifacts_dir, duration_ms=duration_ms, + tags=list(payload.get("tags", []) or []), answer=payload.get("answer"), # type: ignore[arg-type] error=payload.get("error"), # type: ignore[arg-type] plan_path=payload.get("plan_path"), # type: ignore[arg-type] diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 337c6d2..ad14b00 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -37,6 +37,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok", duration_ms=10, + tags=[], ), "err_to_ok": RunResult( id="err_to_ok", @@ -47,6 +48,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/err", duration_ms=10, + tags=[], ), "checked_to_unchecked": RunResult( id="checked_to_unchecked", @@ -57,6 +59,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok2", duration_ms=10, + tags=[], ), } @@ -70,6 +73,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok", duration_ms=10, + tags=[], ), "err_to_ok": RunResult( id="err_to_ok", @@ -80,6 +84,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/err", duration_ms=10, + tags=[], ), "checked_to_unchecked": RunResult( id="checked_to_unchecked", @@ -90,6 +95,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok2", duration_ms=10, + tags=[], ), "new_ok": RunResult( id="new_ok", @@ -100,6 +106,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/new", duration_ms=10, + tags=[], ), } @@ -122,6 +129,7 @@ def test_summarize_counts_checked_and_unchecked() -> None: details=None, artifacts_dir="/a", duration_ms=10, + tags=[], ), RunResult( id="c2", @@ -132,6 +140,7 @@ def test_summarize_counts_checked_and_unchecked() -> None: details=None, artifacts_dir="/b", duration_ms=5, + tags=[], ), RunResult( id="c3", @@ -142,6 +151,7 @@ def test_summarize_counts_checked_and_unchecked() -> None: details=None, artifacts_dir="/c", duration_ms=7, + tags=[], ), ] From 549a1e077729877c112924704b955b09c9d3be7f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:38:49 +0300 Subject: [PATCH 15/92] Add tag filters, events logging, and comparison outputs --- examples/demo_qa/batch.py | 60 ++++++++++++++++++++++-- examples/demo_qa/chat_repl.py | 38 +++++++++++++--- examples/demo_qa/cli.py | 6 +++ examples/demo_qa/runner.py | 86 ++++++++++++++++++++++++++++++++--- 4 files changed, 172 insertions(+), 18 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 8a812c1..cae1ebc 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -15,6 +15,7 @@ from .provider_factory import build_provider from .runner import ( Case, + EventLogger, RunResult, build_agent, compare_results, @@ -87,6 +88,24 @@ def _hash_file(path: Path) -> str: return hashlib.sha256(data).hexdigest() +def _split_csv(value: Optional[str]) -> set[str] | None: + if not value: + return None + return {item.strip() for item in value.split(",") if item.strip()} + + +def _load_ids(path: Optional[Path]) -> set[str] | None: + 
if path is None: + return None + ids = set() + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + ids.add(line) + return ids + + def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: llm_settings = settings.llm return { @@ -377,9 +396,25 @@ def _select_cases_for_rerun( *, require_assert: bool, fail_on: str, + include_tags: set[str] | None, + exclude_tags: set[str] | None, + include_ids: set[str] | None, + exclude_ids: set[str] | None, ) -> list[Case]: + filtered: list[Case] = [] + for case in cases: + tags = set(case.tags) + if include_tags and not tags.intersection(include_tags): + continue + if exclude_tags and tags.intersection(exclude_tags): + continue + if include_ids and case.id not in include_ids: + continue + if exclude_ids and case.id in exclude_ids: + continue + filtered.append(case) if not baseline_for_filter: - return cases + return filtered bad_statuses = {"mismatch", "failed", "error"} if fail_on == "error": bad_statuses = {"error"} @@ -397,7 +432,7 @@ def _select_cases_for_rerun( if require_assert: bad_statuses |= {"unchecked", "plan_only"} target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} - return [case for case in cases if case.id in target_ids] + return [case for case in filtered if case.id in target_ids] def handle_batch(args) -> int: @@ -454,7 +489,14 @@ def handle_batch(args) -> int: return 2 cases = _select_cases_for_rerun( - cases, baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on + cases, + baseline_for_filter, + require_assert=args.require_assert, + fail_on=args.fail_on, + include_tags=_split_csv(args.include_tags), + exclude_tags=_split_csv(args.exclude_tags), + include_ids=_load_ids(args.include_ids), + exclude_ids=_load_ids(args.exclude_ids), ) timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") @@ -478,11 +520,17 @@ def handle_batch(args) -> int: provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) llm = build_llm(settings) runner = build_agent(llm, provider) + events_path = None + if args.events == "on": + events_path = args.events_file or (run_folder / "events.jsonl") + event_logger = EventLogger(events_path, run_id) if events_path else None + if event_logger: + event_logger.emit({"type": "run_started", "cases": len(cases), "run_dir": str(run_folder)}) results: list[RunResult] = [] failures = 0 for case in cases: - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) results.append(result) if not args.quiet: print(format_status_line(result)) @@ -525,6 +573,10 @@ def handle_batch(args) -> int: summary["diff"] = diff_block summary_path = write_summary(results_path, summary) + summary_by_tag = summary.get("summary_by_tag") + if summary_by_tag: + summary_by_tag_path = summary_path.with_name("summary_by_tag.json") + summary_by_tag_path.write_text(json.dumps(summary_by_tag, ensure_ascii=False, indent=2), encoding="utf-8") latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 678a78b..27ca76a 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -10,7 +10,16 @@ import json from .provider_factory import build_provider -from .runner import RunArtifacts, build_agent, save_artifacts +from 
.runner import Case, EventLogger, RunArtifacts, build_agent, run_one, save_artifacts + + +def _load_json(path: Path) -> object | None: + if not path.exists(): + return None + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception: + return None def _maybe_add_history(entry: str) -> None: @@ -94,25 +103,40 @@ def start_repl( run_id = uuid.uuid4().hex[:8] timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_dir = runs_root / f"{timestamp}_{run_id}" + events_path = run_dir / "events.jsonl" + event_logger = EventLogger(events_path, run_id) + print(f"Events: {events_path}") artifacts: RunArtifacts | None = None try: - artifacts = runner.run_question(line, run_id, run_dir) + case = Case(id=run_id, question=line, tags=[]) + result = run_one(case, runner, run_dir, plan_only=False, event_logger=event_logger) + plan_obj = _load_json(Path(result.artifacts_dir) / "plan.json") + ctx_obj = _load_json(Path(result.artifacts_dir) / "context.json") or {} + artifacts = RunArtifacts( + run_id=run_id, + run_dir=Path(result.artifacts_dir), + question=line, + plan=plan_obj if isinstance(plan_obj, dict) else None, + context=ctx_obj if isinstance(ctx_obj, dict) else None, + answer=result.answer, + raw_synth=None, + error=result.error, + ) last_artifacts = artifacts - save_artifacts(artifacts) if plan_debug_mode in {"on", "once"} and artifacts.plan: print("--- PLAN ---") print(json.dumps(artifacts.plan, ensure_ascii=False, indent=2)) - print(artifacts.answer or "") + print(result.answer or "") except Exception as exc: # pragma: no cover - REPL resilience error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts save_artifacts(error_artifacts) - print(f"Error during run {run_id}: {exc}", file=sys.stderr) + print(f\"Error during run {run_id}: {exc}\", file=sys.stderr) finally: - if plan_debug_mode == "once": - plan_debug_mode = "off" + if plan_debug_mode == \"once\": + plan_debug_mode = \"off\" __all__ = ["start_repl"] diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 0c98f21..1626908 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -81,6 +81,12 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") + batch_p.add_argument("--include-tags", type=str, default=None, help="Comma-separated tags to include") + batch_p.add_argument("--exclude-tags", type=str, default=None, help="Comma-separated tags to exclude") + batch_p.add_argument("--include-ids", type=Path, default=None, help="Path to file with ids to include (one per line)") + batch_p.add_argument("--exclude-ids", type=Path, default=None, help="Path to file with ids to exclude (one per line)") + batch_p.add_argument("--events", choices=["on", "off"], default="on", help="Enable events.jsonl emission") + batch_p.add_argument("--events-file", type=Path, default=None, help="Override events file path") case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 77f6815..bb2e70c 100644 --- 
a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime import json import re import statistics @@ -120,29 +121,49 @@ def saver(feature_name: str, parsed: object) -> None: task_profile=task_profile, ) - def run_question(self, question: str, run_id: str, run_dir: Path, *, plan_only: bool = False) -> RunArtifacts: + def run_question( + self, + case: Case, + run_id: str, + run_dir: Path, + *, + plan_only: bool = False, + event_logger: EventLogger | None = None, + ) -> RunArtifacts: set_run_id(run_id) - artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question, plan_only=plan_only) + artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=case.question, plan_only=plan_only) started = time.perf_counter() try: + if event_logger: + event_logger.emit({"type": "plan_started", "case_id": case.id}) plan_started = time.perf_counter() - plan = self.agent._plan(question) # type: ignore[attr-defined] + plan = self.agent._plan(case.question) # type: ignore[attr-defined] artifacts.timings.plan_s = time.perf_counter() - plan_started artifacts.plan = plan.model_dump() + if event_logger: + event_logger.emit({"type": "plan_built", "case_id": case.id, "plan_path": str(run_dir / "plan.json")}) if not plan_only: + if event_logger: + event_logger.emit({"type": "fetch_started", "case_id": case.id}) fetch_started = time.perf_counter() - ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] + ctx = self.agent._fetch(case.question, plan) # type: ignore[attr-defined] artifacts.timings.fetch_s = time.perf_counter() - fetch_started artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} + if event_logger: + event_logger.emit({"type": "fetch_finished", "case_id": case.id}) + if event_logger: + event_logger.emit({"type": "synth_started", "case_id": case.id}) synth_started = time.perf_counter() - draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] + draft = self.agent._synthesize(case.question, ctx, plan) # type: ignore[attr-defined] artifacts.timings.synth_s = time.perf_counter() - synth_started artifacts.raw_synth = str(draft) parsed = self.agent.domain_parser(draft) artifacts.answer = str(parsed) + if event_logger: + event_logger.emit({"type": "synth_finished", "case_id": case.id}) except Exception as exc: # pragma: no cover - demo fallback artifacts.error = str(exc) finally: @@ -245,9 +266,19 @@ def _build_result( ) -def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: bool = False) -> RunResult: +def run_one( + case: Case, + runner: AgentRunner, + artifacts_root: Path, + *, + plan_only: bool = False, + event_logger: EventLogger | None = None, +) -> RunResult: run_id = uuid.uuid4().hex[:8] run_dir = artifacts_root / f"{case.id}_{run_id}" + case_logger = event_logger.for_case(case.id, run_dir / "events.jsonl") if event_logger else None + if case_logger: + case_logger.emit({"type": "case_started", "case_id": case.id, "run_dir": str(run_dir)}) if case.skip: run_dir.mkdir(parents=True, exist_ok=True) _save_text(run_dir / "skipped.txt", "Skipped by request") @@ -267,14 +298,36 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: expected_check=None, ) save_status(result) + if case_logger: + case_logger.emit({"type": "case_finished", "case_id": case.id, "status": "skipped"}) return result - artifacts = runner.run_question(case.question, run_id, run_dir, plan_only=plan_only) + artifacts = 
runner.run_question(case, run_id, run_dir, plan_only=plan_only, event_logger=case_logger) save_artifacts(artifacts) expected_check = None if plan_only else _match_expected(case, artifacts.answer) result = _build_result(case, artifacts, run_dir, expected_check) save_status(result) + if case_logger: + if result.status == "error": + case_logger.emit( + { + "type": "case_failed", + "case_id": case.id, + "status": result.status, + "reason": result.reason, + "artifacts_dir": result.artifacts_dir, + } + ) + case_logger.emit( + { + "type": "case_finished", + "case_id": case.id, + "status": result.status, + "duration_ms": result.duration_ms, + "artifacts_dir": result.artifacts_dir, + } + ) return result @@ -552,6 +605,7 @@ def format_status_line(result: RunResult) -> str: "ExpectedCheck", "RunArtifacts", "RunResult", + "EventLogger", "build_agent", "compare_results", "format_status_line", @@ -563,3 +617,21 @@ def format_status_line(result: RunResult) -> str: "summarize", "_match_expected", ] +class EventLogger: + def __init__(self, path: Path | None, run_id: str): + self.path = path + self.run_id = run_id + if path: + path.parent.mkdir(parents=True, exist_ok=True) + + def emit(self, event: Dict[str, object]) -> None: + if not self.path: + return + payload = {"timestamp": datetime.datetime.utcnow().isoformat() + "Z", "run_id": self.run_id, **event} + with self.path.open("a", encoding="utf-8") as f: + f.write(json.dumps(payload, ensure_ascii=False) + "\n") + + def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": + if path is None: + return self + return EventLogger(path, self.run_id) From 44c1328cab3afe4db4c3acb51aa0492f91f00a8c Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:53:08 +0300 Subject: [PATCH 16/92] Include tags on skipped results to avoid constructor errors --- examples/demo_qa/runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index bb2e70c..5bd7d17 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -291,6 +291,7 @@ def run_one( details=None, artifacts_dir=str(run_dir), duration_ms=0, + tags=list(case.tags), answer=None, error=None, plan_path=None, From 56af0eaae63d4016402d5d79a4fa190ee42effae Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:04:24 +0300 Subject: [PATCH 17/92] Persist summary_by_tag in batch outputs --- examples/demo_qa/batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index cae1ebc..7070ded 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -563,6 +563,7 @@ def handle_batch(args) -> int: "ended_at": ended_at.isoformat() + "Z", "duration_ms": duration_ms, "counts": counts, + "summary_by_tag": counts.get("summary_by_tag"), "exit_code": exit_code, "config_fingerprint": build_config_fingerprint(settings, args.cases), "results_path": str(results_path), From 806bd20f527d718d7df0b99b38fa75dbc8a4054f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:09:03 +0300 Subject: [PATCH 18/92] Exclude .runs/.cache from data fingerprint --- examples/demo_qa/batch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 7070ded..434fe89 100644 --- 
a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -120,10 +120,13 @@ def _fingerprint_dir(data_dir: Path) -> Mapping[str, object]: files: list[dict] = [] for path in sorted(data_dir.rglob("*")): if path.is_file(): + rel = path.relative_to(data_dir) + if rel.parts and rel.parts[0] in {".runs", ".cache"}: + continue stat = path.stat() files.append( { - "path": str(path.relative_to(data_dir)), + "path": str(rel), "size": stat.st_size, "mtime": stat.st_mtime, } From ec567d6ff0fa68811f4b145949d17b39b7384cc2 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:15:20 +0300 Subject: [PATCH 19/92] Validate regexes and fix logging/reporting gaps --- README_demo_qa.md | 2 +- examples/demo_qa/batch.py | 10 ++++++++-- examples/demo_qa/chat_repl.py | 6 +++--- examples/demo_qa/runner.py | 5 +++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index 4036a23..ebfd92b 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -64,7 +64,7 @@ python -m examples.demo_qa.cli batch \ --out results.jsonl ``` -* Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). +* Артефакты по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). * `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. * Флаги `--fail-on (error|mismatch/unchecked/any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). * Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. 
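As an aside to the flags above: this patch also makes `load_cases` compile `expected_regex` while reading the file, so a malformed pattern aborts the run before any model calls, raising a `ValueError` that names the offending line. Below is a minimal sketch of producing such a `cases.jsonl` from Python — the field names (`id`, `question`, `expected_regex`, `expected_contains`, `skip`) follow the README, while the concrete ids, questions, and expected values are invented for illustration:

```python
# Minimal sketch of generating cases.jsonl for the batch runner.
# Field names follow README_demo_qa.md; the ids, questions, and
# expectations below are made-up examples, not real demo cases.
import json
from pathlib import Path

cases = [
    {"id": "total_orders", "question": "How many orders are in the dataset?", "expected_regex": r"^\d+$"},
    {"id": "top_region", "question": "Which region has the most customers?", "expected_contains": "North"},
    {"id": "draft_case", "question": "Placeholder question", "skip": True},
]

lines = [json.dumps(case, ensure_ascii=False) for case in cases]
Path("cases.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")
```

A file written this way passes the new validation; a case with, say, `"expected_regex": "("` would be rejected at load time rather than surfacing as a runtime error mid-batch.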
diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 434fe89..cae75ec 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -313,6 +313,7 @@ def compare_runs(base_path: Path, new_path: Path) -> dict[str, object]: "new_fail": new_fail, "fixed": fixed, "still_fail": still_fail, + "all_ids": list(new.keys()), "base_counts": base_counts, "new_counts": new_counts, "base_median": base_med, @@ -373,8 +374,9 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: suite = ET.Element("testsuite", name="demo_qa_compare") bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] fixed = compare["fixed"] # type: ignore[assignment] - cases = compare["new_counts"].get("total", 0) if isinstance(compare.get("new_counts"), dict) else 0 - suite.set("tests", str(cases)) + all_ids = set(compare.get("all_ids", []) or []) # type: ignore[arg-type] + cases_total = len(all_ids) + suite.set("tests", str(cases_total)) suite.set("failures", str(len(bad))) suite.set("errors", "0") @@ -389,6 +391,10 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: for row in fixed: ET.SubElement(suite, "testcase", name=row["id"]) + ok_ids = all_ids - {row["id"] for row in bad} - {row["id"] for row in fixed} + for cid in ok_ids: + ET.SubElement(suite, "testcase", name=cid) + tree = ET.ElementTree(suite) out_path.write_text(ET.tostring(suite, encoding="unicode"), encoding="utf-8") diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 27ca76a..65bb0f7 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -133,10 +133,10 @@ def start_repl( error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts save_artifacts(error_artifacts) - print(f\"Error during run {run_id}: {exc}\", file=sys.stderr) + print(f"Error during run {run_id}: {exc}", file=sys.stderr) finally: - if plan_debug_mode == \"once\": - plan_debug_mode = \"off\" + if plan_debug_mode == "once": + plan_debug_mode = "off" __all__ = ["start_repl"] diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 5bd7d17..22bec9a 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -427,6 +427,11 @@ def load_cases(path: Path) -> List[Case]: ]: if val is not None and str(val).strip() == "": raise ValueError(f"{field_name} must not be empty on line {lineno}") + if expected_regex is not None: + try: + re.compile(expected_regex) + except re.error as exc: + raise ValueError(f"Invalid expected_regex on line {lineno}: {exc}") from exc case = Case( id=case_id, question=str(payload["question"]), From e47813eb1bc6e824d6f22c97fe1b375c55cc6a14 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:31:36 +0300 Subject: [PATCH 20/92] Write summary and latest_results for case runs --- examples/demo_qa/batch.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index cae75ec..0176358 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -733,12 +733,24 @@ def handle_case_run(args) -> int: result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) write_results(results_path, [result]) - save_path = run_folder.parent / "latest.txt" - save_path.parent.mkdir(parents=True, exist_ok=True) - save_path.write_text(str(run_folder), encoding="utf-8") + counts = summarize([result]) + 
summary = { + "run_id": run_folder.name, + "timestamp": timestamp + "Z", + "counts": counts, + "results_path": str(results_path), + "fail_on": "bad", + "require_assert": False, + } + summary_path = write_summary(results_path, summary) + save_dir = run_folder.parent + save_dir.mkdir(parents=True, exist_ok=True) + (save_dir / "latest.txt").write_text(str(run_folder), encoding="utf-8") + (save_dir / "latest_results.txt").write_text(str(results_path), encoding="utf-8") print(format_status_line(result)) print(f"Artifacts: {result.artifacts_dir}") + print(f"Summary: {summary_path}") return 0 From b982b869fa38038289e4536a79ed2d3ae022c283 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:41:43 +0300 Subject: [PATCH 21/92] Fix pass-rate delta display in stats --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 0176358..4ebe878 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -809,7 +809,7 @@ def _print_stats(entries: list[dict]) -> None: delta_median = median - prev.get("median_total_s") pr_display = f"{pass_rate*100:.1f}%" if pass_rate is not None else "n/a" median_display = f"{median:.2f}" if median is not None else "n/a" - dp = f"{delta_pass:+.1f}%" if delta_pass is not None else "n/a" + dp = f"{delta_pass*100:+.1f}pp" if delta_pass is not None else "n/a" dm = f"{delta_median:+.2f}" if delta_median is not None else "n/a" print( f"{entry.get('run_id',''):<10} " From 87edddf7ee2862f0f691dd2224e44e89dac6e66e Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:58:56 +0300 Subject: [PATCH 22/92] Provide bundled pydantic settings shim --- src/pydantic_settings/__init__.py | 98 +++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 src/pydantic_settings/__init__.py diff --git a/src/pydantic_settings/__init__.py b/src/pydantic_settings/__init__.py new file mode 100644 index 0000000..bf9fdae --- /dev/null +++ b/src/pydantic_settings/__init__.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import os +import tomllib +from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping + +from pydantic import BaseModel + + +def SettingsConfigDict(**kwargs: Any) -> Dict[str, Any]: + return dict(**kwargs) + + +def _deep_update(base: Dict[str, Any], updates: Mapping[str, Any]) -> Dict[str, Any]: + for key, value in updates.items(): + if isinstance(value, Mapping) and isinstance(base.get(key), dict): + base[key] = _deep_update(base[key], value) + else: + base[key] = value + return base + + +class TomlConfigSettingsSource: + def __init__(self, settings_cls: type[BaseModel], path: os.PathLike | str | None): + self._path = path + + def __call__(self) -> Dict[str, Any]: + if not self._path: + return {} + try: + with open(self._path, "rb") as toml_file: + return tomllib.load(toml_file) + except FileNotFoundError: + return {} + + +class BaseSettings(BaseModel): + model_config: ClassVar[SettingsConfigDict] = {} + + def __init__(self, **values: Any) -> None: + sources = self.settings_customise_sources( + self.__class__, + self._build_init_settings(values), + self._build_env_settings(), + self._build_dotenv_settings(), + self._build_file_secret_settings(), + ) + merged: Dict[str, Any] = {} + for source in reversed(tuple(sources)): + merged = _deep_update(merged, source() or {}) + 
super().__init__(**merged) + + @classmethod + def settings_customise_sources( + cls, + settings_cls: type[BaseModel], + init_settings: Callable[[], Mapping[str, Any]], + env_settings: Callable[[], Mapping[str, Any]], + dotenv_settings: Callable[[], Mapping[str, Any]], + file_secret_settings: Callable[[], Mapping[str, Any]], + ) -> Iterable[Callable[[], Mapping[str, Any]]]: + return (init_settings, env_settings, dotenv_settings, file_secret_settings) + + @staticmethod + def _build_init_settings(values: Mapping[str, Any]) -> Callable[[], Mapping[str, Any]]: + return lambda: dict(values) + + @classmethod + def _build_env_settings(cls) -> Callable[[], Mapping[str, Any]]: + prefix = cls.model_config.get("env_prefix", "") or "" + delimiter = cls.model_config.get("env_nested_delimiter", "__") or "__" + + def source() -> Dict[str, Any]: + settings: Dict[str, Any] = {} + for key, value in os.environ.items(): + if not key.startswith(prefix): + continue + raw_key = key[len(prefix) :] + parts = raw_key.split(delimiter) if delimiter else [raw_key] + cls._insert_nested(settings, [part.lower() for part in parts], value) + return settings + + return source + + @classmethod + def _build_dotenv_settings(cls) -> Callable[[], Mapping[str, Any]]: + return lambda: {} + + @classmethod + def _build_file_secret_settings(cls) -> Callable[[], Mapping[str, Any]]: + return lambda: {} + + @staticmethod + def _insert_nested(target: Dict[str, Any], parts: list[str], value: Any) -> None: + current = target + for part in parts[:-1]: + current = current.setdefault(part, {}) + current[parts[-1]] = value From 69039cd6e0b8983a74203e254703cc85203b8fdb Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 22:14:41 +0300 Subject: [PATCH 23/92] Centralize failure status handling --- examples/demo_qa/batch.py | 39 ++++++++++++------------------------- tests/test_demo_qa_batch.py | 19 ++++++++++++++++++ 2 files changed, 31 insertions(+), 27 deletions(-) create mode 100644 tests/test_demo_qa_batch.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 4ebe878..31a428f 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -61,18 +61,12 @@ def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: return (counts.get("ok", 0) or 0) / denom -def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: +def bad_statuses(fail_on: str, require_assert: bool) -> set[str]: unchecked = {"unchecked", "plan_only"} bad = {"error", "failed", "mismatch"} if fail_on == "error": bad = {"error"} - elif fail_on == "mismatch": - bad = {"error", "failed", "mismatch"} - elif fail_on == "unchecked": - bad |= unchecked - elif fail_on == "bad": - bad = {"error", "failed", "mismatch"} - elif fail_on == "any": + elif fail_on in {"unchecked", "any"}: bad |= unchecked elif fail_on == "skipped": bad |= {"skipped"} @@ -80,7 +74,11 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: if require_assert: bad |= unchecked - return status in bad + return bad + + +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + return status in bad_statuses(fail_on, require_assert) def _hash_file(path: Path) -> str: @@ -234,7 +232,7 @@ def handle_chat(args) -> int: def _bad_statuses() -> set[str]: - return {"mismatch", "error", "failed"} + return bad_statuses("bad", False) def _reason(res: RunResult) -> str: @@ -424,23 +422,9 @@ def _select_cases_for_rerun( filtered.append(case) if not baseline_for_filter: 
return filtered - bad_statuses = {"mismatch", "failed", "error"} - if fail_on == "error": - bad_statuses = {"error"} - elif fail_on == "mismatch": - bad_statuses = {"mismatch", "failed", "error"} - elif fail_on == "unchecked": - bad_statuses |= {"unchecked", "plan_only"} - elif fail_on == "bad": - bad_statuses = {"mismatch", "failed", "error"} - elif fail_on == "any": - bad_statuses |= {"unchecked", "plan_only"} - elif fail_on == "skipped": - bad_statuses |= {"skipped"} - - if require_assert: - bad_statuses |= {"unchecked", "plan_only"} - target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} + target_ids = { + case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses(fail_on, require_assert) + } return [case for case in filtered if case.id in target_ids] @@ -858,6 +842,7 @@ def handle_compare(args) -> int: "handle_case_open", "handle_case_run", "handle_chat", + "bad_statuses", "is_failure", "write_results", "write_summary", diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py new file mode 100644 index 0000000..f52da8a --- /dev/null +++ b/tests/test_demo_qa_batch.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import itertools + +import pytest + +from examples.demo_qa.batch import bad_statuses, is_failure + + +@pytest.mark.parametrize( + "fail_on,require_assert", + itertools.product(["bad", "error", "mismatch", "unchecked", "any", "skipped"], [False, True]), +) +def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> None: + statuses = ["ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] + bad = bad_statuses(fail_on, require_assert) + assert bad # sanity check + for status in statuses: + assert is_failure(status, fail_on, require_assert) == (status in bad) From 3f214d70fcd95e0ec8adfcad1040d4ff743746ac Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 22:58:56 +0300 Subject: [PATCH 24/92] Allow pytest to import examples package --- pytest.ini | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytest.ini b/pytest.ini index 1abd662..1e1d0bf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,7 +2,9 @@ [pytest] minversion = 7.0 -pythonpath = src +pythonpath = + src + . testpaths = tests @@ -57,4 +59,4 @@ filterwarnings = error::DeprecationWarning:fetchgraph.* error::PendingDeprecationWarning:fetchgraph.* ignore::DeprecationWarning - ignore::PendingDeprecationWarning \ No newline at end of file + ignore::PendingDeprecationWarning From 62e749c5abfc3db62e3bbc2a6d5c611f83289eec Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:16:11 +0300 Subject: [PATCH 25/92] Trim fail_on choices to supported modes --- README_demo_qa.md | 2 +- examples/demo_qa/batch.py | 162 +++++------------------- examples/demo_qa/cli.py | 9 +- examples/demo_qa/runner.py | 231 +++++++++++++++++++++++++---------- tests/test_demo_qa_batch.py | 2 +- tests/test_demo_qa_runner.py | 46 +++---- 6 files changed, 225 insertions(+), 227 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index ebfd92b..86f9e67 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -66,7 +66,7 @@ python -m examples.demo_qa.cli batch \ * Артефакты по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). 
* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. -* Флаги `--fail-on (error|mismatch/unchecked/any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). +* Флаги `--fail-on (error|bad|unchecked|any|skipped)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). * Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. * Быстрый фокус на упавших: `--only-failed` возьмёт `runs/latest/results.jsonl`, `--show-artifacts` печатает пути, репро-команды выводятся для каждого FAIL. * Команды уровня кейса: `demo_qa case run --cases ...` и `demo_qa case open --run runs/latest` для быстрого воспроизведения. diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 31a428f..4a1d392 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -17,9 +17,11 @@ Case, EventLogger, RunResult, + bad_statuses, build_agent, - compare_results, + diff_runs, format_status_line, + is_failure, load_cases, load_results, run_one, @@ -41,17 +43,6 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path -def _median_duration(results: Mapping[str, RunResult]) -> Optional[float]: - durations = [res.duration_ms for res in results.values() if res.duration_ms is not None] - if not durations: - return None - durations.sort() - mid = len(durations) // 2 - if len(durations) % 2 == 1: - return durations[mid] / 1000 - return (durations[mid - 1] + durations[mid]) / 2000 - - def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: total = int(counts.get("total", 0) or 0) skipped = int(counts.get("skipped", 0) or 0) @@ -61,26 +52,6 @@ def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: return (counts.get("ok", 0) or 0) / denom -def bad_statuses(fail_on: str, require_assert: bool) -> set[str]: - unchecked = {"unchecked", "plan_only"} - bad = {"error", "failed", "mismatch"} - if fail_on == "error": - bad = {"error"} - elif fail_on in {"unchecked", "any"}: - bad |= unchecked - elif fail_on == "skipped": - bad |= {"skipped"} - - if require_assert: - bad |= unchecked - - return bad - - -def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: - return status in bad_statuses(fail_on, require_assert) - - def _hash_file(path: Path) -> str: data = path.read_bytes() return hashlib.sha256(data).hexdigest() @@ -231,94 +202,10 @@ def handle_chat(args) -> int: return 0 -def _bad_statuses() -> set[str]: - return bad_statuses("bad", False) - - -def _reason(res: RunResult) -> str: - if res.reason: - return res.reason - if res.error: - return res.error - if res.expected_check and res.expected_check.detail: - return res.expected_check.detail - return "" - - -def _artifact_links(res: RunResult) -> dict[str, str]: - links = {} - base = Path(res.artifacts_dir) - for name in ["plan.json", "answer.txt", "raw_synth.txt", "status.json"]: - path = base / name - if path.exists(): - links[name] = str(path) - return links - - -def compare_runs(base_path: Path, new_path: Path) -> dict[str, object]: +def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_assert: bool) -> dict[str, object]: base = load_results(base_path) new = 
load_results(new_path) - bad = _bad_statuses() - - new_fail: list[dict] = [] - fixed: list[dict] = [] - still_fail: list[dict] = [] - - for case_id, new_res in new.items(): - old_res = base.get(case_id) - if old_res is None: - continue - old_bad = old_res.status in bad - new_bad = new_res.status in bad - if not old_bad and new_bad: - new_fail.append( - { - "id": case_id, - "from": old_res.status, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), - } - ) - elif old_bad and not new_bad: - fixed.append( - { - "id": case_id, - "from": old_res.status, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), - } - ) - elif old_bad and new_bad: - still_fail.append( - { - "id": case_id, - "from": old_res.status, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), - } - ) - - base_counts = summarize(base.values()) - new_counts = summarize(new.values()) - base_med = _median_duration(base) - new_med = _median_duration(new) - base_avg = base_counts.get("avg_total_s") - new_avg = new_counts.get("avg_total_s") - return { - "new_fail": new_fail, - "fixed": fixed, - "still_fail": still_fail, - "all_ids": list(new.keys()), - "base_counts": base_counts, - "new_counts": new_counts, - "base_median": base_med, - "new_median": new_med, - "base_avg": base_avg, - "new_avg": new_avg, - } + return diff_runs(base.values(), new.values(), fail_on=fail_on, require_assert=require_assert) def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str: @@ -348,7 +235,7 @@ def table(title: str, rows: list[dict]) -> None: return lines.append("| id | status | reason | artifacts |") lines.append("|---|---|---|---|") - for row in rows: + for row in sorted(rows, key=lambda r: r.get("id", "")): artifacts = row.get("artifacts", {}) links = ", ".join(f"[{k}]({v})" for k, v in artifacts.items()) lines.append( @@ -372,13 +259,14 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: suite = ET.Element("testsuite", name="demo_qa_compare") bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] fixed = compare["fixed"] # type: ignore[assignment] - all_ids = set(compare.get("all_ids", []) or []) # type: ignore[arg-type] + all_ids_list = list(compare.get("all_ids", []) or []) # type: ignore[arg-type] + all_ids = sorted(all_ids_list) cases_total = len(all_ids) suite.set("tests", str(cases_total)) suite.set("failures", str(len(bad))) suite.set("errors", "0") - for row in bad: + for row in sorted(bad, key=lambda r: r.get("id", "")): tc = ET.SubElement(suite, "testcase", name=row["id"]) msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) @@ -386,10 +274,12 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in artifacts.items()) - for row in fixed: + for row in sorted(fixed, key=lambda r: r.get("id", "")): ET.SubElement(suite, "testcase", name=row["id"]) - ok_ids = all_ids - {row["id"] for row in bad} - {row["id"] for row in fixed} + bad_ids = {row["id"] for row in bad} + fixed_ids = {row["id"] for row in fixed} + ok_ids = [cid for cid in all_ids if cid not in bad_ids and cid not in fixed_ids] for cid in ok_ids: ET.SubElement(suite, "testcase", name=cid) @@ -535,12 +425,16 @@ def handle_batch(args) -> int: write_results(results_path, results) counts = summarize(results) - results_by_id = {r.id: r for r in results} 
diff_block: dict | None = None baseline_path: Path | None = None if baseline_for_compare: baseline_path = args.compare_to or baseline_filter_path - diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) + diff = diff_runs( + baseline_for_compare.values(), + results, + fail_on=args.fail_on, + require_assert=args.require_assert, + ) if baseline_path: diff["baseline_path"] = str(baseline_path) diff_block = diff @@ -648,20 +542,20 @@ def handle_batch(args) -> int: print(summary_line) if diff_block: print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" + f"Δ vs baseline: +{len(diff_block.get('fixed', []))} fixed, " + f"-{len(diff_block.get('new_fail', []))} regressions, " + f"{len(diff_block.get('still_fail', []))} still failing, " + f"{len(diff_block.get('new_cases', []))} new cases" ) return exit_code print(summary_line) if diff_block: print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" + f"Δ vs baseline: +{len(diff_block.get('fixed', []))} fixed, " + f"-{len(diff_block.get('new_fail', []))} regressions, " + f"{len(diff_block.get('still_fail', []))} still failing, " + f"{len(diff_block.get('new_cases', []))} new cases" ) failures_list: dict[str, RunResult] = {} @@ -828,7 +722,7 @@ def handle_compare(args) -> int: if not args.base.exists() or not args.new.exists(): print("Base or new results file not found.", file=sys.stderr) return 2 - comparison = compare_runs(args.base, args.new) + comparison = compare_runs(args.base, args.new, fail_on=args.fail_on, require_assert=args.require_assert) report = render_markdown(comparison, args.out) print(report) if args.junit: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 1626908..2ca493b 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -63,7 +63,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") batch_p.add_argument( "--fail-on", - choices=["error", "mismatch", "bad", "unchecked", "any", "skipped"], + choices=["error", "bad", "unchecked", "any", "skipped"], default="bad", help="Which statuses should cause a failing exit code", ) @@ -120,6 +120,13 @@ def build_parser() -> argparse.ArgumentParser: compare_p.add_argument("--new", type=Path, required=True, help="Path to new results.jsonl") compare_p.add_argument("--out", type=Path, default=None, help="Path to markdown report to write") compare_p.add_argument("--junit", type=Path, default=None, help="Path to junit xml output") + compare_p.add_argument( + "--fail-on", + choices=["error", "bad", "unchecked", "any", "skipped"], + default="bad", + help="Which statuses should be treated as failures when diffing", + ) + compare_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures when diffing") return parser diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 22bec9a..7d841b8 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -531,65 +531,158 @@ def load_results(path: Path) -> Dict[str, RunResult]: return results -def _bucket(status: str, checked: bool, 
require_assert: bool) -> str: - if status == "ok": - return "OK" if checked else "UNCHECKED" - if status in {"mismatch", "failed", "error"}: - return "BAD" - if status in {"unchecked", "plan_only"}: - return "BAD" if require_assert else "UNCHECKED" - return "NEUTRAL" - - -def compare_results( - baseline: Mapping[str, RunResult], - current: Mapping[str, RunResult], +def bad_statuses(fail_on: str, require_assert: bool) -> set[str]: + unchecked = {"unchecked", "plan_only"} + bad = {"error", "failed", "mismatch"} + if fail_on == "error": + bad = {"error"} + elif fail_on in {"unchecked", "any"}: + bad |= unchecked + elif fail_on == "skipped": + bad |= {"skipped"} + + if require_assert: + bad |= unchecked + + return bad + + +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + return status in bad_statuses(fail_on, require_assert) + + +def _artifact_links(res: RunResult) -> dict[str, str]: + links: dict[str, str] = {} + base = Path(res.artifacts_dir) + for name in ["plan.json", "answer.txt", "raw_synth.txt", "status.json"]: + path = base / name + if path.exists(): + links[name] = str(path) + return links + + +def _reason(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + if res.expected_check and res.expected_check.detail: + return res.expected_check.detail + return "" + + +def _median_duration(results: Mapping[str, RunResult]) -> float | None: + durations = [res.duration_ms for res in results.values() if res.duration_ms is not None] + if not durations: + return None + durations.sort() + mid = len(durations) // 2 + if len(durations) % 2 == 1: + return durations[mid] / 1000 + return (durations[mid - 1] + durations[mid]) / 2000 + + +def _count_bad_from_summary(counts: Mapping[str, object], fail_on: str, require_assert: bool) -> int: + bad = bad_statuses(fail_on, require_assert) + total = 0 + for status in bad: + try: + total += int(counts.get(status, 0) or 0) + except Exception: + continue + return total + + +def diff_runs( + base_results: Iterable[RunResult], + new_results: Iterable[RunResult], *, + fail_on: str, require_assert: bool, ) -> Dict[str, object]: - new_ok: List[str] = [] - regressed: List[str] = [] - still_ok: List[str] = [] - still_bad: List[str] = [] - new_unchecked: List[str] = [] - status_changes: Dict[str, Dict[str, str]] = {} - new_cases: List[str] = [] - - for case_id, res in current.items(): - base_res = baseline.get(case_id) - new_bucket = _bucket(res.status, res.checked, require_assert) + base_by_id = {res.id: res for res in base_results} + new_by_id = {res.id: res for res in new_results} + all_ids = sorted(new_by_id.keys()) + + bad = bad_statuses(fail_on, require_assert) + + def _is_bad(res: RunResult | None) -> bool: + return bool(res and res.status in bad) + + def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict[str, object]: + return { + "id": case_id, + "from": base_res.status if base_res else None, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + + new_fail: list[dict[str, object]] = [] + fixed: list[dict[str, object]] = [] + still_fail: list[dict[str, object]] = [] + changed_status: list[dict[str, str | None]] = [] + new_cases: list[str] = [] + + for case_id in all_ids: + new_res = new_by_id[case_id] + base_res = base_by_id.get(case_id) + base_bad = _is_bad(base_res) + new_bad = _is_bad(new_res) + if base_res is None: new_cases.append(case_id) - if new_bucket == "OK": - new_ok.append(case_id) - elif new_bucket == "BAD": - 
still_bad.append(case_id) - status_changes[case_id] = {"from": "new", "to": res.status} + else: + if base_res.status != new_res.status: + changed_status.append({"id": case_id, "from": base_res.status, "to": new_res.status}) + + if base_res is None: continue - base_bucket = _bucket(base_res.status, base_res.checked, require_assert) - if base_res.checked and res.status == "unchecked": - new_unchecked.append(case_id) - if base_bucket == "OK" and new_bucket in {"BAD", "UNCHECKED"}: - regressed.append(case_id) - elif base_bucket in {"BAD", "UNCHECKED"} and new_bucket == "OK": - new_ok.append(case_id) - elif base_bucket == "OK" and new_bucket == "OK": - still_ok.append(case_id) - elif base_bucket in {"BAD", "UNCHECKED"} and new_bucket in {"BAD", "UNCHECKED"}: - still_bad.append(case_id) - - if base_res.status != res.status: - status_changes[case_id] = {"from": base_res.status, "to": res.status} + if not base_bad and new_bad: + new_fail.append(_entry(case_id, base_res, new_res)) + elif base_bad and not new_bad: + fixed.append(_entry(case_id, base_res, new_res)) + elif base_bad and new_bad: + still_fail.append(_entry(case_id, base_res, new_res)) + + base_counts = summarize(base_by_id.values()) + new_counts = summarize(new_by_id.values()) + base_med = _median_duration(base_by_id) + new_med = _median_duration(new_by_id) + base_avg = base_counts.get("avg_total_s") + new_avg = new_counts.get("avg_total_s") + + def _count_delta(key: str) -> int | float | None: + base_val = base_counts.get(key) + new_val = new_counts.get(key) + if isinstance(base_val, (int, float)) and isinstance(new_val, (int, float)): + return new_val - base_val + return None + + delta_keys = {"ok", "mismatch", "failed", "error", "skipped", "unchecked", "plan_only", "total"} + count_deltas = {k: _count_delta(k) for k in delta_keys} return { - "new_ok": new_ok, - "regressed": regressed, - "still_ok": still_ok, - "still_bad": still_bad, - "new_unchecked": new_unchecked, - "status_changes": status_changes, + "all_ids": all_ids, + "new_fail": new_fail, + "fixed": fixed, + "still_fail": still_fail, + "changed_status": changed_status, "new_cases": new_cases, + "base_counts": base_counts, + "new_counts": new_counts, + "counts_delta": count_deltas, + "base_median": base_med, + "new_median": new_med, + "base_avg": base_avg, + "new_avg": new_avg, + "median_delta": (new_med - base_med) if (new_med is not None and base_med is not None) else None, + "avg_delta": (new_avg - base_avg) if (isinstance(new_avg, (int, float)) and isinstance(base_avg, (int, float))) else None, + "base_bad_total": _count_bad_from_summary(base_counts, fail_on, require_assert), + "new_bad_total": _count_bad_from_summary(new_counts, fail_on, require_assert), + "fail_on": fail_on, + "require_assert": require_assert, } @@ -605,24 +698,6 @@ def format_status_line(result: RunResult) -> str: return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" -__all__ = [ - "AgentRunner", - "Case", - "ExpectedCheck", - "RunArtifacts", - "RunResult", - "EventLogger", - "build_agent", - "compare_results", - "format_status_line", - "load_results", - "load_cases", - "run_one", - "save_artifacts", - "save_status", - "summarize", - "_match_expected", -] class EventLogger: def __init__(self, path: Path | None, run_id: str): self.path = path @@ -641,3 +716,25 @@ def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": if path is None: return self return EventLogger(path, self.run_id) + + +__all__ = [ + "AgentRunner", + "Case", + "ExpectedCheck", + 
"RunArtifacts", + "RunResult", + "EventLogger", + "build_agent", + "bad_statuses", + "diff_runs", + "format_status_line", + "is_failure", + "load_results", + "load_cases", + "run_one", + "save_artifacts", + "save_status", + "summarize", + "_match_expected", +] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index f52da8a..004548d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize( "fail_on,require_assert", - itertools.product(["bad", "error", "mismatch", "unchecked", "any", "skipped"], [False, True]), + itertools.product(["bad", "error", "unchecked", "any", "skipped"], [False, True]), ) def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> None: statuses = ["ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index ad14b00..5d61ed5 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -1,6 +1,6 @@ from __future__ import annotations -from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results, summarize +from examples.demo_qa.runner import Case, RunResult, _match_expected, diff_runs, summarize def test_match_expected_unchecked_when_no_expectations() -> None: @@ -26,9 +26,9 @@ def test_match_expected_contains_pass_and_fail() -> None: assert missing_answer.detail == "no answer" -def test_compare_results_tracks_regressions_and_improvements() -> None: - baseline = { - "ok_to_bad": RunResult( +def test_diff_runs_tracks_regressions_and_improvements() -> None: + baseline = [ + RunResult( id="ok_to_bad", question="", status="ok", @@ -39,7 +39,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "err_to_ok": RunResult( + RunResult( id="err_to_ok", question="", status="error", @@ -50,10 +50,10 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "checked_to_unchecked": RunResult( - id="checked_to_unchecked", + RunResult( + id="still_bad", question="", - status="ok", + status="mismatch", checked=True, reason=None, details=None, @@ -61,10 +61,10 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - } + ] - current = { - "ok_to_bad": RunResult( + current = [ + RunResult( id="ok_to_bad", question="", status="mismatch", @@ -75,7 +75,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "err_to_ok": RunResult( + RunResult( id="err_to_ok", question="", status="ok", @@ -86,18 +86,18 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "checked_to_unchecked": RunResult( - id="checked_to_unchecked", + RunResult( + id="still_bad", question="", - status="unchecked", - checked=False, + status="failed", + checked=True, reason=None, details=None, artifacts_dir="/tmp/ok2", duration_ms=10, tags=[], ), - "new_ok": RunResult( + RunResult( id="new_ok", question="", status="ok", @@ -108,14 +108,14 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - } + ] - diff = compare_results(baseline, current, require_assert=True) + diff = diff_runs(baseline, current, fail_on="bad", require_assert=True) - assert "ok_to_bad" in diff["regressed"] - assert "err_to_ok" in diff["new_ok"] - assert "checked_to_unchecked" in diff["new_unchecked"] - assert "new_ok" in 
diff["new_ok"] + assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad"} + assert {row["id"] for row in diff["fixed"]} == {"err_to_ok"} + assert {row["id"] for row in diff["still_fail"]} == {"still_bad"} + assert diff["new_cases"] == ["new_ok"] def test_summarize_counts_checked_and_unchecked() -> None: From e280ee458a322be91ea0aebc5f953c1b2c68d9d1 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:25:05 +0300 Subject: [PATCH 26/92] Slim run metadata and stabilize diffs --- examples/demo_qa/batch.py | 46 ++++++++++++++++++++++++-------------- examples/demo_qa/cli.py | 5 +++++ examples/demo_qa/runner.py | 4 ++-- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 4a1d392..1b5b40e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -39,7 +39,7 @@ def write_results(out_path: Path, results: Iterable[RunResult]) -> None: def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") - summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") return summary_path @@ -85,23 +85,32 @@ def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object] } -def _fingerprint_dir(data_dir: Path) -> Mapping[str, object]: - files: list[dict] = [] +def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: + entries: list[dict] = [] + total_bytes = 0 + files_count = 0 for path in sorted(data_dir.rglob("*")): if path.is_file(): rel = path.relative_to(data_dir) if rel.parts and rel.parts[0] in {".runs", ".cache"}: continue stat = path.stat() - files.append( - { - "path": str(rel), - "size": stat.st_size, - "mtime": stat.st_mtime, - } - ) - digest = hashlib.sha256(json.dumps(files, sort_keys=True).encode("utf-8")).hexdigest() - return {"hash": digest, "files": files} + files_count += 1 + total_bytes += stat.st_size + if verbose: + entries.append( + { + "path": str(rel), + "size": stat.st_size, + "mtime": stat.st_mtime, + } + ) + digest_payload = entries if verbose else [{"files_count": files_count, "bytes_total": total_bytes}] + digest = hashlib.sha256(json.dumps(digest_payload, sort_keys=True).encode("utf-8")).hexdigest() + fingerprint: dict[str, object] = {"hash": digest, "files_count": files_count, "bytes_total": total_bytes} + if verbose: + fingerprint["files"] = entries + return fingerprint def _git_sha() -> Optional[str]: @@ -283,7 +292,6 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: for cid in ok_ids: ET.SubElement(suite, "testcase", name=cid) - tree = ET.ElementTree(suite) out_path.write_text(ET.tostring(suite, encoding="unicode"), encoding="utf-8") @@ -464,7 +472,9 @@ def handle_batch(args) -> int: summary_by_tag = summary.get("summary_by_tag") if summary_by_tag: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") - summary_by_tag_path.write_text(json.dumps(summary_by_tag, ensure_ascii=False, indent=2), encoding="utf-8") + summary_by_tag_path.write_text( + json.dumps(summary_by_tag, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" + ) latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" @@ -475,7 +485,7 @@ def handle_batch(args) -> int: config_hash = _hash_file(args.config) if 
args.config else None schema_hash = _hash_file(args.schema) cases_hash = _hash_file(args.cases) - data_fingerprint = _fingerprint_dir(args.data) + data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) llm_settings = settings.llm run_meta = { "run_id": run_id, @@ -504,7 +514,9 @@ def handle_batch(args) -> int: "summary_path": str(summary_path), "run_dir": str(run_folder), } - (run_folder / "run_meta.json").write_text(json.dumps(run_meta, ensure_ascii=False, indent=2), encoding="utf-8") + (run_folder / "run_meta.json").write_text( + json.dumps(run_meta, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" + ) prate = _pass_rate(counts) history_entry = { @@ -525,7 +537,7 @@ def handle_batch(args) -> int: } history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: - f.write(json.dumps(history_entry, ensure_ascii=False) + "\n") + f.write(json.dumps(history_entry, ensure_ascii=False, sort_keys=True) + "\n") bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 2ca493b..be3089e 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -87,6 +87,11 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--exclude-ids", type=Path, default=None, help="Path to file with ids to exclude (one per line)") batch_p.add_argument("--events", choices=["on", "off"], default="on", help="Enable events.jsonl emission") batch_p.add_argument("--events-file", type=Path, default=None, help="Override events file path") + batch_p.add_argument( + "--fingerprint-verbose", + action="store_true", + help="Include per-file entries in data fingerprint (defaults to counts only)", + ) case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 7d841b8..c1edf42 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -365,7 +365,7 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: "checked_ok": checked_ok, "unchecked_no_assert": unchecked_no_assert, "plan_only": plan_only, - "summary_by_tag": per_tag, + "summary_by_tag": {tag: per_tag[tag] for tag in sorted(per_tag)}, **totals, } if total_times: @@ -660,7 +660,7 @@ def _count_delta(key: str) -> int | float | None: return new_val - base_val return None - delta_keys = {"ok", "mismatch", "failed", "error", "skipped", "unchecked", "plan_only", "total"} + delta_keys = ["total", "ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] count_deltas = {k: _count_delta(k) for k in delta_keys} return { From af2ce2096bcb450647612332cacf62682de2e0e5 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:33:47 +0300 Subject: [PATCH 27/92] Honor fail_on in compare summary --- examples/demo_qa/batch.py | 26 ++++++++++++++++++++------ tests/test_demo_qa_batch.py | 20 +++++++++++++++++++- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 1b5b40e..a25b558 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -221,15 +221,29 @@ def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str lines: list[str] = [] base_counts = compare["base_counts"] # type: 
ignore[index] new_counts = compare["new_counts"] # type: ignore[index] + fail_on = compare.get("fail_on", "bad") # type: ignore[assignment] + require_assert = bool(compare.get("require_assert", False)) + + def _bad_total(counts: dict) -> int: + bad_from_compare = compare.get("base_bad_total") if counts is base_counts else compare.get("new_bad_total") + if isinstance(bad_from_compare, int): + return bad_from_compare + bad_set = bad_statuses(str(fail_on), require_assert) + total = 0 + for status in bad_set: + try: + total += int(counts.get(status, 0) or 0) + except Exception: + continue + return total + + base_bad = _bad_total(base_counts) # type: ignore[arg-type] + new_bad = _bad_total(new_counts) # type: ignore[arg-type] lines.append("# Batch comparison report") lines.append("") lines.append("## Summary") - lines.append( - f"- Base OK: {base_counts.get('ok',0)}, Bad: {base_counts.get('mismatch',0)+base_counts.get('error',0)+base_counts.get('failed',0)}" - ) - lines.append( - f"- New OK: {new_counts.get('ok',0)}, Bad: {new_counts.get('mismatch',0)+new_counts.get('error',0)+new_counts.get('failed',0)}" - ) + lines.append(f"- Base OK: {base_counts.get('ok',0)}, Bad: {base_bad}") + lines.append(f"- New OK: {new_counts.get('ok',0)}, Bad: {new_bad}") base_med = compare.get("base_median") new_med = compare.get("new_median") if base_med is not None and new_med is not None: diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 004548d..cb72890 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -4,7 +4,7 @@ import pytest -from examples.demo_qa.batch import bad_statuses, is_failure +from examples.demo_qa.batch import bad_statuses, is_failure, render_markdown @pytest.mark.parametrize( @@ -17,3 +17,21 @@ def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> assert bad # sanity check for status in statuses: assert is_failure(status, fail_on, require_assert) == (status in bad) + + +def test_render_markdown_uses_fail_policy() -> None: + compare = { + "base_counts": {"ok": 0, "mismatch": 2, "error": 1, "failed": 0}, + "new_counts": {"ok": 1, "mismatch": 0, "error": 0, "failed": 0}, + "base_bad_total": 1, + "new_bad_total": 0, + "fail_on": "error", + "require_assert": False, + "new_fail": [], + "fixed": [], + "still_fail": [], + "all_ids": [], + } + report = render_markdown(compare, None) + assert "- Base OK: 0, Bad: 1" in report + assert "- New OK: 1, Bad: 0" in report From c653240edeb59fc159170db0ed4ec0fdf6a1e3d4 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:48:48 +0300 Subject: [PATCH 28/92] Restore sensitive fingerprints with compact hash --- examples/demo_qa/batch.py | 23 +++++++++++++---------- tests/test_demo_qa_batch.py | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a25b558..a597863 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -89,25 +89,28 @@ def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, o entries: list[dict] = [] total_bytes = 0 files_count = 0 + digest = hashlib.sha256() for path in sorted(data_dir.rglob("*")): if path.is_file(): rel = path.relative_to(data_dir) if rel.parts and rel.parts[0] in {".runs", ".cache"}: continue stat = path.stat() + record = { + "path": str(rel), + "size": stat.st_size, + "mtime": stat.st_mtime, + } + digest.update(json.dumps(record, 
sort_keys=True).encode("utf-8")) files_count += 1 total_bytes += stat.st_size if verbose: - entries.append( - { - "path": str(rel), - "size": stat.st_size, - "mtime": stat.st_mtime, - } - ) - digest_payload = entries if verbose else [{"files_count": files_count, "bytes_total": total_bytes}] - digest = hashlib.sha256(json.dumps(digest_payload, sort_keys=True).encode("utf-8")).hexdigest() - fingerprint: dict[str, object] = {"hash": digest, "files_count": files_count, "bytes_total": total_bytes} + entries.append(record) + fingerprint: dict[str, object] = { + "hash": digest.hexdigest(), + "files_count": files_count, + "bytes_total": total_bytes, + } if verbose: fingerprint["files"] = entries return fingerprint diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index cb72890..44895e5 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -1,10 +1,13 @@ from __future__ import annotations import itertools +import os +import time +from pathlib import Path import pytest -from examples.demo_qa.batch import bad_statuses, is_failure, render_markdown +from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown @pytest.mark.parametrize( @@ -35,3 +38,19 @@ def test_render_markdown_uses_fail_policy() -> None: report = render_markdown(compare, None) assert "- Base OK: 0, Bad: 1" in report assert "- New OK: 1, Bad: 0" in report + + +def test_fingerprint_sensitive_to_file_changes(tmp_path: Path) -> None: + data = tmp_path / "data" + data.mkdir() + target = data / "file.txt" + target.write_text("aaa", encoding="utf-8") + first = _fingerprint_dir(data) + + target.write_text("bbb", encoding="utf-8") + now = time.time() + 1 + os.utime(target, (now, now)) + second = _fingerprint_dir(data) + + assert first["hash"] != second["hash"] + assert first["files_count"] == second["files_count"] == 1 From ac9e0f0f29cfc6f22a3b735690c1e11655c29f29 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 00:10:15 +0300 Subject: [PATCH 29/92] Include new bad cases in diffs --- examples/demo_qa/runner.py | 2 ++ tests/test_demo_qa_runner.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index c1edf42..06d8f7c 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -632,6 +632,8 @@ def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict if base_res is None: new_cases.append(case_id) + if new_bad: + new_fail.append(_entry(case_id, base_res, new_res)) else: if base_res.status != new_res.status: changed_status.append({"id": case_id, "from": base_res.status, "to": new_res.status}) diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 5d61ed5..745a477 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -108,14 +108,25 @@ def test_diff_runs_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), + RunResult( + id="new_bad", + question="", + status="failed", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/newbad", + duration_ms=10, + tags=[], + ), ] diff = diff_runs(baseline, current, fail_on="bad", require_assert=True) - assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad"} + assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad", "new_bad"} assert {row["id"] for row in diff["fixed"]} == {"err_to_ok"} assert {row["id"] for row in 
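
The reworked `_fingerprint_dir` streams one `(path, size, mtime)` record per file into a single SHA-256 digest, so a content change flips the hash without keeping a per-file listing in memory or in the metadata. A self-contained sketch of that pattern follows; the excluded folder names and the sample data are assumptions for illustration.

```python
import hashlib
import json
import os
import time
from pathlib import Path

EXCLUDED_TOP_DIRS = {".runs", ".cache"}  # assumption: the same folders the batch code skips


def fingerprint_dir(data_dir: Path) -> dict[str, object]:
    """Stream (path, size, mtime) records of every data file into one SHA-256 digest."""
    digest = hashlib.sha256()
    files_count = 0
    bytes_total = 0
    for path in sorted(data_dir.rglob("*")):
        if not path.is_file():
            continue
        rel = path.relative_to(data_dir)
        if rel.parts and rel.parts[0] in EXCLUDED_TOP_DIRS:
            continue
        stat = path.stat()
        record = {"path": str(rel), "size": stat.st_size, "mtime": stat.st_mtime}
        digest.update(json.dumps(record, sort_keys=True).encode("utf-8"))
        files_count += 1
        bytes_total += stat.st_size
    return {"hash": digest.hexdigest(), "files_count": files_count, "bytes_total": bytes_total}


if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        (root / "data.csv").write_text("a,b\n1,2\n", encoding="utf-8")
        before = fingerprint_dir(root)
        # rewrite the file and bump mtime so both content and metadata change
        (root / "data.csv").write_text("a,b\n3,4\n", encoding="utf-8")
        later = time.time() + 2
        os.utime(root / "data.csv", (later, later))
        after = fingerprint_dir(root)
        assert before["hash"] != after["hash"]
        assert before["files_count"] == after["files_count"] == 1
```
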
diff["still_fail"]} == {"still_bad"} - assert diff["new_cases"] == ["new_ok"] + assert diff["new_cases"] == ["new_bad", "new_ok"] def test_summarize_counts_checked_and_unchecked() -> None: From 26de17fd9a2e978e8c61fc248c870b7968a0c8e3 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 00:41:07 +0300 Subject: [PATCH 30/92] Switch demo_qa schema output to JSON (#72) --- README_demo_qa.md | 8 ++++---- examples/demo_qa/data_gen.py | 12 +++--------- tests/test_demo_qa_schema_io.py | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 tests/test_demo_qa_schema_io.py diff --git a/README_demo_qa.md b/README_demo_qa.md index 86f9e67..9698aa6 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -8,7 +8,7 @@ python -m examples.demo_qa.cli gen --out demo_data --rows 1000 --seed 42 ``` -Команда создаст четыре CSV, `schema.yaml`, `meta.json` и `stats.json`. +Команда создаст четыре CSV, `schema.json`, `meta.json` и `stats.json`. ## Конфигурация LLM (pydantic-settings) @@ -47,7 +47,7 @@ pip install -r examples/demo_qa/requirements.txt `base_url` (формат `http://host:port/v1`), модели и температуры. 2. Запустите чат с указанием конфига: ```bash -python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.yaml --config path/to/demo_qa.toml +python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.json --config path/to/demo_qa.toml ``` Флаг `--enable-semantic` строит семантический индекс, если передана модель эмбеддингов. @@ -59,7 +59,7 @@ python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.y ```bash python -m examples.demo_qa.cli batch \ --data demo_data \ - --schema demo_data/schema.yaml \ + --schema demo_data/schema.json \ --cases cases.jsonl \ --out results.jsonl ``` @@ -76,7 +76,7 @@ python -m examples.demo_qa.cli batch \ любым ключом доступа, если прокси не проверяет его. Запуск: ```bash -python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.yaml --config path/to/demo_qa.toml +python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.json --config path/to/demo_qa.toml ``` Большинство OpenAI-совместимых сервисов ожидают конечную точку `/v1` в `base_url`. 
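
Switching the generated schema to JSON lets `dataclasses.asdict` replace the hand-rolled `_to_dict` walker in `save_schema`. A minimal sketch of the round-trip is shown below, using simplified stand-in dataclasses rather than the project's real `SchemaConfig`.

```python
# Sketch of the dataclass -> JSON round-trip behind schema.json.
# Entity/Schema here are illustrative stand-ins, not the project's SchemaConfig.
import json
from dataclasses import asdict, dataclass, field
from pathlib import Path


@dataclass
class Entity:
    name: str
    columns: list[str] = field(default_factory=list)


@dataclass
class Schema:
    name: str
    entities: list[Entity] = field(default_factory=list)


def save_schema(schema: Schema, path: Path) -> None:
    # asdict() recurses into nested dataclasses, so no custom _to_dict helper is needed
    path.write_text(json.dumps(asdict(schema), ensure_ascii=False, indent=2), encoding="utf-8")


def load_schema(path: Path) -> Schema:
    payload = json.loads(path.read_text(encoding="utf-8"))
    return Schema(
        name=payload["name"],
        entities=[Entity(**entity) for entity in payload["entities"]],
    )


if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        target = Path(tmp) / "schema.json"
        original = Schema(name="demo_qa", entities=[Entity(name="orders", columns=["order_id"])])
        save_schema(original, target)
        assert load_schema(target) == original
```
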
diff --git a/examples/demo_qa/data_gen.py b/examples/demo_qa/data_gen.py index 8c8ab32..8fc21dd 100644 --- a/examples/demo_qa/data_gen.py +++ b/examples/demo_qa/data_gen.py @@ -256,15 +256,9 @@ def save_dataset(dataset: GeneratedDataset, out_dir: Path) -> None: def save_schema(schema: SchemaConfig, path: Path) -> None: - def _to_dict(obj): - if isinstance(obj, list): - return [_to_dict(o) for o in obj] - if hasattr(obj, "__dict__"): - return {k: _to_dict(v) for k, v in obj.__dict__.items()} - return obj - + schema_dict = asdict(schema) with path.open("w", encoding="utf-8") as f: - json.dump(_to_dict(schema), f, ensure_ascii=False, indent=2) + json.dump(schema_dict, f, ensure_ascii=False, indent=2) @dataclass @@ -299,7 +293,7 @@ def generate_and_save(out_dir: Path, *, rows: int = 1000, seed: int | None = Non validate_dataset(dataset, rows) save_dataset(dataset, out_dir) schema = default_schema(enable_semantic=enable_semantic) - save_schema(schema, out_dir / "schema.yaml") + save_schema(schema, out_dir / "schema.json") meta = MetaInfo(seed=seed, rows=rows, created_at=datetime.utcnow().isoformat()) write_meta(out_dir / "meta.json", meta) diff --git a/tests/test_demo_qa_schema_io.py b/tests/test_demo_qa_schema_io.py new file mode 100644 index 0000000..828229f --- /dev/null +++ b/tests/test_demo_qa_schema_io.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from pathlib import Path + +from examples.demo_qa.data_gen import generate_and_save +from examples.demo_qa.schema_io import load_schema + + +def test_generate_and_load_schema_json(tmp_path: Path) -> None: + out_dir = tmp_path / "demo_data" + generate_and_save(out_dir, rows=5, seed=123) + + schema_path = out_dir / "schema.json" + assert schema_path.exists() + + schema = load_schema(schema_path) + + assert schema.name == "demo_qa" + assert {e.name for e in schema.entities} >= {"customers", "products", "orders", "order_items"} + assert {r.name for r in schema.relations} >= {"orders_to_customers", "items_to_orders", "items_to_products"} From 804c65973a431509a1c429cc71055f24e7214170 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:09:03 +0300 Subject: [PATCH 31/92] Clean meta and stabilize results serialization --- examples/demo_qa/batch.py | 49 ++++++++++++++----------------------- examples/demo_qa/runner.py | 17 ++++++++++++- tests/test_demo_qa_batch.py | 40 +++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a597863..c03711f 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -3,7 +3,6 @@ import datetime import hashlib import json -import platform import subprocess import sys import uuid @@ -30,16 +29,21 @@ from .settings import load_settings +def _dump_json(path: Path, obj: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(obj, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") + + def write_results(out_path: Path, results: Iterable[RunResult]) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) with out_path.open("w", encoding="utf-8") as f: for res in results: - f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") + f.write(json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n") def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") - 
summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") + _dump_json(summary_path, summary) return summary_path @@ -75,16 +79,6 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: return ids -def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: - llm_settings = settings.llm - return { - "base_url": llm_settings.base_url or "https://api.openai.com/v1", - "plan_model": llm_settings.plan_model, - "synth_model": llm_settings.synth_model, - "cases_hash": _hash_file(cases_path), - } - - def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: entries: list[dict] = [] total_bytes = 0 @@ -477,7 +471,6 @@ def handle_batch(args) -> int: "counts": counts, "summary_by_tag": counts.get("summary_by_tag"), "exit_code": exit_code, - "config_fingerprint": build_config_fingerprint(settings, args.cases), "results_path": str(results_path), "require_assert": args.require_assert, "fail_on": args.fail_on, @@ -489,9 +482,7 @@ def handle_batch(args) -> int: summary_by_tag = summary.get("summary_by_tag") if summary_by_tag: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") - summary_by_tag_path.write_text( - json.dumps(summary_by_tag, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" - ) + _dump_json(summary_by_tag_path, summary_by_tag) latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" @@ -507,13 +498,15 @@ def handle_batch(args) -> int: run_meta = { "run_id": run_id, "timestamp": started_at.isoformat() + "Z", - "cases_path": str(args.cases), - "cases_hash": cases_hash, - "config_path": str(args.config) if args.config else None, - "config_hash": config_hash, - "schema_path": str(args.schema), - "schema_hash": schema_hash, - "data_dir": str(args.data), + "inputs": { + "cases_path": str(args.cases), + "cases_hash": cases_hash, + "config_path": str(args.config) if args.config else None, + "config_hash": config_hash, + "schema_path": str(args.schema), + "schema_hash": schema_hash, + "data_dir": str(args.data), + }, "data_fingerprint": data_fingerprint, "llm": { "plan_model": llm_settings.plan_model, @@ -523,17 +516,12 @@ def handle_batch(args) -> int: "base_url": llm_settings.base_url or "https://api.openai.com/v1", }, "enable_semantic": args.enable_semantic, - "embedding_model": None, "git_sha": _git_sha(), - "python_version": sys.version, - "platform": platform.platform(), "results_path": str(results_path), "summary_path": str(summary_path), "run_dir": str(run_folder), } - (run_folder / "run_meta.json").write_text( - json.dumps(run_meta, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" - ) + _dump_json(run_folder / "run_meta.json", run_meta) prate = _pass_rate(counts) history_entry = { @@ -771,5 +759,4 @@ def handle_compare(args) -> int: "write_summary", "_load_latest_run", "_find_case_artifact", - "build_config_fingerprint", ] diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 06d8f7c..c49ee15 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -648,6 +648,12 @@ def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict elif base_bad and new_bad: still_fail.append(_entry(case_id, base_res, new_res)) + new_fail = sorted(new_fail, key=lambda r: r.get("id", "")) + fixed = sorted(fixed, key=lambda r: r.get("id", "")) + still_fail = sorted(still_fail, key=lambda r: r.get("id", "")) + changed_status = sorted(changed_status, 
key=lambda r: r.get("id", "")) + new_cases = sorted(new_cases) + base_counts = summarize(base_by_id.values()) new_counts = summarize(new_by_id.values()) base_med = _median_duration(base_by_id) @@ -662,7 +668,16 @@ def _count_delta(key: str) -> int | float | None: return new_val - base_val return None - delta_keys = ["total", "ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] + delta_keys = ( + "total", + "ok", + "mismatch", + "failed", + "error", + "skipped", + "unchecked", + "plan_only", + ) count_deltas = {k: _count_delta(k) for k in delta_keys} return { diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 44895e5..40ee32d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -1,13 +1,15 @@ from __future__ import annotations import itertools +import json import os import time from pathlib import Path import pytest -from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown +from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +from examples.demo_qa.runner import RunResult, diff_runs @pytest.mark.parametrize( @@ -54,3 +56,39 @@ def test_fingerprint_sensitive_to_file_changes(tmp_path: Path) -> None: assert first["hash"] != second["hash"] assert first["files_count"] == second["files_count"] == 1 + assert "files" not in first + + +def _mk_result(case_id: str, status: str) -> RunResult: + return RunResult( + id=case_id, + question="q", + status=status, + checked=True, + reason=None, + details=None, + artifacts_dir=f"/tmp/{case_id}", + duration_ms=1000, + tags=[], + ) + + +def test_compare_is_deterministic() -> None: + base_results = [_mk_result("b", "ok"), _mk_result("a", "ok")] + new_results = [_mk_result("a", "failed"), _mk_result("b", "ok")] + + first = diff_runs(base_results, new_results, fail_on="bad", require_assert=False) + second = diff_runs(list(reversed(base_results)), list(reversed(new_results)), fail_on="bad", require_assert=False) + + assert json.dumps(first, sort_keys=True) == json.dumps(second, sort_keys=True) + + +def test_write_results_is_deterministic(tmp_path: Path) -> None: + out = tmp_path / "results.jsonl" + res = _mk_result("a", "ok") + + write_results(out, [res]) + + line = out.read_text(encoding="utf-8").strip() + expected = json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + assert line == expected From 31cef7cf98ab96152106ff8fa29b5201c2ce92fd Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:14:39 +0300 Subject: [PATCH 32/92] Refine demo QA JSON handling and lint hygiene --- examples/demo_qa/batch.py | 16 ++++++---------- examples/demo_qa/cli.py | 2 +- examples/demo_qa/runner.py | 2 +- examples/demo_qa/settings.py | 2 +- examples/demo_qa/utils.py | 12 ++++++++++++ 5 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 examples/demo_qa/utils.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index c03711f..5a65eec 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -27,11 +27,7 @@ summarize, ) from .settings import load_settings - - -def _dump_json(path: Path, obj: object) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(obj, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") +from .utils import dump_json def write_results(out_path: Path, results: Iterable[RunResult]) -> None: @@ 
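
The determinism changes above boil down to byte-stable JSONL lines (`sort_keys` plus fixed separators) and a duplicate-id guard when results are loaded back. A compact sketch of both follows, with a simplified record shape standing in for `RunResult.to_json()`.

```python
# Sketch of deterministic JSONL round-tripping with a duplicate-id guard.
import json
from pathlib import Path
from typing import Iterable


def write_results(out_path: Path, results: Iterable[dict]) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for record in results:
            # sort_keys + fixed separators make each line byte-stable across runs
            f.write(json.dumps(record, ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n")


def load_results(path: Path) -> dict[str, dict]:
    results: dict[str, dict] = {}
    for lineno, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
        if not line.strip():
            continue
        record = json.loads(line)
        case_id = record["id"]
        if case_id in results:
            raise ValueError(f"Duplicate result id {case_id!r} on line {lineno}")
        results[case_id] = record
    return results


if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        out = Path(tmp) / "results.jsonl"
        write_results(out, [{"id": "a", "status": "ok"}, {"id": "b", "status": "failed"}])
        assert set(load_results(out)) == {"a", "b"}
```
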
-43,7 +39,7 @@ def write_results(out_path: Path, results: Iterable[RunResult]) -> None: def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") - _dump_json(summary_path, summary) + dump_json(summary_path, summary) return summary_path @@ -257,7 +253,7 @@ def table(title: str, rows: list[dict]) -> None: lines.append("|---|---|---|---|") for row in sorted(rows, key=lambda r: r.get("id", "")): artifacts = row.get("artifacts", {}) - links = ", ".join(f"[{k}]({v})" for k, v in artifacts.items()) + links = ", ".join(f"[{k}]({v})" for k, v in sorted(artifacts.items())) lines.append( f"| {row['id']} | {row['from']} → {row['to']} | {row.get('reason','')} | {links or ''} |" ) @@ -292,7 +288,7 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: failure = ET.SubElement(tc, "failure", message=msg) artifacts = row.get("artifacts", {}) if artifacts: - failure.text = "\n".join(f"{k}: {v}" for k, v in artifacts.items()) + failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) for row in sorted(fixed, key=lambda r: r.get("id", "")): ET.SubElement(suite, "testcase", name=row["id"]) @@ -482,7 +478,7 @@ def handle_batch(args) -> int: summary_by_tag = summary.get("summary_by_tag") if summary_by_tag: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") - _dump_json(summary_by_tag_path, summary_by_tag) + dump_json(summary_by_tag_path, summary_by_tag) latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" @@ -521,7 +517,7 @@ def handle_batch(args) -> int: "summary_path": str(summary_path), "run_dir": str(run_folder), } - _dump_json(run_folder / "run_meta.json", run_meta) + dump_json(run_folder / "run_meta.json", run_meta) prate = _pass_rate(counts) history_entry = { diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index be3089e..dbb279d 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -16,7 +16,7 @@ def ensure_repo_imports() -> None: ensure_repo_imports() -from .batch import ( +from .batch import ( # noqa: E402 handle_batch, handle_case_open, handle_case_run, diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index c49ee15..b0e199c 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -8,7 +8,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List, Mapping, Optional +from typing import Dict, Iterable, List, Mapping from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 396d368..a7da694 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -122,7 +122,7 @@ def load_settings( DemoQASettings._toml_path = resolved try: settings = DemoQASettings(**(overrides or {})) - except ValidationError as exc: + except ValidationError: DemoQASettings._toml_path = None raise DemoQASettings._toml_path = None diff --git a/examples/demo_qa/utils.py b/examples/demo_qa/utils.py new file mode 100644 index 0000000..55da4ab --- /dev/null +++ b/examples/demo_qa/utils.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +import json +from pathlib import Path + + +def dump_json(path: Path, obj: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(obj, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") + + +__all__ = ["dump_json"] From 
12afc2a6b95c4a9b12b12e9dae5933977044556d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:39:26 +0300 Subject: [PATCH 33/92] Harden results loading and emit batch completion event --- README_demo_qa.md | 1 + examples/demo_qa/batch.py | 12 ++++++++++ examples/demo_qa/chat_repl.py | 17 +++++--------- examples/demo_qa/requirements.txt | 3 ++- examples/demo_qa/runner.py | 38 ++++++++++++++++++++++--------- src/pydantic_settings/__init__.py | 14 ++++++++---- 6 files changed, 57 insertions(+), 28 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index 9698aa6..2acb56f 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -32,6 +32,7 @@ export DEMO_QA_LLM__BASE_URL=http://localhost:8000/v1 ``` ### Зависимости демо +* Требуется Python 3.11+ (используется стандартный `tomllib`). ``` pip install -e .[demo] # или diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 5a65eec..a87eabb 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -480,6 +480,18 @@ def handle_batch(args) -> int: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") dump_json(summary_by_tag_path, summary_by_tag) + if event_logger: + event_logger.emit( + { + "type": "run_finished", + "counts": counts, + "exit_code": exit_code, + "duration_ms": duration_ms, + "run_dir": str(run_folder), + "results_path": str(results_path), + } + ) + latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" latest_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 65bb0f7..0990ee3 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -1,14 +1,12 @@ from __future__ import annotations -import datetime +import json +import readline import sys import uuid from pathlib import Path from typing import Optional, Sequence -import readline -import json - from .provider_factory import build_provider from .runner import Case, EventLogger, RunArtifacts, build_agent, run_one, save_artifacts @@ -101,16 +99,12 @@ def start_repl( continue run_id = uuid.uuid4().hex[:8] - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_dir = runs_root / f"{timestamp}_{run_id}" - events_path = run_dir / "events.jsonl" - event_logger = EventLogger(events_path, run_id) - print(f"Events: {events_path}") + event_logger = EventLogger(path=None, run_id=run_id) artifacts: RunArtifacts | None = None try: case = Case(id=run_id, question=line, tags=[]) - result = run_one(case, runner, run_dir, plan_only=False, event_logger=event_logger) + result = run_one(case, runner, runs_root, plan_only=False, event_logger=event_logger) plan_obj = _load_json(Path(result.artifacts_dir) / "plan.json") ctx_obj = _load_json(Path(result.artifacts_dir) / "context.json") or {} artifacts = RunArtifacts( @@ -128,8 +122,9 @@ def start_repl( print("--- PLAN ---") print(json.dumps(artifacts.plan, ensure_ascii=False, indent=2)) print(result.answer or "") + print(f"Events: {Path(result.artifacts_dir) / 'events.jsonl'}") except Exception as exc: # pragma: no cover - REPL resilience - error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) + error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=runs_root, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts save_artifacts(error_artifacts) diff --git 
a/examples/demo_qa/requirements.txt b/examples/demo_qa/requirements.txt index 2c5bf86..098e3c1 100644 --- a/examples/demo_qa/requirements.txt +++ b/examples/demo_qa/requirements.txt @@ -1,3 +1,4 @@ +# Requires Python >=3.11 (relies on stdlib tomllib) pydantic-settings>=2.2 python-dotenv>=1.0 -openai \ No newline at end of file +openai diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index b0e199c..90ca626 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -8,7 +8,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List, Mapping +from typing import Dict, Iterable, List, Mapping, TypedDict from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile @@ -376,18 +376,10 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: summary["median_total_s"] = None for tag, bucket in per_tag.items(): - times: List[float] = [] - # no per-tag timing collected; reuse overall average for simplicity - if times: - bucket["avg_total_s"] = statistics.fmean(times) - bucket["median_total_s"] = statistics.median(times) - else: - bucket["avg_total_s"] = None - bucket["median_total_s"] = None total = bucket.get("total", 0) checked_total_tag = (bucket.get("ok", 0) or 0) + (bucket.get("mismatch", 0) or 0) + ( bucket.get("failed", 0) or 0 - ) + ) + (bucket.get("error", 0) or 0) bucket["checked_total"] = checked_total_tag non_skipped = total - (bucket.get("skipped", 0) or 0) if non_skipped > 0: @@ -527,6 +519,8 @@ def load_results(path: Path) -> Dict[str, RunResult]: except json.JSONDecodeError as exc: raise ValueError(f"Invalid result JSON on line {lineno}: {exc}") from exc result = _run_result_from_payload(payload) + if result.id in results: + raise ValueError(f"Duplicate result id {result.id!r} on line {lineno}") results[result.id] = result return results @@ -599,7 +593,7 @@ def diff_runs( *, fail_on: str, require_assert: bool, -) -> Dict[str, object]: +) -> DiffReport: base_by_id = {res.id: res for res in base_results} new_by_id = {res.id: res for res in new_results} all_ids = sorted(new_by_id.keys()) @@ -735,6 +729,28 @@ def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": return EventLogger(path, self.run_id) +class DiffReport(TypedDict): + all_ids: list[str] + new_fail: list[dict[str, object]] + fixed: list[dict[str, object]] + still_fail: list[dict[str, object]] + changed_status: list[dict[str, str | None]] + new_cases: list[str] + base_counts: Dict[str, object] + new_counts: Dict[str, object] + counts_delta: Dict[str, int | float | None] + base_median: float | None + new_median: float | None + base_avg: float | None + new_avg: float | None + median_delta: float | None + avg_delta: float | None + base_bad_total: int + new_bad_total: int + fail_on: str + require_assert: bool + + __all__ = [ "AgentRunner", "Case", diff --git a/src/pydantic_settings/__init__.py b/src/pydantic_settings/__init__.py index bf9fdae..3f759c5 100644 --- a/src/pydantic_settings/__init__.py +++ b/src/pydantic_settings/__init__.py @@ -1,14 +1,18 @@ from __future__ import annotations import os +import sys +from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping, TypeAlias + +if sys.version_info < (3, 11): # pragma: no cover - demo dependency guard + raise ImportError("pydantic_settings requires Python 3.11+ (standard tomllib).") + import tomllib -from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping -from pydantic import BaseModel 
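
The `DiffReport` TypedDict introduced above gives the dict returned by `diff_runs` a checkable shape without changing its runtime type. A minimal sketch of the pattern, trimmed to a few illustrative keys:

```python
# Minimal TypedDict sketch; the fields are a trimmed, illustrative subset of DiffReport.
from typing import TypedDict


class MiniDiffReport(TypedDict):
    new_fail: list[dict[str, object]]
    fixed: list[dict[str, object]]
    still_fail: list[dict[str, object]]
    fail_on: str
    require_assert: bool


def empty_report(fail_on: str, require_assert: bool) -> MiniDiffReport:
    # Type checkers verify the keys and value types of this literal; at runtime it is a plain dict
    return {
        "new_fail": [],
        "fixed": [],
        "still_fail": [],
        "fail_on": fail_on,
        "require_assert": require_assert,
    }


if __name__ == "__main__":
    report = empty_report("bad", require_assert=False)
    assert report["fail_on"] == "bad"
```
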
+from pydantic import BaseModel, ConfigDict -def SettingsConfigDict(**kwargs: Any) -> Dict[str, Any]: - return dict(**kwargs) +SettingsConfigDict: TypeAlias = ConfigDict def _deep_update(base: Dict[str, Any], updates: Mapping[str, Any]) -> Dict[str, Any]: @@ -35,7 +39,7 @@ def __call__(self) -> Dict[str, Any]: class BaseSettings(BaseModel): - model_config: ClassVar[SettingsConfigDict] = {} + model_config: ClassVar[SettingsConfigDict] = ConfigDict() def __init__(self, **values: Any) -> None: sources = self.settings_customise_sources( From 3c838a0c211a9d0c7dec380051ec988b57954155 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 01:45:03 +0300 Subject: [PATCH 34/92] =?UTF-8?q?Version=20increase:=20=D0=B4=D0=BE=D0=B1?= =?UTF-8?q?=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=20=D1=84=D1=83=D0=BD=D0=BA=D1=86?= =?UTF-8?q?=D0=B8=D0=BE=D0=BD=D0=B0=D0=BB=20=D0=BF=D0=BE=D1=82=D0=BE=D0=BA?= =?UTF-8?q?=D0=BE=D0=B2=D0=BE=D0=B3=D0=BE=20=D0=B7=D0=B0=D0=BF=D1=83=D1=81?= =?UTF-8?q?=D0=BA=D0=B0=20=D1=82=D0=B5=D1=81=D1=82=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70ebb36..a401b48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [project] name = "fetchgraph" -version = "0.1.1" +version = "0.1.2" description = "Graph-like planning → context fetching → synthesis agent (library-style)." readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.11" license = "MIT" authors = [{name = "Alexander Onischenko"}] keywords = ["agents", "context", "langgraph", "planning", "llm"] From 0c0a51001c6915db3dfe644ee3978719cae52cef Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:52:32 +0300 Subject: [PATCH 35/92] Remove pydantic-settings shim --- examples/demo_qa/settings.py | 18 +++++- src/pydantic_settings/__init__.py | 102 ------------------------------ 2 files changed, 17 insertions(+), 103 deletions(-) delete mode 100644 src/pydantic_settings/__init__.py diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index a7da694..08478e0 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -5,10 +5,11 @@ from typing import Any, ClassVar, Dict from urllib.parse import urlparse +import tomllib from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator try: - from pydantic_settings import BaseSettings, SettingsConfigDict, TomlConfigSettingsSource + from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. 
" @@ -16,6 +17,21 @@ ) from exc +class TomlConfigSettingsSource(PydanticBaseSettingsSource): + def __init__(self, settings_cls: type[BaseSettings], path: Path | None): + super().__init__(settings_cls) + self._path = path + + def __call__(self) -> Dict[str, Any]: + if not self._path: + return {} + try: + with self._path.open("rb") as toml_file: + return tomllib.load(toml_file) + except FileNotFoundError: + return {} + + class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) diff --git a/src/pydantic_settings/__init__.py b/src/pydantic_settings/__init__.py deleted file mode 100644 index 3f759c5..0000000 --- a/src/pydantic_settings/__init__.py +++ /dev/null @@ -1,102 +0,0 @@ -from __future__ import annotations - -import os -import sys -from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping, TypeAlias - -if sys.version_info < (3, 11): # pragma: no cover - demo dependency guard - raise ImportError("pydantic_settings requires Python 3.11+ (standard tomllib).") - -import tomllib - -from pydantic import BaseModel, ConfigDict - - -SettingsConfigDict: TypeAlias = ConfigDict - - -def _deep_update(base: Dict[str, Any], updates: Mapping[str, Any]) -> Dict[str, Any]: - for key, value in updates.items(): - if isinstance(value, Mapping) and isinstance(base.get(key), dict): - base[key] = _deep_update(base[key], value) - else: - base[key] = value - return base - - -class TomlConfigSettingsSource: - def __init__(self, settings_cls: type[BaseModel], path: os.PathLike | str | None): - self._path = path - - def __call__(self) -> Dict[str, Any]: - if not self._path: - return {} - try: - with open(self._path, "rb") as toml_file: - return tomllib.load(toml_file) - except FileNotFoundError: - return {} - - -class BaseSettings(BaseModel): - model_config: ClassVar[SettingsConfigDict] = ConfigDict() - - def __init__(self, **values: Any) -> None: - sources = self.settings_customise_sources( - self.__class__, - self._build_init_settings(values), - self._build_env_settings(), - self._build_dotenv_settings(), - self._build_file_secret_settings(), - ) - merged: Dict[str, Any] = {} - for source in reversed(tuple(sources)): - merged = _deep_update(merged, source() or {}) - super().__init__(**merged) - - @classmethod - def settings_customise_sources( - cls, - settings_cls: type[BaseModel], - init_settings: Callable[[], Mapping[str, Any]], - env_settings: Callable[[], Mapping[str, Any]], - dotenv_settings: Callable[[], Mapping[str, Any]], - file_secret_settings: Callable[[], Mapping[str, Any]], - ) -> Iterable[Callable[[], Mapping[str, Any]]]: - return (init_settings, env_settings, dotenv_settings, file_secret_settings) - - @staticmethod - def _build_init_settings(values: Mapping[str, Any]) -> Callable[[], Mapping[str, Any]]: - return lambda: dict(values) - - @classmethod - def _build_env_settings(cls) -> Callable[[], Mapping[str, Any]]: - prefix = cls.model_config.get("env_prefix", "") or "" - delimiter = cls.model_config.get("env_nested_delimiter", "__") or "__" - - def source() -> Dict[str, Any]: - settings: Dict[str, Any] = {} - for key, value in os.environ.items(): - if not key.startswith(prefix): - continue - raw_key = key[len(prefix) :] - parts = raw_key.split(delimiter) if delimiter else [raw_key] - cls._insert_nested(settings, [part.lower() for part in parts], value) - return settings - - return source - - @classmethod - def _build_dotenv_settings(cls) -> Callable[[], Mapping[str, Any]]: - return lambda: {} - - @classmethod - def 
_build_file_secret_settings(cls) -> Callable[[], Mapping[str, Any]]: - return lambda: {} - - @staticmethod - def _insert_nested(target: Dict[str, Any], parts: list[str], value: Any) -> None: - current = target - for part in parts[:-1]: - current = current.setdefault(part, {}) - current[parts[-1]] = value From 5aceb938a1010f6b0a8a54fc47fbb7c9b09722e0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 02:26:18 +0300 Subject: [PATCH 36/92] Align batch reporting with policy failures --- examples/demo_qa/batch.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a87eabb..7abdeb4 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -454,8 +454,9 @@ def handle_batch(args) -> int: diff["baseline_path"] = str(baseline_path) diff_block = diff - failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) - exit_code = 1 if failure_count else 0 + policy_bad = bad_statuses(args.fail_on, args.require_assert) + bad_count = sum(int(counts.get(status, 0) or 0) for status in policy_bad) + exit_code = 1 if bad_count else 0 ended_at = datetime.datetime.utcnow() duration_ms = int((ended_at - started_at).total_seconds() * 1000) @@ -547,20 +548,23 @@ def handle_batch(args) -> int: "median_total_s": counts.get("median_total_s"), "run_dir": str(run_folder), "results_path": str(results_path), + "failed": counts.get("failed", 0), + "unchecked": counts.get("unchecked", 0), + "plan_only": counts.get("plan_only", 0), + "fail_on": args.fail_on, + "require_assert": args.require_assert, + "fail_count": bad_count, } history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: f.write(json.dumps(history_entry, ensure_ascii=False, sort_keys=True) + "\n") - bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) plan_only = counts.get("plan_only", 0) - if args.require_assert or args.fail_on in {"unchecked", "any"}: - bad_count += unchecked + plan_only summary_line = ( f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked(no-assert): {unchecked} | " - f"Plan-only: {plan_only} | BAD: {bad_count} | Skipped: {counts.get('skipped', 0)}" + f"Plan-only: {plan_only} | FAIL(policy): {bad_count} | Skipped: {counts.get('skipped', 0)}" ) if args.quiet: @@ -697,7 +701,10 @@ def _print_stats(entries: list[dict]) -> None: if not entries: print("No history entries found.") return - header = f"{'run_id':<10} {'ok':>4} {'mis':>4} {'err':>4} {'skip':>5} {'pass%':>7} {'median_s':>10} {'Δpass':>8} {'Δmedian':>9}" + header = ( + f"{'run_id':<10} {'ok':>4} {'mis':>4} {'fail':>4} {'err':>4} {'skip':>5} " + f"{'pass%':>7} {'median_s':>10} {'Δpass':>8} {'Δmedian':>9} {'policy':>8} {'reqA':>5}" + ) print(header) prev = None for entry in entries: @@ -716,8 +723,10 @@ def _print_stats(entries: list[dict]) -> None: dm = f"{delta_median:+.2f}" if delta_median is not None else "n/a" print( f"{entry.get('run_id',''):<10} " - f"{entry.get('ok',0):>4} {entry.get('mismatch',0):>4} {entry.get('error',0):>4} {entry.get('skipped',0):>5} " - f"{pr_display:>7} {median_display:>10} {dp:>8} {dm:>9}" + f"{entry.get('ok',0):>4} {entry.get('mismatch',0):>4} {entry.get('failed',0):>4} " + f"{entry.get('error',0):>4} 
{entry.get('skipped',0):>5} " + f"{pr_display:>7} {median_display:>10} {dp:>8} {dm:>9} " + f"{entry.get('fail_on',''):>8} {str(entry.get('require_assert', False)):>5}" ) prev = entry From 92a297c0770836c4dd1410ea9fdc32c59f9033a3 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 02:48:43 +0300 Subject: [PATCH 37/92] fixing tests --- examples/demo_qa/requirements.txt | 2 +- examples/demo_qa/settings.py | 22 +++------------------- tests/test_demo_qa_settings.py | 4 ---- tests/test_demo_qa_settings_sources.py | 2 +- 4 files changed, 5 insertions(+), 25 deletions(-) diff --git a/examples/demo_qa/requirements.txt b/examples/demo_qa/requirements.txt index 098e3c1..2e908b2 100644 --- a/examples/demo_qa/requirements.txt +++ b/examples/demo_qa/requirements.txt @@ -1,4 +1,4 @@ # Requires Python >=3.11 (relies on stdlib tomllib) -pydantic-settings>=2.2 +pydantic-settings>=2.12 python-dotenv>=1.0 openai diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 08478e0..a7ea9fd 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -5,33 +5,17 @@ from typing import Any, ClassVar, Dict from urllib.parse import urlparse -import tomllib from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator +from pydantic_settings.sources.providers.toml import TomlConfigSettingsSource try: - from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict + from pydantic_settings import BaseSettings, SettingsConfigDict except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. " "Install demo extras via `pip install -e .[demo]` or `pip install -r examples/demo_qa/requirements.txt`." 
) from exc - -class TomlConfigSettingsSource(PydanticBaseSettingsSource): - def __init__(self, settings_cls: type[BaseSettings], path: Path | None): - super().__init__(settings_cls) - self._path = path - - def __call__(self) -> Dict[str, Any]: - if not self._path: - return {} - try: - with self._path.open("rb") as toml_file: - return tomllib.load(toml_file) - except FileNotFoundError: - return {} - - class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) @@ -98,7 +82,7 @@ def settings_customise_sources( ): sources = [init_settings, env_settings, dotenv_settings] if cls._toml_path: - sources.append(TomlConfigSettingsSource(settings_cls, cls._toml_path)) + sources.append(TomlConfigSettingsSource(settings_cls, toml_file=cls._toml_path)) sources.append(file_secret_settings) return tuple(sources) diff --git a/tests/test_demo_qa_settings.py b/tests/test_demo_qa_settings.py index 6b25e4f..16a1f30 100644 --- a/tests/test_demo_qa_settings.py +++ b/tests/test_demo_qa_settings.py @@ -6,10 +6,6 @@ import pytest -ROOT = Path(__file__).resolve().parents[1] -if str(ROOT) not in sys.path: - sys.path.insert(0, str(ROOT)) - from examples.demo_qa.llm.factory import build_llm from examples.demo_qa.llm.openai_adapter import OpenAILLM from examples.demo_qa.settings import load_settings diff --git a/tests/test_demo_qa_settings_sources.py b/tests/test_demo_qa_settings_sources.py index 97fef3a..f828329 100644 --- a/tests/test_demo_qa_settings_sources.py +++ b/tests/test_demo_qa_settings_sources.py @@ -4,7 +4,7 @@ import pytest -from examples.demo_qa.settings import DemoQASettings, load_settings, resolve_config_path +from examples.demo_qa.settings import load_settings, resolve_config_path def write_toml(path: Path, content: str) -> None: From 66c6dabeecfe8874dfcbca8790d25ebcc6989b56 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 10:26:54 +0300 Subject: [PATCH 38/92] =?UTF-8?q?=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D1=8B=20=D1=82=D1=80=D0=B5=D0=B1=D0=BE=D0=B2=D0=B0?= =?UTF-8?q?=D0=BD=D0=B8=D0=BD=D1=8F=20=D0=BA=20pydantic=20settings=20?= =?UTF-8?q?=D0=B8=D0=B7-=D0=B7=D0=B0=20toml=20=D0=B2=20demo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a401b48..92cf687 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,9 @@ dev = [ "python-dotenv>=1.0", ] demo = [ - "pydantic-settings>=2.2", + "pydantic-settings>=2.12", "python-dotenv>=1.0", + "openai" ] [project.urls] From 34d2c56335148a04e9c949f6abfa28d5cd0020e5 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 10:38:23 +0300 Subject: [PATCH 39/92] =?UTF-8?q?=D1=84=D0=B8=D0=BA=D1=81=20=D0=BF=D1=83?= =?UTF-8?q?=D1=82=D0=B5=D0=B9=20=D0=B8=D0=BC=D0=BF=D0=BE=D1=80=D1=82=D0=B0?= =?UTF-8?q?=20=D0=BD=D0=B0=20=D0=BF=D1=83=D0=B1=D0=BB=D0=B8=D1=87=D0=BD?= =?UTF-8?q?=D1=8B=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/settings.py | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index a7ea9fd..064e179 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -6,10 +6,10 @@ from urllib.parse import urlparse from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator -from 
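
With the shim removed, the settings module plugs the library's `TomlConfigSettingsSource` into `settings_customise_sources`. The sketch below shows that wiring under stated assumptions: the env prefix, field names, and `demo_qa.toml` path are illustrative, and it presumes a pydantic-settings release that ships `TomlConfigSettingsSource` with a `toml_file` argument (the demo pins >= 2.12).

```python
from pathlib import Path

from pydantic import BaseModel, Field
from pydantic_settings import (
    BaseSettings,
    PydanticBaseSettingsSource,
    SettingsConfigDict,
    TomlConfigSettingsSource,
)


class LLMSection(BaseModel):
    base_url: str | None = None
    plan_model: str | None = None


class DemoSettings(BaseSettings):
    # env prefix and nested delimiter are illustrative assumptions
    model_config = SettingsConfigDict(env_prefix="DEMO_QA_", env_nested_delimiter="__")

    llm: LLMSection = Field(default_factory=LLMSection)

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        # Earlier sources win, so environment variables still override TOML values.
        return (
            init_settings,
            env_settings,
            dotenv_settings,
            TomlConfigSettingsSource(settings_cls, toml_file=Path("demo_qa.toml")),
            file_secret_settings,
        )
```
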
pydantic_settings.sources.providers.toml import TomlConfigSettingsSource try: from pydantic_settings import BaseSettings, SettingsConfigDict + from pydantic_settings.sources import TomlConfigSettingsSource except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. " diff --git a/pyproject.toml b/pyproject.toml index 92cf687..710ebde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,8 @@ dev = [ "pre-commit", "pydantic-settings>=2.2", "python-dotenv>=1.0", + "ruff", + "pyright" ] demo = [ "pydantic-settings>=2.12", From d9184f13f314179cdff6def3c12f093f1397fa14 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 10:54:21 +0300 Subject: [PATCH 40/92] Handle missing cases in demo QA compare --- examples/demo_qa/runner.py | 23 ++++++++++++++++------- tests/test_demo_qa_runner.py | 27 +++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 90ca626..5f32f47 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -596,20 +596,20 @@ def diff_runs( ) -> DiffReport: base_by_id = {res.id: res for res in base_results} new_by_id = {res.id: res for res in new_results} - all_ids = sorted(new_by_id.keys()) + all_ids = sorted(set(base_by_id.keys()) | set(new_by_id.keys())) bad = bad_statuses(fail_on, require_assert) def _is_bad(res: RunResult | None) -> bool: return bool(res and res.status in bad) - def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict[str, object]: + def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult | None) -> dict[str, object]: return { "id": case_id, "from": base_res.status if base_res else None, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), + "to": new_res.status if new_res else "missing", + "reason": _reason(new_res) if new_res else "missing in new results", + "artifacts": _artifact_links(new_res) if new_res else {}, } new_fail: list[dict[str, object]] = [] @@ -619,21 +619,30 @@ def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict new_cases: list[str] = [] for case_id in all_ids: - new_res = new_by_id[case_id] + new_res = new_by_id.get(case_id) base_res = base_by_id.get(case_id) base_bad = _is_bad(base_res) - new_bad = _is_bad(new_res) + new_bad = True if new_res is None else _is_bad(new_res) if base_res is None: new_cases.append(case_id) if new_bad: new_fail.append(_entry(case_id, base_res, new_res)) + elif new_res is None: + changed_status.append({"id": case_id, "from": base_res.status, "to": "missing"}) else: if base_res.status != new_res.status: changed_status.append({"id": case_id, "from": base_res.status, "to": new_res.status}) if base_res is None: continue + if new_res is None: + entry = _entry(case_id, base_res, new_res) + if base_bad: + still_fail.append(entry) + else: + new_fail.append(entry) + continue if not base_bad and new_bad: new_fail.append(_entry(case_id, base_res, new_res)) diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 745a477..31773cd 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -61,6 +61,28 @@ def test_diff_runs_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), + RunResult( + id="missing_ok", + question="", + status="ok", + checked=True, + 
reason=None, + details=None, + artifacts_dir="/tmp/miss-ok", + duration_ms=10, + tags=[], + ), + RunResult( + id="missing_bad", + question="", + status="failed", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/miss-bad", + duration_ms=10, + tags=[], + ), ] current = [ @@ -123,9 +145,10 @@ def test_diff_runs_tracks_regressions_and_improvements() -> None: diff = diff_runs(baseline, current, fail_on="bad", require_assert=True) - assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad", "new_bad"} + assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad", "new_bad", "missing_ok"} assert {row["id"] for row in diff["fixed"]} == {"err_to_ok"} - assert {row["id"] for row in diff["still_fail"]} == {"still_bad"} + assert {row["id"] for row in diff["still_fail"]} == {"still_bad", "missing_bad"} + assert {"missing_ok", "missing_bad"} <= {row["id"] for row in diff["changed_status"]} assert diff["new_cases"] == ["new_bad", "new_ok"] From d76422a760775b8719b03e627d398e88fb2ac0eb Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 20 Dec 2025 16:40:07 +0300 Subject: [PATCH 41/92] =?UTF-8?q?=D1=82=D0=B5=D1=81=D1=82=D0=BE=D0=B2?= =?UTF-8?q?=D1=8B=D0=B5=20=D0=B4=D0=B0=D0=BD=D0=BD=D1=8B=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/cases/retail_cases.json | 200 +++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 examples/demo_qa/cases/retail_cases.json diff --git a/examples/demo_qa/cases/retail_cases.json b/examples/demo_qa/cases/retail_cases.json new file mode 100644 index 0000000..fb157a3 --- /dev/null +++ b/examples/demo_qa/cases/retail_cases.json @@ -0,0 +1,200 @@ +{"id": "agg_001", "question": "Сколько всего заказов (orders) в датасете? Ответь только числом.", "expected_regex": "(? 3)? Ответь только числом.", "expected_regex": "(? sum order_total) за весь период.", "tags": ["qa", "narrative"]} +{"id": "qa_010", "question": "Построй помесячную динамику количества заказов за весь период; отдельно для статусов delivered и cancelled.", "tags": ["qa", "narrative"]} +{"id": "qa_011", "question": "Сколько мы 'отгрузили' (status='shipped') в каждом городе за 2023 год? Сортируй по убыванию количества.", "tags": ["qa", "narrative"]} +{"id": "qa_012", "question": "Сколько уникальных клиентов (distinct customer_id) сделали заказы в каждом городе?", "tags": ["qa", "narrative"]} +{"id": "qa_013", "question": "Найди клиентов без единого заказа. Сколько их и приведи первые 20 customer_id + city + segment.", "tags": ["qa", "narrative"]} +{"id": "qa_014", "question": "Найди товары с нулевым остатком (in_stock=0). Сколько их и приведи первые 20 (product_id, name, category, price).", "tags": ["qa", "narrative"]} +{"id": "qa_015", "question": "Есть ли заказы, где сумма order_total не равна сумме line_total по order_items? 
Если есть — перечисли их (order_id, order_total, sum_line_total).", "tags": ["qa", "narrative"]} +{"id": "qa_016", "question": "Для клиента 882: покажи разбивку его выручки по категориям (category -> revenue) и укажи топ-3 категории.", "tags": ["qa", "narrative"]} +{"id": "qa_017", "question": "Для клиента 914: найди его самый дорогой заказ и перечисли товары в этом заказе (product_id, name, category, quantity, unit_price, line_total).", "tags": ["qa", "narrative"]} +{"id": "qa_018", "question": "Для заказа 204: перечисли товары (product_id, name, category) и суммы по ним; проверь, что сумма равна order_total.", "tags": ["qa", "narrative"]} +{"id": "qa_019", "question": "Сколько заказов было в 2022 году в городе 'San Diego', и как они распределены по каналам?", "tags": ["qa", "narrative"]} +{"id": "qa_020", "question": "Какая средняя сумма заказа (avg order_total) по каждому сегменту клиента (consumer/corporate/home_office)?", "tags": ["qa", "narrative"]} +{"id": "qa_021", "question": "Сравни средний чек по городам: выведи город, число заказов, avg order_total, median order_total.", "tags": ["qa", "narrative"]} +{"id": "qa_022", "question": "Найди 20 клиентов с самым высоким числом заказов; выведи customer_id, name, city, segment, orders_count, total_spend.", "tags": ["qa", "narrative"]} +{"id": "qa_023", "question": "Какие категории чаще всего покупают в канале 'phone'? Дай top-5 категорий по выручке в этом канале.", "tags": ["qa", "narrative"]} +{"id": "qa_024", "question": "Для категории 'books': выведи 10 самых продаваемых товаров по количеству (product_id, name, units_sold).", "tags": ["qa", "narrative"]} +{"id": "qa_025", "question": "Сформируй короткий executive summary по бизнесу: общий оборот, топ-3 категории, топ-3 города по выручке, доля отменённых заказов, и 3 инсайта по данным.", "tags": ["qa", "narrative"]} From a3d9d121c7984154f69ea30a8eb447067719e78d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 11:16:32 +0300 Subject: [PATCH 42/92] Fix REPL error artifacts to use run-specific directories --- examples/demo_qa/chat_repl.py | 5 +++-- examples/demo_qa/runner.py | 9 +++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 0990ee3..2ebd14c 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -99,12 +99,13 @@ def start_repl( continue run_id = uuid.uuid4().hex[:8] + run_dir = runs_root / f"{run_id}_{uuid.uuid4().hex[:8]}" event_logger = EventLogger(path=None, run_id=run_id) artifacts: RunArtifacts | None = None try: case = Case(id=run_id, question=line, tags=[]) - result = run_one(case, runner, runs_root, plan_only=False, event_logger=event_logger) + result = run_one(case, runner, runs_root, plan_only=False, event_logger=event_logger, run_dir=run_dir) plan_obj = _load_json(Path(result.artifacts_dir) / "plan.json") ctx_obj = _load_json(Path(result.artifacts_dir) / "context.json") or {} artifacts = RunArtifacts( @@ -124,7 +125,7 @@ def start_repl( print(result.answer or "") print(f"Events: {Path(result.artifacts_dir) / 'events.jsonl'}") except Exception as exc: # pragma: no cover - REPL resilience - error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=runs_root, question=line) + error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts 
save_artifacts(error_artifacts)
diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py
index 5f32f47..f0575dd 100644
--- a/examples/demo_qa/runner.py
+++ b/examples/demo_qa/runner.py
@@ -273,9 +273,14 @@ def run_one(
     *,
     plan_only: bool = False,
     event_logger: EventLogger | None = None,
+    run_dir: Path | None = None,
 ) -> RunResult:
-    run_id = uuid.uuid4().hex[:8]
-    run_dir = artifacts_root / f"{case.id}_{run_id}"
+    if run_dir is None:
+        run_id = uuid.uuid4().hex[:8]
+        run_dir = artifacts_root / f"{case.id}_{run_id}"
+    else:
+        run_id = run_dir.name.split("_")[-1]
+
     case_logger = event_logger.for_case(case.id, run_dir / "events.jsonl") if event_logger else None
     if case_logger:
         case_logger.emit({"type": "case_started", "case_id": case.id, "run_dir": str(run_dir)})

From 34675c13aba8128857f927c3546a6e1e83814f46 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 21 Dec 2025 22:25:17 +0300
Subject: [PATCH 43/92] Exclude logs from repository tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a4f7611..2ca3c0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ build/
 examples/demo_qa/demo_qa.toml
 **/demo_qa.toml
 .env.demo_qa
+_demo_data/*/.runs/*

From 9deaed739551cbf0146b4ff753bcf428cba4d84a Mon Sep 17 00:00:00 2001
From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com>
Date: Sun, 21 Dec 2025 23:06:45 +0300
Subject: [PATCH 44/92] Handle partial batch runs and missed case selection

---
 examples/demo_qa/batch.py   | 191 ++++++++++++++++++++++++++++++------
 examples/demo_qa/cli.py     |   7 ++
 tests/test_demo_qa_batch.py |  60 ++++++++++-
 3 files changed, 228 insertions(+), 30 deletions(-)

diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py
index 7abdeb4..7ae8337 100644
--- a/examples/demo_qa/batch.py
+++ b/examples/demo_qa/batch.py
@@ -114,8 +114,21 @@ def _git_sha() -> Optional[str]:
     return result.stdout.strip() or None


-def _load_latest_run(artifacts_dir: Path) -> Optional[Path]:
-    latest_file = artifacts_dir / "runs" / "latest.txt"
+def _sanitize_tag(tag: str) -> str:
+    cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "_" for ch in tag)
+    return cleaned or "tag"
+
+
+def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]:
+    runs_dir = artifacts_dir / "runs"
+    if tag:
+        slug = _sanitize_tag(tag)
+        return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt"
+    return runs_dir / "latest.txt", runs_dir / "latest_results.txt"
+
+
+def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]:
+    latest_file, _ = _latest_markers(artifacts_dir, tag)
     if latest_file.exists():
         content =
latest_file.read_text(encoding="utf-8").strip() if content: return Path(content) - latest_run = _load_latest_run(artifacts_dir) + latest_run = _load_latest_run(artifacts_dir, tag) if latest_run: summary_path = latest_run / "summary.json" if summary_path.exists(): @@ -143,6 +156,39 @@ def _load_latest_results(artifacts_dir: Path) -> Optional[Path]: return None +def _load_run_meta(run_path: Path | None) -> Optional[dict]: + if run_path is None: + return None + meta_path = run_path / "run_meta.json" + if not meta_path.exists(): + return None + try: + return json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return None + + +def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: + planned_set = set(planned_case_ids) + if not executed_results: + return planned_set + try: + executed_ids = set(executed_results.keys()) + except Exception: + executed_ids = set() + return planned_set - executed_ids + + +def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: + marker_pairs = {_latest_markers(artifacts_dir, None)} + if tag: + marker_pairs.add(_latest_markers(artifacts_dir, tag)) + for latest_path, latest_results_path in marker_pairs: + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + latest_results_path.write_text(str(results_path), encoding="utf-8") + + def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: cases_dir = run_path / "cases" if not cases_dir.exists(): @@ -336,6 +382,8 @@ def _select_cases_for_rerun( def handle_batch(args) -> int: started_at = datetime.datetime.utcnow() run_id = uuid.uuid4().hex[:8] + interrupted = False + interrupted_at_case_id: str | None = None try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -355,13 +403,18 @@ def handle_batch(args) -> int: if artifacts_dir is None: artifacts_dir = args.data / ".runs" + include_tags = _split_csv(args.include_tags) + exclude_tags = _split_csv(args.exclude_tags) + include_ids = _load_ids(args.include_ids) + exclude_ids = _load_ids(args.exclude_ids) + baseline_filter_path = args.only_failed_from if args.only_failed and not baseline_filter_path: - latest_results = _load_latest_results(artifacts_dir) + latest_results = _load_latest_results(artifacts_dir, args.tag) if latest_results: baseline_filter_path = latest_results else: - latest_run = _load_latest_run(artifacts_dir) + latest_run = _load_latest_run(artifacts_dir, args.tag) if latest_run: candidate = latest_run / "results.jsonl" if candidate.exists(): @@ -391,12 +444,47 @@ def handle_batch(args) -> int: baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on, - include_tags=_split_csv(args.include_tags), - exclude_tags=_split_csv(args.exclude_tags), - include_ids=_load_ids(args.include_ids), - exclude_ids=_load_ids(args.exclude_ids), + include_tags=include_tags, + exclude_tags=exclude_tags, + include_ids=include_ids, + exclude_ids=exclude_ids, ) + baseline_planned_ids: set[str] | None = None + missed_baseline_results: Optional[Mapping[str, RunResult]] = None + missed_baseline_path: Path | None = None + missed_baseline_run: Path | None = None + if args.only_missed: + missed_baseline_path = _load_latest_results(artifacts_dir, args.tag) + missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) + if missed_baseline_path: + try: + missed_baseline_results = load_results(missed_baseline_path) + except Exception as 
exc: + print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) + return 2 + else: + print("No baseline results found for --only-missed; running all filtered cases.", file=sys.stderr) + baseline_meta = _load_run_meta(missed_baseline_run) + if isinstance(baseline_meta, dict): + planned_from_meta = baseline_meta.get("planned_case_ids") + if isinstance(planned_from_meta, list): + baseline_planned_ids = {str(cid) for cid in planned_from_meta} + else: + try: + planned_total_meta = int(baseline_meta.get("planned_total", 0)) + except Exception: + planned_total_meta = 0 + if planned_total_meta: + baseline_planned_ids = {case.id for case in cases} + + planned_case_ids = [case.id for case in cases] + if args.only_missed: + planned_pool = baseline_planned_ids or set(planned_case_ids) + missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) + cases = [case for case in cases if case.id in missed_ids] + planned_case_ids = [case.id for case in cases] + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" results_path = args.out or (run_folder / "results.jsonl") @@ -427,15 +515,22 @@ def handle_batch(args) -> int: results: list[RunResult] = [] failures = 0 - for case in cases: - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) - results.append(result) - if not args.quiet: - print(format_status_line(result)) - if is_failure(result.status, args.fail_on, args.require_assert): - failures += 1 - if args.fail_fast or (args.max_fails and failures >= args.max_fails): - break + current_case_id: str | None = None + try: + for case in cases: + current_case_id = case.id + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) + results.append(result) + if not args.quiet: + print(format_status_line(result)) + if is_failure(result.status, args.fail_on, args.require_assert): + failures += 1 + if args.fail_fast or (args.max_fails and failures >= args.max_fails): + break + except KeyboardInterrupt: + interrupted = True + interrupted_at_case_id = current_case_id + print("Interrupted; finalizing partial results...", file=sys.stderr) write_results(results_path, results) counts = summarize(results) @@ -456,10 +551,14 @@ def handle_batch(args) -> int: policy_bad = bad_statuses(args.fail_on, args.require_assert) bad_count = sum(int(counts.get(status, 0) or 0) for status in policy_bad) - exit_code = 1 if bad_count else 0 + exit_code = 130 if interrupted else (1 if bad_count else 0) ended_at = datetime.datetime.utcnow() duration_ms = int((ended_at - started_at).total_seconds() * 1000) + executed_results = {res.id: res for res in results} + planned_total = len(planned_case_ids) + executed_total = len(results) + missed_total = len(_missed_case_ids(planned_case_ids, executed_results)) summary = { "run_id": run_id, "started_at": started_at.isoformat() + "Z", @@ -471,6 +570,13 @@ def handle_batch(args) -> int: "results_path": str(results_path), "require_assert": args.require_assert, "fail_on": args.fail_on, + "planned_total": planned_total, + "executed_total": executed_total, + "missed_total": missed_total, + "interrupted": interrupted, + "interrupted_at_case_id": interrupted_at_case_id, + "tag": args.tag, + "note": args.note, } if diff_block: summary["diff"] = diff_block @@ -490,14 +596,14 @@ def handle_batch(args) -> int: "duration_ms": duration_ms, "run_dir": str(run_folder), "results_path": str(results_path), + "interrupted": 
interrupted, + "planned_total": planned_total, + "executed_total": executed_total, + "missed_total": missed_total, } ) - latest_path = run_folder.parent / "latest.txt" - latest_results_path = run_folder.parent / "latest_results.txt" - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - latest_results_path.write_text(str(results_path), encoding="utf-8") + _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag) config_hash = _hash_file(args.config) if args.config else None schema_hash = _hash_file(args.schema) @@ -507,6 +613,8 @@ def handle_batch(args) -> int: run_meta = { "run_id": run_id, "timestamp": started_at.isoformat() + "Z", + "tag": args.tag, + "note": args.note, "inputs": { "cases_path": str(args.cases), "cases_hash": cases_hash, @@ -516,6 +624,23 @@ def handle_batch(args) -> int: "schema_hash": schema_hash, "data_dir": str(args.data), }, + "planned_case_ids": planned_case_ids, + "planned_total": planned_total, + "selected_filters": { + "include_tags": sorted(include_tags) if include_tags else None, + "exclude_tags": sorted(exclude_tags) if exclude_tags else None, + "include_ids_path": str(args.include_ids) if args.include_ids else None, + "exclude_ids_path": str(args.exclude_ids) if args.exclude_ids else None, + "only_failed": bool(args.only_failed or args.only_failed_from), + "only_failed_from": str(baseline_filter_path) if baseline_filter_path else None, + "only_missed": args.only_missed, + "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, + "plan_only": args.plan_only, + "fail_fast": args.fail_fast, + "max_fails": args.max_fails, + }, + "interrupted": interrupted, + "interrupted_at_case_id": interrupted_at_case_id, "data_fingerprint": data_fingerprint, "llm": { "plan_model": llm_settings.plan_model, @@ -539,6 +664,8 @@ def handle_batch(args) -> int: "config_hash": config_hash, "schema_hash": schema_hash, "cases_hash": cases_hash, + "tag": args.tag, + "note": args.note, "ok": counts.get("ok", 0), "mismatch": counts.get("mismatch", 0), "error": counts.get("error", 0), @@ -554,6 +681,11 @@ def handle_batch(args) -> int: "fail_on": args.fail_on, "require_assert": args.require_assert, "fail_count": bad_count, + "planned_total": planned_total, + "executed_total": executed_total, + "missed_total": missed_total, + "interrupted": interrupted, + "interrupted_at_case_id": interrupted_at_case_id, } history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: @@ -562,9 +694,10 @@ def handle_batch(args) -> int: unchecked = counts.get("unchecked", 0) plan_only = counts.get("plan_only", 0) summary_line = ( - f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " - f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked(no-assert): {unchecked} | " - f"Plan-only: {plan_only} | FAIL(policy): {bad_count} | Skipped: {counts.get('skipped', 0)}" + f"Batch: planned {planned_total}, executed {executed_total}, missed {missed_total} | " + f"Checked: {counts.get('checked_total', 0)} | Checked OK: {counts.get('checked_ok', 0)} | " + f"Unchecked(no-assert): {unchecked} | Plan-only: {plan_only} | " + f"FAIL(policy): {bad_count} | Skipped: {counts.get('skipped', 0)}" ) if args.quiet: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index dbb279d..16127b6 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -52,6 +52,13 @@ def build_parser() -> argparse.ArgumentParser: 
batch_p.add_argument("--schema", type=Path, required=True) batch_p.add_argument("--config", type=Path, default=None, help="Path to demo_qa.toml") batch_p.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") + batch_p.add_argument("--tag", type=str, default=None, help="Label this run and use tag-specific latest pointers") + batch_p.add_argument("--note", type=str, default=None, help="Free-form note to attach to the run metadata") + batch_p.add_argument( + "--only-missed", + action="store_true", + help="Run only cases missing in the latest (or tag-latest) effective results", + ) batch_p.add_argument("--out", type=Path, required=False, default=None, help="Path to results jsonl") batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") batch_p.add_argument("--enable-semantic", action="store_true") diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 40ee32d..16d7cb5 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -3,12 +3,46 @@ import itertools import json import os +import sys import time +import types from pathlib import Path import pytest +from pydantic import BaseModel -from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +if "pydantic_settings" not in sys.modules: + stub = types.ModuleType("pydantic_settings") + + class BaseSettings(BaseModel): + model_config = {} + + def SettingsConfigDict(**kwargs): + return kwargs + + stub.BaseSettings = BaseSettings + stub.SettingsConfigDict = SettingsConfigDict + + sources_mod = types.ModuleType("pydantic_settings.sources") + + def TomlConfigSettingsSource(settings_cls, toml_file): + return {} + + sources_mod.TomlConfigSettingsSource = TomlConfigSettingsSource + stub.sources = sources_mod + sys.modules["pydantic_settings"] = stub + sys.modules["pydantic_settings.sources"] = sources_mod + +from examples.demo_qa.batch import ( + _fingerprint_dir, + _latest_markers, + _missed_case_ids, + _update_latest_markers, + bad_statuses, + is_failure, + render_markdown, + write_results, +) from examples.demo_qa.runner import RunResult, diff_runs @@ -92,3 +126,27 @@ def test_write_results_is_deterministic(tmp_path: Path) -> None: line = out.read_text(encoding="utf-8").strip() expected = json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) assert line == expected + + +def test_missed_case_ids_diff_planned_and_executed() -> None: + planned = ["a", "b", "c", "a"] + executed = {_mk_result("b", "ok").id: _mk_result("b", "ok")} + assert _missed_case_ids(planned, executed) == {"a", "c"} + + +def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: + artifacts_dir = tmp_path / "data" / ".runs" + run_dir = artifacts_dir / "runs" / "20240101_cases" + results_path = run_dir / "results.jsonl" + run_dir.mkdir(parents=True) + results_path.write_text("{}", encoding="utf-8") + + _update_latest_markers(run_dir, results_path, artifacts_dir, "feature/beta") + + latest_default, latest_results_default = _latest_markers(artifacts_dir, None) + assert latest_default.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_results_default.read_text(encoding="utf-8").strip() == str(results_path) + + latest_tag, latest_results_tag = _latest_markers(artifacts_dir, "feature/beta") + assert latest_tag.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_results_tag.read_text(encoding="utf-8").strip() == str(results_path) From 
9eb71782ac1308b26aec126d6f0032ffd171ebb7 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:14:28 +0300 Subject: [PATCH 45/92] Improve only-missed baseline resolution and interrupt handling --- examples/demo_qa/batch.py | 61 ++++++++++++++++++++++++++++++++----- tests/test_demo_qa_batch.py | 25 --------------- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 7ae8337..d903f72 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -16,6 +16,7 @@ Case, EventLogger, RunResult, + RunTimings, bad_statuses, build_agent, diff_runs, @@ -24,6 +25,7 @@ load_cases, load_results, run_one, + save_status, summarize, ) from .settings import load_settings @@ -168,6 +170,22 @@ def _load_run_meta(run_path: Path | None) -> Optional[dict]: return None +def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: + if results_path is None: + return None + run_dir = results_path.parent + summary_path = run_dir / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + run_dir_from_summary = summary.get("run_dir") + if run_dir_from_summary: + return Path(run_dir_from_summary) + except Exception: + pass + return run_dir + + def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: planned_set = set(planned_case_ids) if not executed_results: @@ -456,7 +474,9 @@ def handle_batch(args) -> int: missed_baseline_run: Path | None = None if args.only_missed: missed_baseline_path = _load_latest_results(artifacts_dir, args.tag) - missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) + missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) + if missed_baseline_run is None: + missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) if missed_baseline_path: try: missed_baseline_results = load_results(missed_baseline_path) @@ -471,12 +491,11 @@ def handle_batch(args) -> int: if isinstance(planned_from_meta, list): baseline_planned_ids = {str(cid) for cid in planned_from_meta} else: - try: - planned_total_meta = int(baseline_meta.get("planned_total", 0)) - except Exception: - planned_total_meta = 0 - if planned_total_meta: - baseline_planned_ids = {case.id for case in cases} + print( + "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", + file=sys.stderr, + ) + baseline_planned_ids = {case.id for case in cases} planned_case_ids = [case.id for case in cases] if args.only_missed: @@ -519,7 +538,33 @@ def handle_batch(args) -> int: try: for case in cases: current_case_id = case.id - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) + try: + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) + except KeyboardInterrupt: + interrupted = True + interrupted_at_case_id = current_case_id + run_dir = artifacts_root / f"{case.id}_{uuid.uuid4().hex[:8]}" + run_dir.mkdir(parents=True, exist_ok=True) + stub = RunResult( + id=case.id, + question=case.question, + status="error", + checked=case.has_asserts, + reason="KeyboardInterrupt", + details={"error": "KeyboardInterrupt"}, + artifacts_dir=str(run_dir), + duration_ms=0, + tags=list(case.tags), + answer=None, + error="KeyboardInterrupt", + plan_path=None, + timings=RunTimings(), + expected_check=None, + ) + 
save_status(stub) + results.append(stub) + print("Interrupted during case execution; saved partial status.", file=sys.stderr) + break results.append(result) if not args.quiet: print(format_status_line(result)) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 16d7cb5..e9d92fa 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -3,35 +3,10 @@ import itertools import json import os -import sys import time -import types from pathlib import Path import pytest -from pydantic import BaseModel - -if "pydantic_settings" not in sys.modules: - stub = types.ModuleType("pydantic_settings") - - class BaseSettings(BaseModel): - model_config = {} - - def SettingsConfigDict(**kwargs): - return kwargs - - stub.BaseSettings = BaseSettings - stub.SettingsConfigDict = SettingsConfigDict - - sources_mod = types.ModuleType("pydantic_settings.sources") - - def TomlConfigSettingsSource(settings_cls, toml_file): - return {} - - sources_mod.TomlConfigSettingsSource = TomlConfigSettingsSource - stub.sources = sources_mod - sys.modules["pydantic_settings"] = stub - sys.modules["pydantic_settings.sources"] = sources_mod from examples.demo_qa.batch import ( _fingerprint_dir, From bfff2f6c9f84866a875b855c49976a6896097e97 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:17:31 +0300 Subject: [PATCH 46/92] Add only-missed from path and baseline metadata --- examples/demo_qa/batch.py | 17 ++++++++++++++++- examples/demo_qa/cli.py | 6 ++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index d903f72..75a2f40 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -427,16 +427,21 @@ def handle_batch(args) -> int: exclude_ids = _load_ids(args.exclude_ids) baseline_filter_path = args.only_failed_from + only_failed_baseline_kind: str | None = None if args.only_failed and not baseline_filter_path: latest_results = _load_latest_results(artifacts_dir, args.tag) if latest_results: baseline_filter_path = latest_results + only_failed_baseline_kind = "latest" else: latest_run = _load_latest_run(artifacts_dir, args.tag) if latest_run: candidate = latest_run / "results.jsonl" if candidate.exists(): baseline_filter_path = candidate + only_failed_baseline_kind = "latest" + if args.only_failed_from: + only_failed_baseline_kind = "path" if baseline_filter_path: try: baseline_for_filter = load_results(baseline_filter_path) @@ -472,8 +477,13 @@ def handle_batch(args) -> int: missed_baseline_results: Optional[Mapping[str, RunResult]] = None missed_baseline_path: Path | None = None missed_baseline_run: Path | None = None + only_missed_baseline_kind: str | None = None if args.only_missed: - missed_baseline_path = _load_latest_results(artifacts_dir, args.tag) + missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) + if args.only_missed_from: + only_missed_baseline_kind = "path" + elif missed_baseline_path: + only_missed_baseline_kind = "latest" missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) if missed_baseline_run is None: missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) @@ -503,6 +513,8 @@ def handle_batch(args) -> int: missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) cases = [case for case in cases if case.id in missed_ids] planned_case_ids = [case.id for case in cases] + if not cases: + print("0 missed cases selected.", 
file=sys.stderr) timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" @@ -678,8 +690,11 @@ def handle_batch(args) -> int: "exclude_ids_path": str(args.exclude_ids) if args.exclude_ids else None, "only_failed": bool(args.only_failed or args.only_failed_from), "only_failed_from": str(baseline_filter_path) if baseline_filter_path else None, + "only_failed_baseline_kind": only_failed_baseline_kind, "only_missed": args.only_missed, "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, + "only_missed_baseline_kind": only_missed_baseline_kind, + "baseline_tag": args.tag, "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 16127b6..e4048bc 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -59,6 +59,12 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Run only cases missing in the latest (or tag-latest) effective results", ) + batch_p.add_argument( + "--only-missed-from", + type=Path, + default=None, + help="Run only cases missing in the provided results.jsonl (or latest if omitted)", + ) batch_p.add_argument("--out", type=Path, required=False, default=None, help="Path to results jsonl") batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") batch_p.add_argument("--enable-semantic", action="store_true") From 5a5f1ee3643599dd93f574a7ea803d416d47311d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:26:23 +0300 Subject: [PATCH 47/92] Add effective tag baselines and baseline metadata --- examples/demo_qa/batch.py | 195 +++++++++++++++++++++++++++++++++----- examples/demo_qa/cli.py | 2 +- 2 files changed, 174 insertions(+), 23 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 75a2f40..7bc0302 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -121,6 +121,11 @@ def _sanitize_tag(tag: str) -> str: return cleaned or "tag" +def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: + base = artifacts_dir / "runs" / "tags" / _sanitize_tag(tag) + return base / "effective_results.jsonl", base / "effective_meta.json" + + def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: runs_dir = artifacts_dir / "runs" if tag: @@ -207,6 +212,79 @@ def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: latest_results_path.write_text(str(results_path), encoding="utf-8") +def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: + results_path, meta_path = _effective_paths(artifacts_dir, tag) + meta: Optional[dict] = None + results: dict[str, RunResult] = {} + if results_path.exists(): + results = load_results(results_path) + if meta_path.exists(): + try: + meta = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + meta = None + return results, meta, results_path + + +def _write_effective_results(results_path: Path, results: Mapping[str, RunResult]) -> None: + results_path.parent.mkdir(parents=True, exist_ok=True) + ordered = [results[cid] for cid in sorted(results)] + write_results(results_path, ordered) + + +def _update_effective_snapshot( + *, + artifacts_dir: Path, + tag: str, + cases_hash: str, + cases_path: Path, + planned_case_ids: 
list[str], + executed_results: list[RunResult], + run_folder: Path, + planned_case_ids_source: list[str] | None, +) -> tuple[Path, Path]: + effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) + if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." + ) + + planned_pool: set[str] + if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): + planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} + elif planned_case_ids_source: + planned_pool = set(planned_case_ids_source) + else: + planned_pool = set(planned_case_ids) + + for res in executed_results: + effective_results[res.id] = res + _write_effective_results(effective_results_path, effective_results) + + summary_counts = summarize(effective_results.values()) + executed_total = len(effective_results) + missed_total = len(_missed_case_ids(planned_pool, effective_results)) + meta_path = effective_results_path.with_name("effective_meta.json") + built_from = set(effective_meta.get("built_from_runs", [])) if effective_meta else set() + built_from.add(str(run_folder)) + effective_meta_payload = { + "tag": tag, + "cases_hash": cases_hash, + "cases_path": str(cases_path), + "planned_case_ids": sorted(planned_pool), + "planned_total": len(planned_pool), + "executed_total": executed_total, + "missed_total": missed_total, + "counts": summary_counts, + "updated_at": datetime.datetime.utcnow().isoformat() + "Z", + "built_from_runs": sorted(built_from), + "effective_results_path": str(effective_results_path), + } + meta_path.parent.mkdir(parents=True, exist_ok=True) + dump_json(meta_path, effective_meta_payload) + return effective_results_path, meta_path + + def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: cases_dir = run_path / "cases" if not cases_dir.exists(): @@ -402,6 +480,7 @@ def handle_batch(args) -> int: run_id = uuid.uuid4().hex[:8] interrupted = False interrupted_at_case_id: str | None = None + cases_hash = _hash_file(args.cases) try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -428,7 +507,25 @@ def handle_batch(args) -> int: baseline_filter_path = args.only_failed_from only_failed_baseline_kind: str | None = None - if args.only_failed and not baseline_filter_path: + effective_results_path: Path | None = None + if args.only_failed_from: + only_failed_baseline_kind = "path" + elif args.tag and args.only_failed: + effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) + if not effective_results: + print(f"No effective results found for tag {args.tag!r}; run a tagged batch first.", file=sys.stderr) + return 2 + if effective_meta and effective_meta.get("cases_hash") not in (None, cases_hash): + print( + f"Effective results cases_hash {effective_meta.get('cases_hash')} does not match current cases file.", + file=sys.stderr, + ) + return 2 + baseline_for_filter = effective_results + baseline_filter_path = eff_path + effective_results_path = eff_path + only_failed_baseline_kind = "effective" + elif args.only_failed: latest_results = _load_latest_results(artifacts_dir, args.tag) if latest_results: baseline_filter_path = latest_results @@ -440,14 +537,15 @@ def handle_batch(args) -> int: if candidate.exists(): baseline_filter_path = candidate only_failed_baseline_kind = "latest" - if args.only_failed_from: - 
only_failed_baseline_kind = "path" - if baseline_filter_path: + if baseline_filter_path and baseline_for_filter is None: try: baseline_for_filter = load_results(baseline_filter_path) except Exception as exc: print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) return 2 + if args.only_failed and baseline_for_filter is None: + print("No baseline found for --only-failed.", file=sys.stderr) + return 2 compare_path = args.compare_to if compare_path is None and args.only_failed and baseline_filter_path: @@ -479,33 +577,70 @@ def handle_batch(args) -> int: missed_baseline_run: Path | None = None only_missed_baseline_kind: str | None = None if args.only_missed: - missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) if args.only_missed_from: + missed_baseline_path = args.only_missed_from only_missed_baseline_kind = "path" - elif missed_baseline_path: - only_missed_baseline_kind = "latest" - missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) - if missed_baseline_run is None: - missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) - if missed_baseline_path: try: missed_baseline_results = load_results(missed_baseline_path) except Exception as exc: - print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) + print(f"Failed to read baseline for --only-missed-from: {exc}", file=sys.stderr) return 2 - else: - print("No baseline results found for --only-missed; running all filtered cases.", file=sys.stderr) - baseline_meta = _load_run_meta(missed_baseline_run) - if isinstance(baseline_meta, dict): - planned_from_meta = baseline_meta.get("planned_case_ids") - if isinstance(planned_from_meta, list): - baseline_planned_ids = {str(cid) for cid in planned_from_meta} - else: + elif args.tag: + effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) + if not effective_results: + print(f"No effective results found for tag {args.tag!r}; run a tagged batch first.", file=sys.stderr) + return 2 + if effective_meta and effective_meta.get("cases_hash") not in (None, cases_hash): + print( + f"Effective results cases_hash {effective_meta.get('cases_hash')} does not match current cases file.", + file=sys.stderr, + ) + return 2 + missed_baseline_path = eff_path + missed_baseline_results = effective_results + only_missed_baseline_kind = "effective" + baseline_planned_ids = ( + {str(cid) for cid in effective_meta.get("planned_case_ids", [])} + if isinstance(effective_meta, dict) + else None + ) + if not baseline_planned_ids: print( - "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", + "Effective results missing planned_case_ids; computing missed relative to current filtered cases.", file=sys.stderr, ) baseline_planned_ids = {case.id for case in cases} + else: + missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) + if args.only_missed_from: + only_missed_baseline_kind = "path" + elif missed_baseline_path: + only_missed_baseline_kind = "latest" + missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) + if missed_baseline_run is None: + missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) + if missed_baseline_path: + try: + missed_baseline_results = load_results(missed_baseline_path) + except Exception as exc: + print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) + return 2 + else: + print("No baseline results found for --only-missed; 
running all filtered cases.", file=sys.stderr) + baseline_meta = _load_run_meta(missed_baseline_run) + if isinstance(baseline_meta, dict): + planned_from_meta = baseline_meta.get("planned_case_ids") + if isinstance(planned_from_meta, list): + baseline_planned_ids = {str(cid) for cid in planned_from_meta} + else: + print( + "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", + file=sys.stderr, + ) + baseline_planned_ids = {case.id for case in cases} + if args.only_missed and missed_baseline_results is None: + print("No baseline found for --only-missed.", file=sys.stderr) + return 2 planned_case_ids = [case.id for case in cases] if args.only_missed: @@ -661,10 +796,25 @@ def handle_batch(args) -> int: ) _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag) + effective_path = None + effective_meta_path = None + if args.tag: + try: + effective_path, effective_meta_path = _update_effective_snapshot( + artifacts_dir=artifacts_dir, + tag=args.tag, + cases_hash=cases_hash, + cases_path=args.cases, + planned_case_ids=planned_case_ids, + executed_results=results, + run_folder=run_folder, + planned_case_ids_source=planned_case_ids, + ) + except Exception as exc: + print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) config_hash = _hash_file(args.config) if args.config else None schema_hash = _hash_file(args.schema) - cases_hash = _hash_file(args.cases) data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) llm_settings = settings.llm run_meta = { @@ -695,6 +845,7 @@ def handle_batch(args) -> int: "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, "only_missed_baseline_kind": only_missed_baseline_kind, "baseline_tag": args.tag, + "effective_path": str(effective_path) if effective_path else None, "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index e4048bc..3a431d5 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -57,7 +57,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument( "--only-missed", action="store_true", - help="Run only cases missing in the latest (or tag-latest) effective results", + help="Run only cases missing in effective results for --tag (or latest results when no tag is set)", ) batch_p.add_argument( "--only-missed-from", From 8582ae8222ee754526db02b76d44bc889d7fa888 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:47:24 +0300 Subject: [PATCH 48/92] Stabilize effective scope and suite planning --- examples/demo_qa/batch.py | 76 +++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 7bc0302..70407fa 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -126,6 +126,28 @@ def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: return base / "effective_results.jsonl", base / "effective_meta.json" +def _scope_payload( + *, + cases_hash: str, + include_tags: set[str] | None, + exclude_tags: set[str] | None, + include_ids: set[str] | None, + exclude_ids: set[str] | None, +) -> dict[str, object]: + return { + "cases_hash": cases_hash, + "include_tags": sorted(include_tags) if include_tags else None, + "exclude_tags": sorted(exclude_tags) if exclude_tags else None, + 
"include_ids": sorted(include_ids) if include_ids else None, + "exclude_ids": sorted(exclude_ids) if exclude_ids else None, + } + + +def _scope_hash(scope: Mapping[str, object]) -> str: + payload = json.dumps(scope, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: runs_dir = artifacts_dir / "runs" if tag: @@ -238,24 +260,27 @@ def _update_effective_snapshot( tag: str, cases_hash: str, cases_path: Path, - planned_case_ids: list[str], + suite_case_ids: list[str], executed_results: list[RunResult], run_folder: Path, - planned_case_ids_source: list[str] | None, + scope: Mapping[str, object], + scope_hash: str, ) -> tuple[Path, Path]: effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: raise ValueError( f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." ) + if effective_meta and effective_meta.get("scope_hash") and effective_meta["scope_hash"] != scope_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} have a different scope; refusing to merge." + ) planned_pool: set[str] if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} - elif planned_case_ids_source: - planned_pool = set(planned_case_ids_source) else: - planned_pool = set(planned_case_ids) + planned_pool = set(suite_case_ids) for res in executed_results: effective_results[res.id] = res @@ -279,6 +304,8 @@ def _update_effective_snapshot( "updated_at": datetime.datetime.utcnow().isoformat() + "Z", "built_from_runs": sorted(built_from), "effective_results_path": str(effective_results_path), + "scope": scope, + "scope_hash": scope_hash, } meta_path.parent.mkdir(parents=True, exist_ok=True) dump_json(meta_path, effective_meta_payload) @@ -504,6 +531,14 @@ def handle_batch(args) -> int: exclude_tags = _split_csv(args.exclude_tags) include_ids = _load_ids(args.include_ids) exclude_ids = _load_ids(args.exclude_ids) + scope = _scope_payload( + cases_hash=cases_hash, + include_tags=include_tags, + exclude_tags=exclude_tags, + include_ids=include_ids, + exclude_ids=exclude_ids, + ) + scope_id = _scope_hash(scope) baseline_filter_path = args.only_failed_from only_failed_baseline_kind: str | None = None @@ -521,6 +556,9 @@ def handle_batch(args) -> int: file=sys.stderr, ) return 2 + if effective_meta and effective_meta.get("scope_hash") not in (None, scope_id): + print("Effective results scope does not match current selection; refusing to merge.", file=sys.stderr) + return 2 baseline_for_filter = effective_results baseline_filter_path = eff_path effective_results_path = eff_path @@ -560,8 +598,19 @@ def handle_batch(args) -> int: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 - cases = _select_cases_for_rerun( + filtered_cases = _select_cases_for_rerun( cases, + None, + require_assert=args.require_assert, + fail_on=args.fail_on, + include_tags=include_tags, + exclude_tags=exclude_tags, + include_ids=include_ids, + exclude_ids=exclude_ids, + ) + suite_case_ids = [case.id for case in filtered_cases] + cases = _select_cases_for_rerun( + filtered_cases, baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on, @@ -596,6 +645,9 @@ def handle_batch(args) -> 
int: file=sys.stderr, ) return 2 + if effective_meta and effective_meta.get("scope_hash") not in (None, scope_id): + print("Effective results scope does not match current selection; refusing to merge.", file=sys.stderr) + return 2 missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" @@ -609,7 +661,7 @@ def handle_batch(args) -> int: "Effective results missing planned_case_ids; computing missed relative to current filtered cases.", file=sys.stderr, ) - baseline_planned_ids = {case.id for case in cases} + baseline_planned_ids = set(suite_case_ids) else: missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) if args.only_missed_from: @@ -637,7 +689,7 @@ def handle_batch(args) -> int: "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", file=sys.stderr, ) - baseline_planned_ids = {case.id for case in cases} + baseline_planned_ids = set(suite_case_ids) if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 @@ -805,10 +857,11 @@ def handle_batch(args) -> int: tag=args.tag, cases_hash=cases_hash, cases_path=args.cases, - planned_case_ids=planned_case_ids, + suite_case_ids=suite_case_ids, executed_results=results, run_folder=run_folder, - planned_case_ids_source=planned_case_ids, + scope=scope, + scope_hash=scope_id, ) except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) @@ -831,6 +884,7 @@ def handle_batch(args) -> int: "schema_hash": schema_hash, "data_dir": str(args.data), }, + "suite_case_ids": suite_case_ids, "planned_case_ids": planned_case_ids, "planned_total": planned_total, "selected_filters": { @@ -846,6 +900,8 @@ def handle_batch(args) -> int: "only_missed_baseline_kind": only_missed_baseline_kind, "baseline_tag": args.tag, "effective_path": str(effective_path) if effective_path else None, + "scope_hash": scope_id, + "scope": scope, "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, From 4279c57fe353bba78a088165d1cb131ef3fe6d54 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 00:10:53 +0300 Subject: [PATCH 49/92] Differentiate suite and selected coverage --- examples/demo_qa/batch.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 70407fa..972cac9 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -694,12 +694,12 @@ def handle_batch(args) -> int: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - planned_case_ids = [case.id for case in cases] + selected_case_ids = [case.id for case in cases] if args.only_missed: - planned_pool = baseline_planned_ids or set(planned_case_ids) + planned_pool = baseline_planned_ids or set(selected_case_ids) missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) cases = [case for case in cases if case.id in missed_ids] - planned_case_ids = [case.id for case in cases] + selected_case_ids = [case.id for case in cases] if not cases: print("0 missed cases selected.", file=sys.stderr) @@ -800,9 +800,11 @@ def handle_batch(args) -> int: ended_at = datetime.datetime.utcnow() duration_ms = int((ended_at - started_at).total_seconds() * 1000) executed_results = {res.id: res for res in results} - planned_total = 
len(planned_case_ids) + planned_total = len(selected_case_ids) executed_total = len(results) - missed_total = len(_missed_case_ids(planned_case_ids, executed_results)) + missed_total = len(_missed_case_ids(selected_case_ids, executed_results)) + suite_planned_total = len(suite_case_ids) + suite_missed_total = len(_missed_case_ids(suite_case_ids, executed_results)) summary = { "run_id": run_id, "started_at": started_at.isoformat() + "Z", @@ -817,6 +819,8 @@ def handle_batch(args) -> int: "planned_total": planned_total, "executed_total": executed_total, "missed_total": missed_total, + "suite_planned_total": suite_planned_total, + "suite_missed_total": suite_missed_total, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, "tag": args.tag, @@ -885,7 +889,7 @@ def handle_batch(args) -> int: "data_dir": str(args.data), }, "suite_case_ids": suite_case_ids, - "planned_case_ids": planned_case_ids, + "selected_case_ids": selected_case_ids, "planned_total": planned_total, "selected_filters": { "include_tags": sorted(include_tags) if include_tags else None, From 941a0015c832c819d8768008cded0b53e9db30d0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 00:15:47 +0300 Subject: [PATCH 50/92] Add case history indexing and tag reports --- examples/demo_qa/batch.py | 284 +++++++++++++++++++++++++++++++++++++- examples/demo_qa/cli.py | 33 +++++ 2 files changed, 313 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 972cac9..24acef0 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -254,6 +254,115 @@ def _write_effective_results(results_path: Path, results: Mapping[str, RunResult write_results(results_path, ordered) +def _append_case_history( + artifacts_dir: Path, + result: RunResult, + *, + run_id: str, + tag: str | None, + note: str | None, + fail_on: str, + require_assert: bool, + scope_hash: str, + cases_hash: str, + git_sha: str | None, + run_dir: Path, + results_path: Path, +) -> None: + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True, exist_ok=True) + payload = { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "run_id": run_id, + "tag": tag, + "note": note, + "status": result.status, + "reason": _reason(result), + "duration_ms": result.duration_ms, + "artifacts_dir": result.artifacts_dir, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "cases_hash": cases_hash, + "git_sha": git_sha, + } + target = history_dir / f"{result.id}.jsonl" + with target.open("a", encoding="utf-8") as f: + f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") + + +def _build_effective_diff( + before: Mapping[str, RunResult], + after: Mapping[str, RunResult], + *, + fail_on: str, + require_assert: bool, + run_id: str, + tag: str, + note: str | None, + run_dir: Path, + results_path: Path, + scope_hash: str, +) -> dict[str, object]: + bad = bad_statuses(fail_on, require_assert) + before_bad = {cid for cid, res in before.items() if res.status in bad} + after_bad = {cid for cid, res in after.items() if res.status in bad} + ids = set(before) | set(after) + regressed: list[dict[str, object]] = [] + fixed: list[dict[str, object]] = [] + changed_bad: list[dict[str, object]] = [] + new_cases: list[dict[str, object]] = [] + other_changed: list[dict[str, object]] = [] + for cid in ids: + prev 
= before.get(cid) + cur = after.get(cid) + prev_status = prev.status if prev else None + cur_status = cur.status if cur else None + if prev is None and cur is not None: + new_cases.append({"id": cid, "to": cur_status}) + continue + if cur is None or prev is None: + continue + if prev_status == cur_status: + continue + entry = {"id": cid, "from": prev_status, "to": cur_status, "reason": _reason(cur)} + was_bad = cid in before_bad + now_bad = cid in after_bad + if not was_bad and now_bad: + regressed.append(entry) + elif was_bad and not now_bad: + fixed.append(entry) + elif was_bad and now_bad: + changed_bad.append(entry) + else: + other_changed.append(entry) + return { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "tag": tag, + "note": note, + "run_id": run_id, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "regressed": sorted(regressed, key=lambda r: r["id"]), + "fixed": sorted(fixed, key=lambda r: r["id"]), + "changed_bad": sorted(changed_bad, key=lambda r: r["id"]), + "changed_other": sorted(other_changed, key=lambda r: r["id"]), + "new_cases": sorted(new_cases, key=lambda r: r["id"]), + } + + +def _append_effective_diff(tag_dir: Path, diff_entry: Mapping[str, object]) -> None: + tag_dir.mkdir(parents=True, exist_ok=True) + changes_path = tag_dir / "effective_changes.jsonl" + with changes_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(diff_entry, ensure_ascii=False, sort_keys=True) + "\n") + + def _update_effective_snapshot( *, artifacts_dir: Path, @@ -265,7 +374,7 @@ def _update_effective_snapshot( run_folder: Path, scope: Mapping[str, object], scope_hash: str, -) -> tuple[Path, Path]: +) -> tuple[Path, Path, dict[str, RunResult], dict[str, RunResult]]: effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: raise ValueError( @@ -282,6 +391,7 @@ def _update_effective_snapshot( else: planned_pool = set(suite_case_ids) + before_effective = dict(effective_results) for res in executed_results: effective_results[res.id] = res _write_effective_results(effective_results_path, effective_results) @@ -309,7 +419,7 @@ def _update_effective_snapshot( } meta_path.parent.mkdir(parents=True, exist_ok=True) dump_json(meta_path, effective_meta_payload) - return effective_results_path, meta_path + return effective_results_path, meta_path, before_effective, effective_results def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: @@ -856,7 +966,7 @@ def handle_batch(args) -> int: effective_meta_path = None if args.tag: try: - effective_path, effective_meta_path = _update_effective_snapshot( + effective_path, effective_meta_path, prev_effective, new_effective = _update_effective_snapshot( artifacts_dir=artifacts_dir, tag=args.tag, cases_hash=cases_hash, @@ -867,12 +977,26 @@ def handle_batch(args) -> int: scope=scope, scope_hash=scope_id, ) + diff_entry = _build_effective_diff( + prev_effective, + new_effective, + fail_on=args.fail_on, + require_assert=args.require_assert, + run_id=run_id, + tag=args.tag, + note=args.note, + run_dir=run_folder, + results_path=results_path, + scope_hash=scope_id, + ) + _append_effective_diff(effective_path.parent, diff_entry) except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) config_hash = _hash_file(args.config) 
if args.config else None schema_hash = _hash_file(args.schema) data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) + git_sha = _git_sha() llm_settings = settings.llm run_meta = { "run_id": run_id, @@ -921,7 +1045,7 @@ def handle_batch(args) -> int: "base_url": llm_settings.base_url or "https://api.openai.com/v1", }, "enable_semantic": args.enable_semantic, - "git_sha": _git_sha(), + "git_sha": git_sha, "results_path": str(results_path), "summary_path": str(summary_path), "run_dir": str(run_folder), @@ -955,9 +1079,27 @@ def handle_batch(args) -> int: "planned_total": planned_total, "executed_total": executed_total, "missed_total": missed_total, + "suite_planned_total": suite_planned_total, + "suite_missed_total": suite_missed_total, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, + "scope_hash": scope_id, } + for res in results: + _append_case_history( + artifacts_dir, + res, + run_id=run_id, + tag=args.tag, + note=args.note, + fail_on=args.fail_on, + require_assert=args.require_assert, + scope_hash=scope_id, + cases_hash=cases_hash, + git_sha=git_sha, + run_dir=run_folder, + results_path=results_path, + ) history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: f.write(json.dumps(history_entry, ensure_ascii=False, sort_keys=True) + "\n") @@ -1169,11 +1311,145 @@ def handle_compare(args) -> int: return 0 +def _load_case_history(path: Path) -> list[dict]: + if not path.exists(): + return [] + entries: list[dict] = [] + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + entries.append(json.loads(line)) + except Exception: + continue + return entries + + +def handle_history_case(args) -> int: + artifacts_dir = args.data / ".runs" + path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" + entries = _load_case_history(path) + if args.tag: + entries = [e for e in entries if e.get("tag") == args.tag] + if not entries: + print(f"No history found for case {args.case_id}.") + return 0 + entries = list(reversed(entries))[: args.limit] + header = ( + f"{'timestamp':<25} {'run_id':<12} {'tag':<15} {'status':<10} " + f"{'reason':<30} {'note':<15} {'run_dir':<30}" + ) + print(header) + for e in entries: + ts = str(e.get("timestamp", ""))[:25] + print( + f"{ts:<25} {str(e.get('run_id','')):<12} {str(e.get('tag','')):<15} " + f"{str(e.get('status','')):<10} {str(e.get('reason','')):<30} {str(e.get('note','')):<15} " + f"{str(e.get('run_dir','')):<30}" + ) + return 0 + + +def _resolve_run_dir_arg(run_arg: Path, artifacts_dir: Path) -> Optional[Path]: + if run_arg.exists(): + return run_arg + candidate = artifacts_dir / "runs" / run_arg + if candidate.exists(): + return candidate + return None + + +def handle_report_run(args) -> int: + artifacts_dir = args.data / ".runs" + run_dir = _resolve_run_dir_arg(args.run, artifacts_dir) + if not run_dir: + print("Run directory not found.", file=sys.stderr) + return 2 + summary_path = run_dir / "summary.json" + if not summary_path.exists(): + print(f"summary.json not found in {run_dir}", file=sys.stderr) + return 2 + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read summary: {exc}", file=sys.stderr) + return 2 + print(f"Run: {run_dir}") + for key in ["run_id", "tag", "note", "exit_code", "interrupted", "interrupted_at_case_id", "results_path"]: + if key in summary: + print(f"{key}: {summary.get(key)}") + counts = 
summary.get("counts") or {} + if counts: + print("Counts:", counts) + return 0 + + +def _load_effective_diff(tag_dir: Path) -> Optional[dict]: + path = tag_dir / "effective_changes.jsonl" + if not path.exists(): + return None + last: Optional[dict] = None + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + last = json.loads(line) + except Exception: + continue + return last + + +def handle_report_tag(args) -> int: + artifacts_dir = args.data / ".runs" + eff_results_path, eff_meta_path = _effective_paths(artifacts_dir, args.tag) + if not eff_results_path.exists() or not eff_meta_path.exists(): + print(f"No effective snapshot found for tag {args.tag!r}.", file=sys.stderr) + return 2 + try: + meta = json.loads(eff_meta_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read effective_meta.json: {exc}", file=sys.stderr) + return 2 + try: + results = load_results(eff_results_path) + except Exception as exc: + print(f"Failed to read effective results: {exc}", file=sys.stderr) + return 2 + counts = meta.get("counts") or summarize(results.values()) + print(f"Tag: {args.tag}") + print(f"Planned: {meta.get('planned_total')} | Executed: {meta.get('executed_total')} | Missed: {meta.get('missed_total')}") + print("Counts:", counts) + bad = bad_statuses("bad", False) + failing = [res for res in results.values() if res.status in bad] + failing = sorted(failing, key=lambda r: r.id)[:10] + if failing: + print("Failing cases (top 10):") + for res in failing: + print(f"- {res.id}: {res.status} ({_reason(res)}) [{res.artifacts_dir}]") + diff_entry = _load_effective_diff(eff_results_path.parent) + if diff_entry: + print("Last effective change:") + for key in ["timestamp", "run_id", "note"]: + if key in diff_entry: + print(f" {key}: {diff_entry.get(key)}") + for label in ["regressed", "fixed", "changed_bad", "new_cases"]: + items = diff_entry.get(label) or [] + print(f" {label}: {len(items)}") + return 0 + + __all__ = [ "handle_batch", "handle_case_open", "handle_case_run", "handle_chat", + "handle_history_case", + "handle_report_run", + "handle_report_tag", "bad_statuses", "is_failure", "write_results", diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 3a431d5..d228a1c 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -146,6 +146,23 @@ def build_parser() -> argparse.ArgumentParser: ) compare_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures when diffing") + history_p = sub.add_parser("history", help="History utilities") + history_sub = history_p.add_subparsers(dest="history_command", required=True) + case_hist = history_sub.add_parser("case", help="Show history for a case id") + case_hist.add_argument("case_id") + case_hist.add_argument("--data", type=Path, required=True, help="Data dir containing .runs") + case_hist.add_argument("--tag", type=str, default=None, help="Filter by tag") + case_hist.add_argument("--limit", type=int, default=20, help="Limit rows") + + report_p = sub.add_parser("report", help="Reports over runs/effective snapshots") + report_sub = report_p.add_subparsers(dest="report_command", required=True) + tag_report = report_sub.add_parser("tag", help="Report current effective snapshot for a tag") + tag_report.add_argument("--data", type=Path, required=True, help="Data dir containing .runs") + tag_report.add_argument("--tag", type=str, required=True, help="Tag to report") + run_report = report_sub.add_parser("run", 
help="Report a specific run folder or run_id") + run_report.add_argument("--data", type=Path, required=True, help="Data dir containing .runs") + run_report.add_argument("--run", type=Path, required=True, help="Run dir or run_id under runs/") + return parser @@ -173,6 +190,22 @@ def main() -> None: code = handle_stats(args) elif args.command == "compare": code = handle_compare(args) + elif args.command == "history": + from .batch import handle_history_case + + if args.history_command == "case": + code = handle_history_case(args) + else: + code = 1 + elif args.command == "report": + from .batch import handle_report_run, handle_report_tag + + if args.report_command == "tag": + code = handle_report_tag(args) + elif args.report_command == "run": + code = handle_report_run(args) + else: + code = 1 else: code = 0 raise SystemExit(code) From 7d7bb86bff33e6701cd44e542b8059c8a6f01d0f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 00:20:37 +0300 Subject: [PATCH 51/92] Clarify only-missed baseline requirement --- examples/demo_qa/batch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 24acef0..dce089c 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -788,7 +788,11 @@ def handle_batch(args) -> int: print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 else: - print("No baseline results found for --only-missed; running all filtered cases.", file=sys.stderr) + print( + "No baseline found for --only-missed. Provide --only-missed-from or run a tagged batch first.", + file=sys.stderr, + ) + return 2 baseline_meta = _load_run_meta(missed_baseline_run) if isinstance(baseline_meta, dict): planned_from_meta = baseline_meta.get("planned_case_ids") From 303a601e36c00110cff24daba4871f12423a42d5 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 01:57:38 +0300 Subject: [PATCH 52/92] Align report policy and clean helper imports --- examples/demo_qa/batch.py | 464 +------------------------- examples/demo_qa/cli.py | 6 +- examples/demo_qa/commands/__init__.py | 6 + examples/demo_qa/commands/history.py | 31 ++ examples/demo_qa/commands/report.py | 99 ++++++ examples/demo_qa/runs/__init__.py | 3 + examples/demo_qa/runs/case_history.py | 77 +++++ examples/demo_qa/runs/coverage.py | 19 ++ examples/demo_qa/runs/effective.py | 203 +++++++++++ examples/demo_qa/runs/io.py | 17 + examples/demo_qa/runs/layout.py | 102 ++++++ examples/demo_qa/runs/scope.py | 30 ++ tests/test_demo_qa_batch.py | 13 +- tests/test_demo_qa_commands.py | 18 + 14 files changed, 627 insertions(+), 461 deletions(-) create mode 100644 examples/demo_qa/commands/__init__.py create mode 100644 examples/demo_qa/commands/history.py create mode 100644 examples/demo_qa/commands/report.py create mode 100644 examples/demo_qa/runs/__init__.py create mode 100644 examples/demo_qa/runs/case_history.py create mode 100644 examples/demo_qa/runs/coverage.py create mode 100644 examples/demo_qa/runs/effective.py create mode 100644 examples/demo_qa/runs/io.py create mode 100644 examples/demo_qa/runs/layout.py create mode 100644 examples/demo_qa/runs/scope.py create mode 100644 tests/test_demo_qa_commands.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index dce089c..8fa3266 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -28,17 
+28,23 @@ save_status, summarize, ) +from .runs.case_history import _append_case_history +from .runs.coverage import _missed_case_ids +from .runs.effective import _append_effective_diff, _build_effective_diff, _load_effective_results, _update_effective_snapshot +from .runs.io import write_results +from .runs.layout import ( + _latest_markers, + _load_latest_results, + _load_latest_run, + _load_run_meta, + _run_dir_from_results_path, + _update_latest_markers, +) +from .runs.scope import _scope_hash, _scope_payload from .settings import load_settings from .utils import dump_json -def write_results(out_path: Path, results: Iterable[RunResult]) -> None: - out_path.parent.mkdir(parents=True, exist_ok=True) - with out_path.open("w", encoding="utf-8") as f: - for res in results: - f.write(json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n") - - def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") dump_json(summary_path, summary) @@ -116,310 +122,6 @@ def _git_sha() -> Optional[str]: return result.stdout.strip() or None -def _sanitize_tag(tag: str) -> str: - cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "_" for ch in tag) - return cleaned or "tag" - - -def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: - base = artifacts_dir / "runs" / "tags" / _sanitize_tag(tag) - return base / "effective_results.jsonl", base / "effective_meta.json" - - -def _scope_payload( - *, - cases_hash: str, - include_tags: set[str] | None, - exclude_tags: set[str] | None, - include_ids: set[str] | None, - exclude_ids: set[str] | None, -) -> dict[str, object]: - return { - "cases_hash": cases_hash, - "include_tags": sorted(include_tags) if include_tags else None, - "exclude_tags": sorted(exclude_tags) if exclude_tags else None, - "include_ids": sorted(include_ids) if include_ids else None, - "exclude_ids": sorted(exclude_ids) if exclude_ids else None, - } - - -def _scope_hash(scope: Mapping[str, object]) -> str: - payload = json.dumps(scope, sort_keys=True, ensure_ascii=False) - return hashlib.sha256(payload.encode("utf-8")).hexdigest() - - -def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: - runs_dir = artifacts_dir / "runs" - if tag: - slug = _sanitize_tag(tag) - return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt" - return runs_dir / "latest.txt", runs_dir / "latest_results.txt" - - -def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - latest_file, _ = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - return None - - -def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - _, latest_file = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - latest_run = _load_latest_run(artifacts_dir, tag) - if latest_run: - summary_path = latest_run / "summary.json" - if summary_path.exists(): - try: - summary = json.loads(summary_path.read_text(encoding="utf-8")) - results_path = summary.get("results_path") - if results_path: - return Path(results_path) - except Exception: - pass - return None - - -def _load_run_meta(run_path: Path | None) -> Optional[dict]: - if run_path is None: - return None - meta_path = run_path / "run_meta.json" - if not 
meta_path.exists(): - return None - try: - return json.loads(meta_path.read_text(encoding="utf-8")) - except Exception: - return None - - -def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: - if results_path is None: - return None - run_dir = results_path.parent - summary_path = run_dir / "summary.json" - if summary_path.exists(): - try: - summary = json.loads(summary_path.read_text(encoding="utf-8")) - run_dir_from_summary = summary.get("run_dir") - if run_dir_from_summary: - return Path(run_dir_from_summary) - except Exception: - pass - return run_dir - - -def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: - planned_set = set(planned_case_ids) - if not executed_results: - return planned_set - try: - executed_ids = set(executed_results.keys()) - except Exception: - executed_ids = set() - return planned_set - executed_ids - - -def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: - marker_pairs = {_latest_markers(artifacts_dir, None)} - if tag: - marker_pairs.add(_latest_markers(artifacts_dir, tag)) - for latest_path, latest_results_path in marker_pairs: - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - latest_results_path.write_text(str(results_path), encoding="utf-8") - - -def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: - results_path, meta_path = _effective_paths(artifacts_dir, tag) - meta: Optional[dict] = None - results: dict[str, RunResult] = {} - if results_path.exists(): - results = load_results(results_path) - if meta_path.exists(): - try: - meta = json.loads(meta_path.read_text(encoding="utf-8")) - except Exception: - meta = None - return results, meta, results_path - - -def _write_effective_results(results_path: Path, results: Mapping[str, RunResult]) -> None: - results_path.parent.mkdir(parents=True, exist_ok=True) - ordered = [results[cid] for cid in sorted(results)] - write_results(results_path, ordered) - - -def _append_case_history( - artifacts_dir: Path, - result: RunResult, - *, - run_id: str, - tag: str | None, - note: str | None, - fail_on: str, - require_assert: bool, - scope_hash: str, - cases_hash: str, - git_sha: str | None, - run_dir: Path, - results_path: Path, -) -> None: - history_dir = artifacts_dir / "runs" / "cases" - history_dir.mkdir(parents=True, exist_ok=True) - payload = { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", - "run_id": run_id, - "tag": tag, - "note": note, - "status": result.status, - "reason": _reason(result), - "duration_ms": result.duration_ms, - "artifacts_dir": result.artifacts_dir, - "run_dir": str(run_dir), - "results_path": str(results_path), - "fail_on": fail_on, - "require_assert": require_assert, - "scope_hash": scope_hash, - "cases_hash": cases_hash, - "git_sha": git_sha, - } - target = history_dir / f"{result.id}.jsonl" - with target.open("a", encoding="utf-8") as f: - f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") - - -def _build_effective_diff( - before: Mapping[str, RunResult], - after: Mapping[str, RunResult], - *, - fail_on: str, - require_assert: bool, - run_id: str, - tag: str, - note: str | None, - run_dir: Path, - results_path: Path, - scope_hash: str, -) -> dict[str, object]: - bad = bad_statuses(fail_on, require_assert) - before_bad = {cid for cid, res in before.items() if res.status in bad} - after_bad 
= {cid for cid, res in after.items() if res.status in bad} - ids = set(before) | set(after) - regressed: list[dict[str, object]] = [] - fixed: list[dict[str, object]] = [] - changed_bad: list[dict[str, object]] = [] - new_cases: list[dict[str, object]] = [] - other_changed: list[dict[str, object]] = [] - for cid in ids: - prev = before.get(cid) - cur = after.get(cid) - prev_status = prev.status if prev else None - cur_status = cur.status if cur else None - if prev is None and cur is not None: - new_cases.append({"id": cid, "to": cur_status}) - continue - if cur is None or prev is None: - continue - if prev_status == cur_status: - continue - entry = {"id": cid, "from": prev_status, "to": cur_status, "reason": _reason(cur)} - was_bad = cid in before_bad - now_bad = cid in after_bad - if not was_bad and now_bad: - regressed.append(entry) - elif was_bad and not now_bad: - fixed.append(entry) - elif was_bad and now_bad: - changed_bad.append(entry) - else: - other_changed.append(entry) - return { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", - "tag": tag, - "note": note, - "run_id": run_id, - "run_dir": str(run_dir), - "results_path": str(results_path), - "fail_on": fail_on, - "require_assert": require_assert, - "scope_hash": scope_hash, - "regressed": sorted(regressed, key=lambda r: r["id"]), - "fixed": sorted(fixed, key=lambda r: r["id"]), - "changed_bad": sorted(changed_bad, key=lambda r: r["id"]), - "changed_other": sorted(other_changed, key=lambda r: r["id"]), - "new_cases": sorted(new_cases, key=lambda r: r["id"]), - } - - -def _append_effective_diff(tag_dir: Path, diff_entry: Mapping[str, object]) -> None: - tag_dir.mkdir(parents=True, exist_ok=True) - changes_path = tag_dir / "effective_changes.jsonl" - with changes_path.open("a", encoding="utf-8") as f: - f.write(json.dumps(diff_entry, ensure_ascii=False, sort_keys=True) + "\n") - - -def _update_effective_snapshot( - *, - artifacts_dir: Path, - tag: str, - cases_hash: str, - cases_path: Path, - suite_case_ids: list[str], - executed_results: list[RunResult], - run_folder: Path, - scope: Mapping[str, object], - scope_hash: str, -) -> tuple[Path, Path, dict[str, RunResult], dict[str, RunResult]]: - effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) - if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: - raise ValueError( - f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." - ) - if effective_meta and effective_meta.get("scope_hash") and effective_meta["scope_hash"] != scope_hash: - raise ValueError( - f"Existing effective results for tag {tag!r} have a different scope; refusing to merge." 
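(Aside for readers of this hunk: `_build_effective_diff`, which is being moved out of `batch.py` here, only compares per-case status strings. A minimal sketch of its regressed/fixed/new bucketing, with plain dicts standing in for `RunResult` objects and an invented bad-status set; the `changed_bad`/`changed_other` buckets are omitted for brevity.)

```python
# Toy sketch of the bucketing in _build_effective_diff. Plain status strings stand in
# for RunResult objects; the statuses and the "bad" set below are invented for illustration.
before = {"case_1": "ok", "case_2": "error", "case_3": "ok"}
after = {"case_1": "mismatch", "case_2": "ok", "case_4": "ok"}
bad = {"error", "mismatch", "unchecked"}

common = before.keys() & after.keys()
regressed = sorted(cid for cid in common if before[cid] not in bad and after[cid] in bad)
fixed = sorted(cid for cid in common if before[cid] in bad and after[cid] not in bad)
new_cases = sorted(after.keys() - before.keys())

print(regressed)   # ['case_1']
print(fixed)       # ['case_2']
print(new_cases)   # ['case_4']
```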
- ) - - planned_pool: set[str] - if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): - planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} - else: - planned_pool = set(suite_case_ids) - - before_effective = dict(effective_results) - for res in executed_results: - effective_results[res.id] = res - _write_effective_results(effective_results_path, effective_results) - - summary_counts = summarize(effective_results.values()) - executed_total = len(effective_results) - missed_total = len(_missed_case_ids(planned_pool, effective_results)) - meta_path = effective_results_path.with_name("effective_meta.json") - built_from = set(effective_meta.get("built_from_runs", [])) if effective_meta else set() - built_from.add(str(run_folder)) - effective_meta_payload = { - "tag": tag, - "cases_hash": cases_hash, - "cases_path": str(cases_path), - "planned_case_ids": sorted(planned_pool), - "planned_total": len(planned_pool), - "executed_total": executed_total, - "missed_total": missed_total, - "counts": summary_counts, - "updated_at": datetime.datetime.utcnow().isoformat() + "Z", - "built_from_runs": sorted(built_from), - "effective_results_path": str(effective_results_path), - "scope": scope, - "scope_hash": scope_hash, - } - meta_path.parent.mkdir(parents=True, exist_ok=True) - dump_json(meta_path, effective_meta_payload) - return effective_results_path, meta_path, before_effective, effective_results def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: @@ -980,6 +682,8 @@ def handle_batch(args) -> int: run_folder=run_folder, scope=scope, scope_hash=scope_id, + fail_on=args.fail_on, + require_assert=args.require_assert, ) diff_entry = _build_effective_diff( prev_effective, @@ -1315,149 +1019,15 @@ def handle_compare(args) -> int: return 0 -def _load_case_history(path: Path) -> list[dict]: - if not path.exists(): - return [] - entries: list[dict] = [] - with path.open("r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - entries.append(json.loads(line)) - except Exception: - continue - return entries - - -def handle_history_case(args) -> int: - artifacts_dir = args.data / ".runs" - path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" - entries = _load_case_history(path) - if args.tag: - entries = [e for e in entries if e.get("tag") == args.tag] - if not entries: - print(f"No history found for case {args.case_id}.") - return 0 - entries = list(reversed(entries))[: args.limit] - header = ( - f"{'timestamp':<25} {'run_id':<12} {'tag':<15} {'status':<10} " - f"{'reason':<30} {'note':<15} {'run_dir':<30}" - ) - print(header) - for e in entries: - ts = str(e.get("timestamp", ""))[:25] - print( - f"{ts:<25} {str(e.get('run_id','')):<12} {str(e.get('tag','')):<15} " - f"{str(e.get('status','')):<10} {str(e.get('reason','')):<30} {str(e.get('note','')):<15} " - f"{str(e.get('run_dir','')):<30}" - ) - return 0 - - -def _resolve_run_dir_arg(run_arg: Path, artifacts_dir: Path) -> Optional[Path]: - if run_arg.exists(): - return run_arg - candidate = artifacts_dir / "runs" / run_arg - if candidate.exists(): - return candidate - return None - - -def handle_report_run(args) -> int: - artifacts_dir = args.data / ".runs" - run_dir = _resolve_run_dir_arg(args.run, artifacts_dir) - if not run_dir: - print("Run directory not found.", file=sys.stderr) - return 2 - summary_path = run_dir / "summary.json" - if not summary_path.exists(): - print(f"summary.json not found in {run_dir}", file=sys.stderr) - 
return 2 - try: - summary = json.loads(summary_path.read_text(encoding="utf-8")) - except Exception as exc: - print(f"Failed to read summary: {exc}", file=sys.stderr) - return 2 - print(f"Run: {run_dir}") - for key in ["run_id", "tag", "note", "exit_code", "interrupted", "interrupted_at_case_id", "results_path"]: - if key in summary: - print(f"{key}: {summary.get(key)}") - counts = summary.get("counts") or {} - if counts: - print("Counts:", counts) - return 0 - - -def _load_effective_diff(tag_dir: Path) -> Optional[dict]: - path = tag_dir / "effective_changes.jsonl" - if not path.exists(): - return None - last: Optional[dict] = None - with path.open("r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - last = json.loads(line) - except Exception: - continue - return last - - -def handle_report_tag(args) -> int: - artifacts_dir = args.data / ".runs" - eff_results_path, eff_meta_path = _effective_paths(artifacts_dir, args.tag) - if not eff_results_path.exists() or not eff_meta_path.exists(): - print(f"No effective snapshot found for tag {args.tag!r}.", file=sys.stderr) - return 2 - try: - meta = json.loads(eff_meta_path.read_text(encoding="utf-8")) - except Exception as exc: - print(f"Failed to read effective_meta.json: {exc}", file=sys.stderr) - return 2 - try: - results = load_results(eff_results_path) - except Exception as exc: - print(f"Failed to read effective results: {exc}", file=sys.stderr) - return 2 - counts = meta.get("counts") or summarize(results.values()) - print(f"Tag: {args.tag}") - print(f"Planned: {meta.get('planned_total')} | Executed: {meta.get('executed_total')} | Missed: {meta.get('missed_total')}") - print("Counts:", counts) - bad = bad_statuses("bad", False) - failing = [res for res in results.values() if res.status in bad] - failing = sorted(failing, key=lambda r: r.id)[:10] - if failing: - print("Failing cases (top 10):") - for res in failing: - print(f"- {res.id}: {res.status} ({_reason(res)}) [{res.artifacts_dir}]") - diff_entry = _load_effective_diff(eff_results_path.parent) - if diff_entry: - print("Last effective change:") - for key in ["timestamp", "run_id", "note"]: - if key in diff_entry: - print(f" {key}: {diff_entry.get(key)}") - for label in ["regressed", "fixed", "changed_bad", "new_cases"]: - items = diff_entry.get(label) or [] - print(f" {label}: {len(items)}") - return 0 - - __all__ = [ "handle_batch", "handle_case_open", "handle_case_run", "handle_chat", - "handle_history_case", - "handle_report_run", - "handle_report_tag", + "handle_stats", + "handle_compare", "bad_statuses", "is_failure", "write_results", "write_summary", - "_load_latest_run", - "_find_case_artifact", ] diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index d228a1c..c8ae01c 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -24,6 +24,8 @@ def ensure_repo_imports() -> None: handle_compare, handle_stats, ) # noqa: E402 +from .commands.history import handle_history_case # noqa: E402 +from .commands.report import handle_report_run, handle_report_tag # noqa: E402 from .data_gen import generate_and_save # noqa: E402 @@ -191,15 +193,11 @@ def main() -> None: elif args.command == "compare": code = handle_compare(args) elif args.command == "history": - from .batch import handle_history_case - if args.history_command == "case": code = handle_history_case(args) else: code = 1 elif args.command == "report": - from .batch import handle_report_run, handle_report_tag - if args.report_command == "tag": code = 
handle_report_tag(args) elif args.report_command == "run": diff --git a/examples/demo_qa/commands/__init__.py b/examples/demo_qa/commands/__init__.py new file mode 100644 index 0000000..4794645 --- /dev/null +++ b/examples/demo_qa/commands/__init__.py @@ -0,0 +1,6 @@ +"""Lightweight command entrypoints for demo QA CLI.""" + +from .history import handle_history_case +from .report import handle_report_run, handle_report_tag + +__all__ = ["handle_history_case", "handle_report_run", "handle_report_tag"] diff --git a/examples/demo_qa/commands/history.py b/examples/demo_qa/commands/history.py new file mode 100644 index 0000000..376f90d --- /dev/null +++ b/examples/demo_qa/commands/history.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from ..runs.case_history import _load_case_history + + +def handle_history_case(args) -> int: + artifacts_dir = args.data / ".runs" + path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" + entries = _load_case_history(path) + if args.tag: + entries = [e for e in entries if e.get("tag") == args.tag] + if not entries: + print(f"No history found for case {args.case_id}.") + return 0 + entries = list(reversed(entries))[: args.limit] + header = ( + f"{'timestamp':<25} {'run_id':<12} {'tag':<15} {'status':<10} " + f"{'reason':<30} {'note':<15} {'run_dir':<30}" + ) + print(header) + for e in entries: + ts = str(e.get("timestamp", ""))[:25] + print( + f"{ts:<25} {str(e.get('run_id','')):<12} {str(e.get('tag','')):<15} " + f"{str(e.get('status','')):<10} {str(e.get('reason','')):<30} {str(e.get('note','')):<15} " + f"{str(e.get('run_dir','')):<30}" + ) + return 0 + + +__all__ = ["handle_history_case"] diff --git a/examples/demo_qa/commands/report.py b/examples/demo_qa/commands/report.py new file mode 100644 index 0000000..c588a31 --- /dev/null +++ b/examples/demo_qa/commands/report.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Optional + +from ..runner import bad_statuses, load_results, summarize +from ..runs.effective import _load_effective_diff +from ..runs.layout import _effective_paths + + +def _resolve_run_dir_arg(run_arg: Path, artifacts_dir: Path) -> Optional[Path]: + if run_arg.exists(): + return run_arg + candidate = artifacts_dir / "runs" / run_arg + if candidate.exists(): + return candidate + return None + + +def handle_report_run(args) -> int: + artifacts_dir = args.data / ".runs" + run_dir = _resolve_run_dir_arg(args.run, artifacts_dir) + if not run_dir: + print("Run directory not found.", file=sys.stderr) + return 2 + summary_path = run_dir / "summary.json" + if not summary_path.exists(): + print(f"summary.json not found in {run_dir}", file=sys.stderr) + return 2 + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read summary: {exc}", file=sys.stderr) + return 2 + print(f"Run: {run_dir}") + for key in ["run_id", "tag", "note", "exit_code", "interrupted", "interrupted_at_case_id", "results_path"]: + if key in summary: + print(f"{key}: {summary.get(key)}") + counts = summary.get("counts") or {} + if counts: + print("Counts:", counts) + return 0 + + +def _reason_text(res) -> str: + if getattr(res, "reason", None): + return res.reason + if getattr(res, "error", None): + return res.error + expected = getattr(res, "expected_check", None) + if expected and getattr(expected, "detail", None): + return expected.detail + return "" + + +def handle_report_tag(args) -> int: + artifacts_dir = args.data 
/ ".runs" + eff_results_path, eff_meta_path = _effective_paths(artifacts_dir, args.tag) + if not eff_results_path.exists() or not eff_meta_path.exists(): + print(f"No effective snapshot found for tag {args.tag!r}.", file=sys.stderr) + return 2 + try: + meta = json.loads(eff_meta_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read effective_meta.json: {exc}", file=sys.stderr) + return 2 + try: + results = load_results(eff_results_path) + except Exception as exc: + print(f"Failed to read effective results: {exc}", file=sys.stderr) + return 2 + counts = meta.get("counts") or summarize(results.values()) + fail_on = meta.get("fail_on", "bad") + require_assert = bool(meta.get("require_assert", False)) + print(f"Tag: {args.tag}") + print(f"Planned: {meta.get('planned_total')} | Executed: {meta.get('executed_total')} | Missed: {meta.get('missed_total')}") + print("Counts:", counts) + bad = bad_statuses(str(fail_on), require_assert) + failing = [res for res in results.values() if res.status in bad] + failing = sorted(failing, key=lambda r: r.id)[:10] + if failing: + print("Failing cases (top 10):") + for res in failing: + print(f"- {res.id}: {res.status} ({_reason_text(res)}) [{res.artifacts_dir}]") + diff_entry = _load_effective_diff(eff_results_path.parent) + if diff_entry: + print("Last effective change:") + for key in ["timestamp", "run_id", "note"]: + if key in diff_entry: + print(f" {key}: {diff_entry.get(key)}") + for label in ["regressed", "fixed", "changed_bad", "new_cases"]: + items = diff_entry.get(label) or [] + print(f" {label}: {len(items)}") + return 0 + + +__all__ = ["handle_report_run", "handle_report_tag", "_resolve_run_dir_arg"] diff --git a/examples/demo_qa/runs/__init__.py b/examples/demo_qa/runs/__init__.py new file mode 100644 index 0000000..1a2dfbb --- /dev/null +++ b/examples/demo_qa/runs/__init__.py @@ -0,0 +1,3 @@ +"""Utilities for managing demo QA run artifacts.""" + +__all__ = [] diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py new file mode 100644 index 0000000..7f14267 --- /dev/null +++ b/examples/demo_qa/runs/case_history.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import datetime +import json +from pathlib import Path +from typing import Optional + +from ..runner import RunResult + + +def _reason_text(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + expected = getattr(res, "expected_check", None) + if expected and getattr(expected, "detail", None): + return expected.detail + return "" + + +def _append_case_history( + artifacts_dir: Path, + result: RunResult, + *, + run_id: str, + tag: str | None, + note: str | None, + fail_on: str, + require_assert: bool, + scope_hash: str, + cases_hash: str, + git_sha: str | None, + run_dir: Path, + results_path: Path, +) -> None: + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True, exist_ok=True) + payload = { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "run_id": run_id, + "tag": tag, + "note": note, + "status": result.status, + "reason": _reason_text(result), + "duration_ms": result.duration_ms, + "artifacts_dir": result.artifacts_dir, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "cases_hash": cases_hash, + "git_sha": git_sha, + } + target = history_dir / f"{result.id}.jsonl" + with target.open("a", encoding="utf-8") as f: + 
f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") + + +def _load_case_history(path: Path) -> list[dict]: + if not path.exists(): + return [] + entries: list[dict] = [] + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + entries.append(json.loads(line)) + except Exception: + continue + return entries + + +__all__ = ["_append_case_history", "_load_case_history"] diff --git a/examples/demo_qa/runs/coverage.py b/examples/demo_qa/runs/coverage.py new file mode 100644 index 0000000..7f21a47 --- /dev/null +++ b/examples/demo_qa/runs/coverage.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from typing import Iterable, Mapping, Optional + +from ..runner import RunResult + + +def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: + planned_set = set(planned_case_ids) + if not executed_results: + return planned_set + try: + executed_ids = set(executed_results.keys()) + except Exception: + executed_ids = set() + return planned_set - executed_ids + + +__all__ = ["_missed_case_ids"] diff --git a/examples/demo_qa/runs/effective.py b/examples/demo_qa/runs/effective.py new file mode 100644 index 0000000..e7c2b3a --- /dev/null +++ b/examples/demo_qa/runs/effective.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import datetime +import json +from pathlib import Path +from typing import Mapping, Optional + +from ..runner import RunResult, bad_statuses, load_results, summarize +from ..utils import dump_json +from .coverage import _missed_case_ids +from .layout import _effective_paths +from .io import write_results + + +def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: + results_path, meta_path = _effective_paths(artifacts_dir, tag) + meta: Optional[dict] = None + results: dict[str, RunResult] = {} + if results_path.exists(): + results = load_results(results_path) + if meta_path.exists(): + try: + meta = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + meta = None + return results, meta, results_path + + +def _write_effective_results(results_path: Path, results: Mapping[str, RunResult]) -> None: + results_path.parent.mkdir(parents=True, exist_ok=True) + ordered = [results[cid] for cid in sorted(results)] + write_results(results_path, ordered) + + +def _reason_text(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + expected = getattr(res, "expected_check", None) + if expected and getattr(expected, "detail", None): + return expected.detail + return "" + + +def _build_effective_diff( + before: Mapping[str, RunResult], + after: Mapping[str, RunResult], + *, + fail_on: str, + require_assert: bool, + run_id: str, + tag: str, + note: str | None, + run_dir: Path, + results_path: Path, + scope_hash: str, +) -> dict[str, object]: + bad = bad_statuses(fail_on, require_assert) + before_bad = {cid for cid, res in before.items() if res.status in bad} + after_bad = {cid for cid, res in after.items() if res.status in bad} + ids = set(before) | set(after) + regressed: list[dict[str, object]] = [] + fixed: list[dict[str, object]] = [] + changed_bad: list[dict[str, object]] = [] + new_cases: list[dict[str, object]] = [] + other_changed: list[dict[str, object]] = [] + for cid in ids: + prev = before.get(cid) + cur = after.get(cid) + prev_status = prev.status if prev else None + cur_status = cur.status if cur else None + if prev 
is None and cur is not None: + new_cases.append({"id": cid, "to": cur_status}) + continue + if cur is None or prev is None: + continue + if prev_status == cur_status: + continue + entry = {"id": cid, "from": prev_status, "to": cur_status, "reason": _reason_text(cur)} + was_bad = cid in before_bad + now_bad = cid in after_bad + if not was_bad and now_bad: + regressed.append(entry) + elif was_bad and not now_bad: + fixed.append(entry) + elif was_bad and now_bad: + changed_bad.append(entry) + else: + other_changed.append(entry) + return { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "tag": tag, + "note": note, + "run_id": run_id, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "regressed": sorted(regressed, key=lambda r: r["id"]), + "fixed": sorted(fixed, key=lambda r: r["id"]), + "changed_bad": sorted(changed_bad, key=lambda r: r["id"]), + "changed_other": sorted(other_changed, key=lambda r: r["id"]), + "new_cases": sorted(new_cases, key=lambda r: r["id"]), + } + + +def _append_effective_diff(tag_dir: Path, diff_entry: Mapping[str, object]) -> None: + tag_dir.mkdir(parents=True, exist_ok=True) + changes_path = tag_dir / "effective_changes.jsonl" + with changes_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(diff_entry, ensure_ascii=False, sort_keys=True) + "\n") + + +def _load_effective_diff(tag_dir: Path) -> Optional[dict]: + path = tag_dir / "effective_changes.jsonl" + if not path.exists(): + return None + last: Optional[dict] = None + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + last = json.loads(line) + except Exception: + continue + return last + + +def _update_effective_snapshot( + *, + artifacts_dir: Path, + tag: str, + cases_hash: str, + cases_path: Path, + suite_case_ids: list[str], + executed_results: list[RunResult], + run_folder: Path, + scope: Mapping[str, object], + scope_hash: str, + fail_on: str, + require_assert: bool, +) -> tuple[Path, Path, dict[str, RunResult], dict[str, RunResult]]: + effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) + if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." + ) + if effective_meta and effective_meta.get("scope_hash") and effective_meta["scope_hash"] != scope_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} have a different scope; refusing to merge." 
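(The cases_hash/scope_hash guard above refuses to merge runs that were produced with different filters. The hash itself mirrors `_scope_payload`/`_scope_hash` from `runs/scope.py` further down in this patch: a SHA-256 over a canonical JSON payload. A standalone sketch of that idea, with made-up filter values:)

```python
import hashlib
import json

# Canonical-JSON hash of a filter scope; same filters -> same hash, so runs merge,
# while any change to the filters yields a different hash and the merge is refused.
def scope_hash(payload: dict) -> str:
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

scope_a = {"cases_hash": "abc123", "include_tags": ["retail"], "exclude_tags": None,
           "include_ids": None, "exclude_ids": None}
scope_b = dict(scope_a, include_tags=["retail", "smoke"])

assert scope_hash(scope_a) == scope_hash(dict(scope_a))  # identical scope merges
assert scope_hash(scope_a) != scope_hash(scope_b)        # different scope is rejected
```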
+ ) + + planned_pool: set[str] + if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): + planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} + else: + planned_pool = set(suite_case_ids) + + before_effective = dict(effective_results) + for res in executed_results: + effective_results[res.id] = res + _write_effective_results(effective_results_path, effective_results) + + summary_counts = summarize(effective_results.values()) + executed_total = len(effective_results) + missed_total = len(_missed_case_ids(planned_pool, effective_results)) + meta_path = effective_results_path.with_name("effective_meta.json") + built_from = set(effective_meta.get("built_from_runs", [])) if effective_meta else set() + built_from.add(str(run_folder)) + effective_meta_payload = { + "tag": tag, + "cases_hash": cases_hash, + "cases_path": str(cases_path), + "planned_case_ids": sorted(planned_pool), + "planned_total": len(planned_pool), + "executed_total": executed_total, + "missed_total": missed_total, + "counts": summary_counts, + "updated_at": datetime.datetime.utcnow().isoformat() + "Z", + "built_from_runs": sorted(built_from), + "effective_results_path": str(effective_results_path), + "scope": scope, + "scope_hash": scope_hash, + "fail_on": fail_on, + "require_assert": require_assert, + } + meta_path.parent.mkdir(parents=True, exist_ok=True) + dump_json(meta_path, effective_meta_payload) + return effective_results_path, meta_path, before_effective, effective_results + + +__all__ = [ + "_append_effective_diff", + "_build_effective_diff", + "_load_effective_results", + "_load_effective_diff", + "_update_effective_snapshot", + "_write_effective_results", +] diff --git a/examples/demo_qa/runs/io.py b/examples/demo_qa/runs/io.py new file mode 100644 index 0000000..54db989 --- /dev/null +++ b/examples/demo_qa/runs/io.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Iterable + +from ..runner import RunResult + + +def write_results(out_path: Path, results: Iterable[RunResult]) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open("w", encoding="utf-8") as f: + for res in results: + f.write(json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n") + + +__all__ = ["write_results"] diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py new file mode 100644 index 0000000..704f643 --- /dev/null +++ b/examples/demo_qa/runs/layout.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Optional + + +def _sanitize_tag(tag: str) -> str: + cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "_" for ch in tag) + return cleaned or "tag" + + +def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: + base = artifacts_dir / "runs" / "tags" / _sanitize_tag(tag) + return base / "effective_results.jsonl", base / "effective_meta.json" + + +def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: + runs_dir = artifacts_dir / "runs" + if tag: + slug = _sanitize_tag(tag) + return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt" + return runs_dir / "latest.txt", runs_dir / "latest_results.txt" + + +def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + latest_file, _ = _latest_markers(artifacts_dir, tag) + if latest_file.exists(): + content = 
latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + +def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + _, latest_file = _latest_markers(artifacts_dir, tag) + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + latest_run = _load_latest_run(artifacts_dir, tag) + if latest_run: + summary_path = latest_run / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + results_path = summary.get("results_path") + if results_path: + return Path(results_path) + except Exception: + pass + return None + + +def _load_run_meta(run_path: Path | None) -> Optional[dict]: + if run_path is None: + return None + meta_path = run_path / "run_meta.json" + if not meta_path.exists(): + return None + try: + return json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return None + + +def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: + if results_path is None: + return None + run_dir = results_path.parent + summary_path = run_dir / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + run_dir_from_summary = summary.get("run_dir") + if run_dir_from_summary: + return Path(run_dir_from_summary) + except Exception: + pass + return run_dir + + +def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: + marker_pairs = {_latest_markers(artifacts_dir, None)} + if tag: + marker_pairs.add(_latest_markers(artifacts_dir, tag)) + for latest_path, latest_results_path in marker_pairs: + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + latest_results_path.write_text(str(results_path), encoding="utf-8") + + +__all__ = [ + "_effective_paths", + "_latest_markers", + "_load_latest_results", + "_load_latest_run", + "_load_run_meta", + "_run_dir_from_results_path", + "_sanitize_tag", + "_update_latest_markers", +] diff --git a/examples/demo_qa/runs/scope.py b/examples/demo_qa/runs/scope.py new file mode 100644 index 0000000..571ace4 --- /dev/null +++ b/examples/demo_qa/runs/scope.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import hashlib +import json +from typing import Mapping, Optional, Set + + +def _scope_payload( + *, + cases_hash: str, + include_tags: Set[str] | None, + exclude_tags: Set[str] | None, + include_ids: Set[str] | None, + exclude_ids: Set[str] | None, +) -> dict[str, object]: + return { + "cases_hash": cases_hash, + "include_tags": sorted(include_tags) if include_tags else None, + "exclude_tags": sorted(exclude_tags) if exclude_tags else None, + "include_ids": sorted(include_ids) if include_ids else None, + "exclude_ids": sorted(exclude_ids) if exclude_ids else None, + } + + +def _scope_hash(scope: Mapping[str, object]) -> str: + payload = json.dumps(scope, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +__all__ = ["_scope_hash", "_scope_payload"] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index e9d92fa..5efb13d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -8,16 +8,9 @@ import pytest -from examples.demo_qa.batch import ( - _fingerprint_dir, - _latest_markers, - _missed_case_ids, - _update_latest_markers, - bad_statuses, - is_failure, - 
render_markdown, - write_results, -) +from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +from examples.demo_qa.runs.coverage import _missed_case_ids +from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers from examples.demo_qa.runner import RunResult, diff_runs diff --git a/tests/test_demo_qa_commands.py b/tests/test_demo_qa_commands.py new file mode 100644 index 0000000..a0afba0 --- /dev/null +++ b/tests/test_demo_qa_commands.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import subprocess +import sys + + +def test_commands_report_import_is_lightweight() -> None: + script = """ +import sys + +import examples.demo_qa.commands.report # noqa: F401 + +heavy = [name for name in sys.modules if name.startswith("examples.demo_qa.llm") or name.startswith("examples.demo_qa.provider")] +if heavy: + raise SystemExit(f"Heads up: heavy deps imported: {heavy}") +""" + result = subprocess.run([sys.executable, "-c", script], capture_output=True, text=True) + assert result.returncode == 0, result.stderr or result.stdout From cf9d33d0b39ead42bfce5e4f34e23ebd5dca0592 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 22 Dec 2025 02:23:41 +0300 Subject: [PATCH 53/92] doc update --- README_demo_qa.md | 89 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index 2acb56f..23cea09 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -55,7 +55,14 @@ python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.j ## Batch -Запустить пакетный прогон вопросов из `cases.jsonl` (по одному JSON на строку, поля `id`, `question`, опционально `expected`/`expected_regex`/`expected_contains` и `skip`): +Запустить пакетный прогон вопросов из файла кейсов (`cases.jsonl` или `cases.json`). + +Поддерживаемые форматы: + +* **JSONL**: по одному JSON-объекту на строку. +* **JSON**: массив объектов. + +Поля кейса: `id`, `question`, опционально `expected`/`expected_regex`/`expected_contains` и `skip`. ```bash python -m examples.demo_qa.cli batch \ @@ -65,12 +72,80 @@ python -m examples.demo_qa.cli batch \ --out results.jsonl ``` -* Артефакты по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). -* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. -* Флаги `--fail-on (error|bad|unchecked|any|skipped)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). -* Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. -* Быстрый фокус на упавших: `--only-failed` возьмёт `runs/latest/results.jsonl`, `--show-artifacts` печатает пути, репро-команды выводятся для каждого FAIL. -* Команды уровня кейса: `demo_qa case run --cases ...` и `demo_qa case open --run runs/latest` для быстрого воспроизведения. +Что сохраняется: + +* Артефакты по кейсам по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). +* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов. 
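For reference, a minimal cases file in the format described above might look like this (the ids, questions and expectations here are invented for illustration):

```jsonl
{"id": "orders_total", "question": "How many orders were placed in total?", "expected_regex": "\\b\\d+\\b"}
{"id": "top_product", "question": "Which product has the highest revenue?", "expected_contains": "Widget"}
{"id": "flaky_case", "question": "What is the average basket size?", "skip": true}
```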
+* Без `--out` результаты складываются в `/.runs/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска, `runs/latest_results.txt` — на путь к results. +* При `Ctrl-C` сохраняются частичные результаты: уже пройденные кейсы попадают в `results.jsonl/summary.json`, а прогон помечается как `interrupted`. + +Ключевые флаги: + +* `--fail-on (error|bad|unchecked|any|skipped)`, `--max-fails`, `--fail-fast`, `--require-assert` — остановка/код выхода (0/1/2) и строгость проверок. +* `--only-failed` / `--only-failed-from PATH` — перепрогон только плохих кейсов (baseline = latest либо явно заданный results). +* `--only-missed` / `--only-missed-from PATH` — “добить” только те кейсы, которые отсутствуют в baseline results (удобно после Ctrl-C). +* `--tag TAG` / `--note "..."` — пометить прогон как часть эксперимента. Для `--tag` поддерживается “effective snapshot”: результаты по тегу накапливаются инкрементально, так что `--only-failed/--only-missed` по тегу корректно работают даже после частичных прогонов. +* `--plan-only` — строить планы без выполнения. + +Команды уровня кейса: + +* `python -m examples.demo_qa.cli case run --cases ...` — прогнать один кейс. +* `python -m examples.demo_qa.cli case open --data ...` — открыть папку артефактов кейса. + +Отчёты и история: + +* `python -m examples.demo_qa.cli stats --data --last 10` — последние прогоны. +* `python -m examples.demo_qa.cli report tag --data --tag ` — сводка по “effective” результатам тега. +* `python -m examples.demo_qa.cli report run --data --run runs/latest` — сводка по конкретному run. +* `python -m examples.demo_qa.cli history case --data [--tag ]` — история по кейсу. + +### Удобные алиасы (bash/zsh) + +Добавьте в `~/.bashrc` или `~/.zshrc` и перезапустите shell. + +```bash +# 1) Настройте свои дефолты под проект/датасет +export DQ_DATA="./_demo_data/shop" +export DQ_SCHEMA="$DQ_DATA/schema.yaml" +export DQ_CASES="./examples/demo_qa/cases/retail_cases.json" +export DQ_OUT="$DQ_DATA/.runs/results.jsonl" +export DQ_TAG="retail-iter1" + +# 2) Базовая команда +dq() { python -m examples.demo_qa.cli "$@"; } + +# 3) Самые частые сценарии +dq-batch() { dq batch --data "$DQ_DATA" --schema "$DQ_SCHEMA" --cases "$DQ_CASES" --out "$DQ_OUT" "$@"; } +dq-failed() { dq-batch --only-failed "$@"; } +dq-missed() { dq-batch --only-missed "$@"; } + +# Tagged (effective) workflow +dq-batch-tag() { dq-batch --tag "$DQ_TAG" "$@"; } +dq-failed-tag() { dq-batch --tag "$DQ_TAG" --only-failed "$@"; } +dq-missed-tag() { dq-batch --tag "$DQ_TAG" --only-missed "$@"; } + +# Отчёты +dq-stats() { dq stats --data "$DQ_DATA" "$@"; } +dq-report() { dq report tag --data "$DQ_DATA" --tag "$DQ_TAG" "$@"; } +dq-run() { dq report run --data "$DQ_DATA" --run "${1:-runs/latest}"; } +dq-hist() { dq history case "$1" --data "$DQ_DATA" --tag "$DQ_TAG" "${@:2}"; } + +# Дебаг кейса +dq-case() { dq case run "$1" --cases "$DQ_CASES" --data "$DQ_DATA" --schema "$DQ_SCHEMA" "${@:2}"; } +dq-open() { dq case open "$1" --data "$DQ_DATA" "${@:2}"; } +``` + +Минимальный набор, если не хочется “тегов”: + +```bash +dq() { python -m examples.demo_qa.cli "$@"; } +dq-batch() { dq batch --data "$DQ_DATA" --schema "$DQ_SCHEMA" --cases "$DQ_CASES" --out "$DQ_OUT" "$@"; } +dq-failed() { dq-batch --only-failed "$@"; } +dq-missed() { dq-batch --only-missed "$@"; } +dq-stats() { dq stats --data "$DQ_DATA" --last 10; } +``` + + ## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и From 9a4a9f3aad1adab9cd35b60ba90c99184d494c8c Mon Sep 17 
00:00:00 2001 From: Alex Date: Mon, 22 Dec 2025 02:48:03 +0300 Subject: [PATCH 54/92] =?UTF-8?q?=D0=B0=D0=BF=D0=B3=D1=80=D0=B5=D0=B9?= =?UTF-8?q?=D0=B4=20=D0=B2=D0=B5=D1=80=D1=81=D0=B8=D0=B8=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D0=BF=D1=80=D0=B0=D0=B2=D0=B8=D0=BB=D1=8C=D0=BD=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D0=BC=D0=B5=D1=80=D0=B4=D0=B6=D0=B0=20=D1=81=20?= =?UTF-8?q?main?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 710ebde..4a9053c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "fetchgraph" -version = "0.1.2" +version = "0.1.3" description = "Graph-like planning → context fetching → synthesis agent (library-style)." readme = "README.md" requires-python = ">=3.11" From aa62371a19e1e30174176f2e83b5e246c1358797 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 25 Dec 2025 20:36:33 +0300 Subject: [PATCH 55/92] =?UTF-8?q?demo=5Faq=20(make)=20-=20=D0=B0=D0=BB?= =?UTF-8?q?=D0=B8=D0=B0=D1=81=D1=8B=20=D0=B4=D0=BB=D1=8F=20=D0=BA=D0=BE?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0491f57 --- /dev/null +++ b/Makefile @@ -0,0 +1,299 @@ +# Makefile — алиасы для examples.demo_qa (без ~/.bashrc / ~/.zshrc) +# +# Быстрый старт: +# make init +# make chat +# make batch +# make help +# +# Примечание про venv: +# - Makefile НЕ "активирует" venv в текущем терминале (это невозможно из make). +# - Но он автоматически использует .venv/bin/python, если он существует. 
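(For orientation, a typical local loop with the targets defined below might look as follows; the tag value is invented and the target names are the ones documented in this Makefile's help text.)

```bash
make init                              # one-time: write .demo_qa.mk with DATA/SCHEMA/CASES
make batch-tag TAG=retail-iter1 NOTE="baseline run"
make batch-failed                      # re-run only the failing cases from the latest run
make report-tag TAG=retail-iter1       # summary over the tag's effective snapshot
```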
+ +SHELL := /bin/bash + +# ============================================================================== +# 1) Локальный конфиг (не коммитить; удобно добавить в .gitignore) +# ============================================================================== +CONFIG ?= .demo_qa.mk +-include $(CONFIG) + +# ============================================================================== +# 2) Значения по умолчанию (для make init) +# ============================================================================== +DEFAULT_DATA := _demo_data/shop +DEFAULT_SCHEMA := _demo_data/shop/schema.yaml +DEFAULT_CASES := examples/demo_qa/cases/retail_cases.json + +# ============================================================================== +# 3) Python / CLI +# ============================================================================== +VENV ?= .venv +PYTHON ?= $(if $(wildcard $(VENV)/bin/python),$(VENV)/bin/python,python) +CLI := $(PYTHON) -m examples.demo_qa.cli + +# ============================================================================== +# 4) Пути demo_qa (можно переопределять через CLI или в $(CONFIG)) +# ============================================================================== +DATA ?= +SCHEMA ?= +CASES ?= +OUT ?= $(DATA)/.runs/results.jsonl + +# ============================================================================== +# 5) Параметры команд +# ============================================================================== +TAG ?= +NOTE ?= +CASE ?= +LIMIT ?= 50 + +ONLY_FAILED_FROM ?= +ONLY_MISSED_FROM ?= + +BASE ?= +NEW ?= +DIFF_OUT ?= $(DATA)/.runs/diff.md +JUNIT ?= $(DATA)/.runs/diff.junit.xml + +MAX_FAILS ?= 5 + +# ============================================================================== +# 6) Настройки LLM-конфига (редактирование/просмотр) +# ============================================================================== +# Если у тебя конфиг лежит иначе — переопредели: +# make llm-edit LLM_TOML=path/to/demo_qa.toml +LLM_TOML ?= demo_qa.toml +LLM_TOML_EXAMPLE ?= demo_qa.toml.example + +# macOS: открываем в TextEdit +OPEN ?= open +EDITOR_APP ?= TextEdit + +# ============================================================================== +# 7) Вспомогательные флаги (не передавать пустые) +# ============================================================================== +TAG_FLAG := $(if $(strip $(TAG)),--tag "$(TAG)",) +NOTE_FLAG := $(if $(strip $(NOTE)),--note "$(NOTE)",) +LIMIT_FLAG := $(if $(strip $(LIMIT)),--limit $(LIMIT),) + +# ============================================================================== +# 8) PHONY +# ============================================================================== +.PHONY: help init show-config check ensure-runs-dir venv-check \ + llm-init llm-show llm-edit \ + chat \ + batch batch-tag batch-failed batch-failed-from \ + batch-missed batch-missed-from batch-fail-fast batch-max-fails \ + stats history-case report-tag case-run case-open compare + +# ============================================================================== +# help (на русском) +# ============================================================================== +help: + @echo "" + @echo "DemoQA: Makefile-алиасы (без ~/.bashrc или ~/.zshrc)" + @echo "===================================================" + @echo "" + @echo "Быстрый старт:" + @echo " make init" + @echo " make chat" + @echo "" + @echo "Конфигурация:" + @echo " Настройки хранятся в: $(CONFIG)" + @echo " Можно переопределять переменные так:" + @echo " make chat DATA=_demo_data/shop SCHEMA=_demo_data/shop/schema.yaml" 
+ @echo "" + @echo "Основные переменные:" + @echo " DATA - путь к датасету (например: _demo_data/shop)" + @echo " SCHEMA - путь к schema.yaml" + @echo " CASES - путь к cases.json" + @echo " OUT - куда писать results.jsonl (по умолчанию: \$$DATA/.runs/results.jsonl)" + @echo "" + @echo "Команды:" + @echo " make chat - интерактивный чат" + @echo " make batch - полный прогон всего набора" + @echo " make batch-tag TAG=... NOTE='...' - полный прогон с тегом и заметкой" + @echo " make batch-failed - перепрогон только упавших (baseline = latest)" + @echo " make batch-failed-from ONLY_FAILED_FROM=path/results.jsonl - only-failed от явного baseline" + @echo " make batch-missed [TAG=...] - добить missed (если TAG задан — относительно effective по тегу)" + @echo " make batch-missed-from ONLY_MISSED_FROM=path/results.jsonl - добить missed от явного baseline" + @echo " make batch-fail-fast - быстрый smoke (остановиться на первом фейле)" + @echo " make batch-max-fails MAX_FAILS=5 - остановиться после N фейлов" + @echo " make stats - stats по последним 10 прогонов" + @echo "" + @echo "Диагностика / анализ:" + @echo " make history-case CASE=case_42 [TAG=...] [LIMIT=50] - история по кейсу" + @echo " make report-tag TAG=... - сводка по тегу (effective snapshot)" + @echo " make case-run CASE=case_42 - прогнать один кейс" + @echo " make case-open CASE=case_42 - открыть артефакты кейса" + @echo "" + @echo "Сравнение результатов:" + @echo " make compare BASE=... NEW=... [DIFF_OUT=...] [JUNIT=...]" + @echo "" + @echo "LLM конфиг:" + @echo " make llm-init - создать $(LLM_TOML) из $(LLM_TOML_EXAMPLE)" + @echo " make llm-show - показать первые ~200 строк $(LLM_TOML)" + @echo " make llm-edit - открыть $(LLM_TOML) в TextEdit (macOS)" + @echo "" + @echo "Сервисные:" + @echo " make venv-check - показать, какой python будет использоваться" + @echo " make show-config - показать текущие значения переменных" + @echo "" + +# ============================================================================== +# Конфиг проекта +# ============================================================================== +init: + @set -euo pipefail; \ + if [ -f "$(CONFIG)" ] && [ "$${FORCE:-0}" != "1" ]; then \ + echo "Файл $(CONFIG) уже существует. Чтобы перезаписать: FORCE=1 make init"; \ + exit 1; \ + fi; \ + DATA="$${DATA:-$(DEFAULT_DATA)}"; \ + SCHEMA="$${SCHEMA:-$(DEFAULT_SCHEMA)}"; \ + CASES="$${CASES:-$(DEFAULT_CASES)}"; \ + mkdir -p "$$DATA/.runs"; \ + { \ + echo "# Локальные настройки demo_qa (генерируется командой: make init)"; \ + echo "# Можно редактировать руками. 
Рекомендуется добавить в .gitignore."; \ + echo "DATA=$$DATA"; \ + echo "SCHEMA=$$SCHEMA"; \ + echo "CASES=$$CASES"; \ + echo "# OUT можно не задавать: по умолчанию OUT=\$${DATA}/.runs/results.jsonl"; \ + echo "# OUT=$$DATA/.runs/results.jsonl"; \ + } > "$(CONFIG)"; \ + echo "Ок: создан $(CONFIG)"; \ + echo "Создана папка: $$DATA/.runs"; \ + echo "Дальше: make chat / make batch / make help" + +show-config: + @echo "CONFIG = $(CONFIG)" + @echo "VENV = $(VENV)" + @echo "PYTHON = $(PYTHON)" + @echo "DATA = $(DATA)" + @echo "SCHEMA = $(SCHEMA)" + @echo "CASES = $(CASES)" + @echo "OUT = $(OUT)" + @echo "LLM_TOML= $(LLM_TOML)" + @echo "TAG = $(TAG)" + @echo "NOTE = $(NOTE)" + @echo "CASE = $(CASE)" + @echo "LIMIT = $(LIMIT)" + +venv-check: + @if [ -x "$(VENV)/bin/python" ]; then \ + echo "OK: venv найден: $(VENV) (использую $(VENV)/bin/python)"; \ + else \ + echo "INFO: venv не найден: $(VENV) (использую системный python: $$(command -v $(PYTHON) || echo 'python'))"; \ + fi + +check: + @test -n "$(strip $(DATA))" || (echo "DATA не задан. Запусти: make init (или передай DATA=...)" && exit 1) + @test -n "$(strip $(SCHEMA))" || (echo "SCHEMA не задан. Запусти: make init (или передай SCHEMA=...)" && exit 1) + @test -n "$(strip $(CASES))" || (echo "CASES не задан. Запусти: make init (или передай CASES=...)" && exit 1) + +ensure-runs-dir: check + @mkdir -p "$(DATA)/.runs" + +# ============================================================================== +# LLM конфиг (без проверок доступности — это задача приложения) +# ============================================================================== +llm-init: + @set -euo pipefail; \ + if [ -f "$(LLM_TOML)" ]; then \ + echo "Файл уже существует: $(LLM_TOML)"; \ + exit 0; \ + fi; \ + if [ -f "$(LLM_TOML_EXAMPLE)" ]; then \ + cp "$(LLM_TOML_EXAMPLE)" "$(LLM_TOML)"; \ + echo "Ок: создан $(LLM_TOML) из $(LLM_TOML_EXAMPLE)"; \ + else \ + echo "Не найден пример: $(LLM_TOML_EXAMPLE). Создай $(LLM_TOML) вручную."; \ + exit 1; \ + fi + +llm-show: + @echo "LLM config: $(LLM_TOML)" + @echo "----------------------------------------" + @sed -n '1,200p' "$(LLM_TOML)" 2>/dev/null || (echo "Файл не найден: $(LLM_TOML). Сделай: make llm-init" && exit 1) + +llm-edit: + @$(OPEN) -a "$(EDITOR_APP)" "$(LLM_TOML)" + +# ============================================================================== +# Алиасы под команды CLI +# ============================================================================== +chat: check + @$(CLI) chat --data "$(DATA)" --schema "$(SCHEMA)" + +# 1) Полный прогон всего набора +batch: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" + +# 2) Полный прогон с тегом + заметка +batch-tag: ensure-runs-dir + @test -n "$(strip $(TAG))" || (echo "TAG обязателен: make batch-tag TAG=..." 
&& exit 1) + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" $(TAG_FLAG) $(NOTE_FLAG) + +# 3) only-failed от latest +batch-failed: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" --only-failed + +# 4) only-failed от явного baseline +batch-failed-from: ensure-runs-dir + @test -n "$(strip $(ONLY_FAILED_FROM))" || (echo "Нужно задать ONLY_FAILED_FROM=.../results.jsonl" && exit 1) + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" \ + --only-failed-from "$(ONLY_FAILED_FROM)" + +# 5) only-missed (relative to effective по TAG или latest) +batch-missed: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" \ + $(TAG_FLAG) --only-missed + +# 6) only-missed от явного baseline +batch-missed-from: ensure-runs-dir + @test -n "$(strip $(ONLY_MISSED_FROM))" || (echo "Нужно задать ONLY_MISSED_FROM=.../results.jsonl" && exit 1) + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" \ + --only-missed --only-missed-from "$(ONLY_MISSED_FROM)" + +# 7) fail-fast / max-fails +batch-fail-fast: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" --fail-fast + +batch-max-fails: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" --max-fails "$(MAX_FAILS)" + +# stats (последние 10) +stats: check + @$(CLI) stats --data "$(DATA)" --last 10 + +# 8) История по кейсу (TAG опционален) +history-case: check + @test -n "$(strip $(CASE))" || (echo "Нужно задать CASE=case_42" && exit 1) + @$(CLI) history case "$(CASE)" --data "$(DATA)" $(TAG_FLAG) $(LIMIT_FLAG) + +# 9) Сводка по тегу +report-tag: check + @test -n "$(strip $(TAG))" || (echo "TAG обязателен: make report-tag TAG=..." 
&& exit 1) + @$(CLI) report tag --data "$(DATA)" --tag "$(TAG)" + +# 10) Дебаг 1 кейса +case-run: check + @test -n "$(strip $(CASE))" || (echo "Нужно задать CASE=case_42" && exit 1) + @$(CLI) case run "$(CASE)" --cases "$(CASES)" --data "$(DATA)" --schema "$(SCHEMA)" + +case-open: check + @test -n "$(strip $(CASE))" || (echo "Нужно задать CASE=case_42" && exit 1) + @$(CLI) case open "$(CASE)" --data "$(DATA)" + +# compare (diff.md + junit) +compare: check + @test -n "$(strip $(BASE))" || (echo "Нужно задать BASE=.../results_prev.jsonl" && exit 1) + @test -n "$(strip $(NEW))" || (echo "Нужно задать NEW=.../results.jsonl" && exit 1) + @mkdir -p "$(DATA)/.runs" + @$(CLI) compare \ + --base "$(BASE)" \ + --new "$(NEW)" \ + --out "$(DIFF_OUT)" \ + --junit "$(JUNIT)" From daae6a3dd93b9214b5473020556d74d9f4e903ff Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Fri, 26 Dec 2025 23:25:02 +0300 Subject: [PATCH 56/92] Handle JSON array case files in demo QA runner --- examples/demo_qa/runner.py | 96 ++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index f0575dd..151c4df 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -399,46 +399,64 @@ def load_cases(path: Path) -> List[Case]: raise FileNotFoundError(f"Cases file not found: {path}") cases: List[Case] = [] seen_ids: set[str] = set() - with path.open("r", encoding="utf-8") as f: - for lineno, line in enumerate(f, start=1): - line = line.strip() - if not line: - continue + text = path.read_text(encoding="utf-8") + stripped = text.lstrip() + + def add_case(payload: Mapping[str, object], location: str) -> None: + if not isinstance(payload, Mapping): + raise ValueError(f"Case on {location} must be an object") + if "id" not in payload or "question" not in payload: + raise ValueError(f"Case on {location} missing required fields 'id' and 'question'") + case_id = str(payload["id"]) + if case_id in seen_ids: + raise ValueError(f"Duplicate case id {case_id!r} on {location}") + seen_ids.add(case_id) + expected = payload.get("expected") + expected_regex = payload.get("expected_regex") + expected_contains = payload.get("expected_contains") + for field_name, val in [ + ("expected", expected), + ("expected_regex", expected_regex), + ("expected_contains", expected_contains), + ]: + if val is not None and str(val).strip() == "": + raise ValueError(f"{field_name} must not be empty on {location}") + if expected_regex is not None: try: - payload = json.loads(line) - except json.JSONDecodeError as exc: - raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc - if "id" not in payload or "question" not in payload: - raise ValueError(f"Case on line {lineno} missing required fields 'id' and 'question'") - case_id = str(payload["id"]) - if case_id in seen_ids: - raise ValueError(f"Duplicate case id {case_id!r} on line {lineno}") - seen_ids.add(case_id) - expected = payload.get("expected") - expected_regex = payload.get("expected_regex") - expected_contains = payload.get("expected_contains") - for field_name, val in [ - ("expected", expected), - ("expected_regex", expected_regex), - ("expected_contains", expected_contains), - ]: - if val is not None and str(val).strip() == "": - raise ValueError(f"{field_name} must not be empty on line {lineno}") - if expected_regex is not None: - try: - re.compile(expected_regex) - except re.error as exc: - raise ValueError(f"Invalid 
expected_regex on line {lineno}: {exc}") from exc - case = Case( - id=case_id, - question=str(payload["question"]), - expected=expected, - expected_regex=expected_regex, - expected_contains=expected_contains, - tags=list(payload.get("tags", []) or []), - skip=bool(payload.get("skip", False)), - ) - cases.append(case) + re.compile(expected_regex) + except re.error as exc: + raise ValueError(f"Invalid expected_regex on {location}: {exc}") from exc + case = Case( + id=case_id, + question=str(payload["question"]), + expected=expected, + expected_regex=expected_regex, + expected_contains=expected_contains, + tags=list(payload.get("tags", []) or []), + skip=bool(payload.get("skip", False)), + ) + cases.append(case) + + if stripped.startswith("["): + try: + payloads = json.loads(text) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON array: {exc}") from exc + if not isinstance(payloads, list): + raise ValueError("Cases JSON must be an array of objects") + for index, payload in enumerate(payloads, start=1): + add_case(payload, f"array index {index}") + return cases + + for lineno, line in enumerate(text.splitlines(), start=1): + line = line.strip() + if not line: + continue + try: + payload = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc + add_case(payload, f"line {lineno}") return cases From 14d4e183b26d0e277e83d9d51331ccfc1f3f8db1 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 26 Dec 2025 23:56:48 +0300 Subject: [PATCH 57/92] =?UTF-8?q?llm(fix)=20=D1=84=D0=B8=D0=BA=D1=81=D0=B8?= =?UTF-8?q?=D0=BC=20=D1=81=D0=BA=D1=80=D1=8B=D1=82=D1=8B=D0=B9=20=D0=B1?= =?UTF-8?q?=D0=B0=D0=B3=20-=20=D0=B5=D1=81=D0=BB=D0=B8=20=D1=84=D0=B0?= =?UTF-8?q?=D0=B9=D0=BB=20=D0=BE=D1=82=D1=81=D1=83=D1=82=D1=81=D1=82=D0=B2?= =?UTF-8?q?=D1=83=D0=B5=D1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/batch.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 8fa3266..91ab75e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -7,7 +7,7 @@ import sys import uuid from pathlib import Path -from typing import Iterable, Mapping, Optional +from typing import Mapping, Optional from .llm.factory import build_llm from .logging_config import configure_logging @@ -30,10 +30,14 @@ ) from .runs.case_history import _append_case_history from .runs.coverage import _missed_case_ids -from .runs.effective import _append_effective_diff, _build_effective_diff, _load_effective_results, _update_effective_snapshot +from .runs.effective import ( + _append_effective_diff, + _build_effective_diff, + _load_effective_results, + _update_effective_snapshot, +) from .runs.io import write_results from .runs.layout import ( - _latest_markers, _load_latest_results, _load_latest_run, _load_run_meta, @@ -319,7 +323,6 @@ def handle_batch(args) -> int: run_id = uuid.uuid4().hex[:8] interrupted = False interrupted_at_case_id: str | None = None - cases_hash = _hash_file(args.cases) try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -328,6 +331,7 @@ def handle_batch(args) -> int: return 2 try: cases = load_cases(args.cases) + cases_hash = _hash_file(args.cases) except Exception as exc: print(f"Cases error: {exc}", file=sys.stderr) return 2 From d425cc5506d00cb2d4b10d92b7b76b2b81feec2f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko 
<74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:07:54 +0300 Subject: [PATCH 58/92] Improve demo QA typing and timestamps --- examples/demo_qa/batch.py | 178 +++++++++++++++----------- examples/demo_qa/data_gen.py | 5 +- examples/demo_qa/runner.py | 24 +++- examples/demo_qa/runs/case_history.py | 2 +- examples/demo_qa/runs/effective.py | 4 +- 5 files changed, 128 insertions(+), 85 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 8fa3266..6a19f71 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -14,6 +14,7 @@ from .provider_factory import build_provider from .runner import ( Case, + DiffReport, EventLogger, RunResult, RunTimings, @@ -51,13 +52,40 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path +def _coerce_number(value: object | None) -> float | None: + if isinstance(value, bool): + return float(value) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value) + except ValueError: + return None + return None + + +def _coerce_int(value: object | None) -> int: + number = _coerce_number(value) + if number is None: + return 0 + return int(number) + + +def _isoformat_utc(dt: datetime.datetime) -> str: + if dt.tzinfo is None: + dt = dt.replace(tzinfo=datetime.timezone.utc) + return dt.astimezone(datetime.timezone.utc).isoformat().replace("+00:00", "Z") + + def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: - total = int(counts.get("total", 0) or 0) - skipped = int(counts.get("skipped", 0) or 0) + total = _coerce_int(counts.get("total")) + skipped = _coerce_int(counts.get("skipped")) denom = total - skipped if denom <= 0: return None - return (counts.get("ok", 0) or 0) / denom + ok = _coerce_number(counts.get("ok")) + return None if ok is None else ok / denom def _hash_file(path: Path) -> str: @@ -185,34 +213,28 @@ def handle_chat(args) -> int: return 0 -def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_assert: bool) -> dict[str, object]: +def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_assert: bool) -> DiffReport: base = load_results(base_path) new = load_results(new_path) return diff_runs(base.values(), new.values(), fail_on=fail_on, require_assert=require_assert) -def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str: +def render_markdown(compare: DiffReport, out_path: Optional[Path]) -> str: lines: list[str] = [] - base_counts = compare["base_counts"] # type: ignore[index] - new_counts = compare["new_counts"] # type: ignore[index] - fail_on = compare.get("fail_on", "bad") # type: ignore[assignment] - require_assert = bool(compare.get("require_assert", False)) - - def _bad_total(counts: dict) -> int: - bad_from_compare = compare.get("base_bad_total") if counts is base_counts else compare.get("new_bad_total") - if isinstance(bad_from_compare, int): - return bad_from_compare - bad_set = bad_statuses(str(fail_on), require_assert) + base_counts = compare["base_counts"] + new_counts = compare["new_counts"] + fail_on = compare.get("fail_on", "bad") + require_assert = compare.get("require_assert", False) + + def _bad_total(counts: Mapping[str, object], *, fallback: int) -> int: + bad_set = bad_statuses(str(fail_on), bool(require_assert)) total = 0 for status in bad_set: - try: - total += int(counts.get(status, 0) or 0) - except Exception: - continue - return total + total += _coerce_int(counts.get(status)) + return total or 
fallback - base_bad = _bad_total(base_counts) # type: ignore[arg-type] - new_bad = _bad_total(new_counts) # type: ignore[arg-type] + base_bad = _bad_total(base_counts, fallback=compare.get("base_bad_total", 0)) + new_bad = _bad_total(new_counts, fallback=compare.get("new_bad_total", 0)) lines.append("# Batch comparison report") lines.append("") lines.append("## Summary") @@ -224,7 +246,7 @@ def _bad_total(counts: dict) -> int: lines.append(f"- Median total time: base {base_med:.2f}s → new {new_med:.2f}s (Δ {new_med - base_med:+.2f}s)") lines.append("") - def table(title: str, rows: list[dict]) -> None: + def table(title: str, rows: list[Mapping[str, object]]) -> None: lines.append(f"## {title}") if not rows: lines.append("None") @@ -233,7 +255,8 @@ def table(title: str, rows: list[dict]) -> None: lines.append("| id | status | reason | artifacts |") lines.append("|---|---|---|---|") for row in sorted(rows, key=lambda r: r.get("id", "")): - artifacts = row.get("artifacts", {}) + artifacts_val = row.get("artifacts", {}) + artifacts = artifacts_val if isinstance(artifacts_val, Mapping) else {} links = ", ".join(f"[{k}]({v})" for k, v in sorted(artifacts.items())) lines.append( f"| {row['id']} | {row['from']} → {row['to']} | {row.get('reason','')} | {links or ''} |" @@ -250,13 +273,13 @@ def table(title: str, rows: list[dict]) -> None: return content -def write_junit(compare: dict[str, object], out_path: Path) -> None: +def write_junit(compare: DiffReport, out_path: Path) -> None: import xml.etree.ElementTree as ET suite = ET.Element("testsuite", name="demo_qa_compare") - bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] - fixed = compare["fixed"] # type: ignore[assignment] - all_ids_list = list(compare.get("all_ids", []) or []) # type: ignore[arg-type] + bad = compare["new_fail"] + compare["still_fail"] + fixed = compare["fixed"] + all_ids_list = list(compare.get("all_ids", []) or []) all_ids = sorted(all_ids_list) cases_total = len(all_ids) suite.set("tests", str(cases_total)) @@ -267,7 +290,8 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: tc = ET.SubElement(suite, "testcase", name=row["id"]) msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) - artifacts = row.get("artifacts", {}) + artifacts_val = row.get("artifacts", {}) + artifacts = artifacts_val if isinstance(artifacts_val, Mapping) else {} if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) @@ -315,19 +339,23 @@ def _select_cases_for_rerun( def handle_batch(args) -> int: - started_at = datetime.datetime.utcnow() + started_at = datetime.datetime.now(datetime.timezone.utc) run_id = uuid.uuid4().hex[:8] interrupted = False interrupted_at_case_id: str | None = None - cases_hash = _hash_file(args.cases) + data_dir = Path(args.data) + schema_path = Path(args.schema) + cases_path = Path(args.cases) + config_path = Path(args.config) if args.config else None + cases_hash = _hash_file(cases_path) try: - settings = load_settings(config_path=args.config, data_dir=args.data) + settings = load_settings(config_path=config_path, data_dir=data_dir) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 try: - cases = load_cases(args.cases) + cases = load_cases(cases_path) except Exception as exc: print(f"Cases error: {exc}", file=sys.stderr) return 2 @@ -335,9 +363,7 @@ def handle_batch(args) -> int: baseline_for_filter: Optional[Mapping[str, RunResult]] = None 
baseline_for_compare: Optional[Mapping[str, RunResult]] = None - artifacts_dir = args.artifacts_dir - if artifacts_dir is None: - artifacts_dir = args.data / ".runs" + artifacts_dir = Path(args.artifacts_dir) if args.artifacts_dir else data_dir / ".runs" include_tags = _split_csv(args.include_tags) exclude_tags = _split_csv(args.exclude_tags) @@ -352,9 +378,8 @@ def handle_batch(args) -> int: ) scope_id = _scope_hash(scope) - baseline_filter_path = args.only_failed_from + baseline_filter_path = Path(args.only_failed_from) if args.only_failed_from else None only_failed_baseline_kind: str | None = None - effective_results_path: Path | None = None if args.only_failed_from: only_failed_baseline_kind = "path" elif args.tag and args.only_failed: @@ -373,7 +398,6 @@ def handle_batch(args) -> int: return 2 baseline_for_filter = effective_results baseline_filter_path = eff_path - effective_results_path = eff_path only_failed_baseline_kind = "effective" elif args.only_failed: latest_results = _load_latest_results(artifacts_dir, args.tag) @@ -397,7 +421,7 @@ def handle_batch(args) -> int: print("No baseline found for --only-failed.", file=sys.stderr) return 2 - compare_path = args.compare_to + compare_path: Path | None = Path(args.compare_to) if args.compare_to else None if compare_path is None and args.only_failed and baseline_filter_path: compare_path = baseline_filter_path if compare_path: @@ -520,15 +544,15 @@ def handle_batch(args) -> int: print("0 missed cases selected.", file=sys.stderr) timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" - results_path = args.out or (run_folder / "results.jsonl") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{cases_path.stem}" + results_path = Path(args.out) if args.out else (run_folder / "results.jsonl") artifacts_root = run_folder / "cases" results_path.parent.mkdir(parents=True, exist_ok=True) summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) - history_path = args.history or (args.data / ".runs" / "history.jsonl") + history_path = Path(args.history) if args.history else (data_dir / ".runs" / "history.jsonl") - log_dir = args.log_dir or args.data / ".runs" / "logs" + log_dir = Path(args.log_dir) if args.log_dir else data_dir / ".runs" / "logs" configure_logging( level=args.log_level, log_dir=log_dir, @@ -537,7 +561,7 @@ def handle_batch(args) -> int: run_id=None, ) - provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + provider, _ = build_provider(data_dir, schema_path, enable_semantic=args.enable_semantic) llm = build_llm(settings) runner = build_agent(llm, provider) events_path = None @@ -595,10 +619,10 @@ def handle_batch(args) -> int: write_results(results_path, results) counts = summarize(results) - diff_block: dict | None = None + diff_block: DiffReport | None = None baseline_path: Path | None = None if baseline_for_compare: - baseline_path = args.compare_to or baseline_filter_path + baseline_path = compare_path or baseline_filter_path diff = diff_runs( baseline_for_compare.values(), results, @@ -610,10 +634,10 @@ def handle_batch(args) -> int: diff_block = diff policy_bad = bad_statuses(args.fail_on, args.require_assert) - bad_count = sum(int(counts.get(status, 0) or 0) for status in policy_bad) + bad_count = sum(_coerce_int(counts.get(status)) for status in policy_bad) exit_code = 130 if interrupted else (1 if bad_count else 0) - ended_at = datetime.datetime.utcnow() + 
ended_at = datetime.datetime.now(datetime.timezone.utc) duration_ms = int((ended_at - started_at).total_seconds() * 1000) executed_results = {res.id: res for res in results} planned_total = len(selected_case_ids) @@ -623,8 +647,8 @@ def handle_batch(args) -> int: suite_missed_total = len(_missed_case_ids(suite_case_ids, executed_results)) summary = { "run_id": run_id, - "started_at": started_at.isoformat() + "Z", - "ended_at": ended_at.isoformat() + "Z", + "started_at": _isoformat_utc(started_at), + "ended_at": _isoformat_utc(ended_at), "duration_ms": duration_ms, "counts": counts, "summary_by_tag": counts.get("summary_by_tag"), @@ -676,7 +700,7 @@ def handle_batch(args) -> int: artifacts_dir=artifacts_dir, tag=args.tag, cases_hash=cases_hash, - cases_path=args.cases, + cases_path=cases_path, suite_case_ids=suite_case_ids, executed_results=results, run_folder=run_folder, @@ -701,24 +725,24 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) - config_hash = _hash_file(args.config) if args.config else None - schema_hash = _hash_file(args.schema) - data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) + config_hash = _hash_file(config_path) if config_path else None + schema_hash = _hash_file(schema_path) + data_fingerprint = _fingerprint_dir(data_dir, verbose=args.fingerprint_verbose) git_sha = _git_sha() llm_settings = settings.llm run_meta = { "run_id": run_id, - "timestamp": started_at.isoformat() + "Z", + "timestamp": _isoformat_utc(started_at), "tag": args.tag, "note": args.note, "inputs": { - "cases_path": str(args.cases), + "cases_path": str(cases_path), "cases_hash": cases_hash, - "config_path": str(args.config) if args.config else None, + "config_path": str(config_path) if config_path else None, "config_hash": config_hash, - "schema_path": str(args.schema), + "schema_path": str(schema_path), "schema_hash": schema_hash, - "data_dir": str(args.data), + "data_dir": str(data_dir), }, "suite_case_ids": suite_case_ids, "selected_case_ids": selected_case_ids, @@ -763,7 +787,7 @@ def handle_batch(args) -> int: prate = _pass_rate(counts) history_entry = { "run_id": run_id, - "timestamp": started_at.isoformat() + "Z", + "timestamp": _isoformat_utc(started_at), "config_hash": config_hash, "schema_hash": schema_hash, "cases_hash": cases_hash, @@ -962,15 +986,17 @@ def _print_stats(entries: list[dict]) -> None: print(header) prev = None for entry in entries: - pass_rate = entry.get("pass_rate") - median = entry.get("median_total_s") + pass_rate = _coerce_number(entry.get("pass_rate")) + median = _coerce_number(entry.get("median_total_s")) delta_pass = None delta_median = None if prev: - if pass_rate is not None and prev.get("pass_rate") is not None: - delta_pass = pass_rate - prev.get("pass_rate") - if median is not None and prev.get("median_total_s") is not None: - delta_median = median - prev.get("median_total_s") + prev_pass_rate = _coerce_number(prev.get("pass_rate")) + if pass_rate is not None and prev_pass_rate is not None: + delta_pass = pass_rate - prev_pass_rate + prev_median = _coerce_number(prev.get("median_total_s")) + if median is not None and prev_median is not None: + delta_median = median - prev_median pr_display = f"{pass_rate*100:.1f}%" if pass_rate is not None else "n/a" median_display = f"{median:.2f}" if median is not None else "n/a" dp = f"{delta_pass*100:+.1f}pp" if delta_pass is not None else "n/a" @@ -986,12 +1012,12 @@ def _print_stats(entries: 
list[dict]) -> None: def handle_stats(args) -> int: - history_path: Optional[Path] = args.history + history_path: Path | None = args.history if history_path is None: if not args.data: print("Provide --data or --history to locate history.jsonl", file=sys.stderr) return 2 - history_path = args.data / ".runs" / "history.jsonl" + history_path = Path(args.data) / ".runs" / "history.jsonl" entries = _load_history(history_path) if args.group_by == "config_hash": grouped: dict[str, list[dict]] = {} @@ -1007,15 +1033,19 @@ def handle_stats(args) -> int: def handle_compare(args) -> int: - if not args.base.exists() or not args.new.exists(): + base_path = Path(args.base) + new_path = Path(args.new) + if not base_path.exists() or not new_path.exists(): print("Base or new results file not found.", file=sys.stderr) return 2 - comparison = compare_runs(args.base, args.new, fail_on=args.fail_on, require_assert=args.require_assert) - report = render_markdown(comparison, args.out) + comparison = compare_runs(base_path, new_path, fail_on=args.fail_on, require_assert=args.require_assert) + out_path = Path(args.out) if args.out is not None else None + report = render_markdown(comparison, out_path) print(report) if args.junit: - write_junit(comparison, args.junit) - print(f"JUnit written to {args.junit}") + junit_path = Path(args.junit) + write_junit(comparison, junit_path) + print(f"JUnit written to {junit_path}") return 0 diff --git a/examples/demo_qa/data_gen.py b/examples/demo_qa/data_gen.py index 8fc21dd..4732ee1 100644 --- a/examples/demo_qa/data_gen.py +++ b/examples/demo_qa/data_gen.py @@ -3,7 +3,7 @@ import json import random from dataclasses import asdict, dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Dict, List @@ -294,7 +294,8 @@ def generate_and_save(out_dir: Path, *, rows: int = 1000, seed: int | None = Non save_dataset(dataset, out_dir) schema = default_schema(enable_semantic=enable_semantic) save_schema(schema, out_dir / "schema.json") - meta = MetaInfo(seed=seed, rows=rows, created_at=datetime.utcnow().isoformat()) + created_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + meta = MetaInfo(seed=seed, rows=rows, created_at=created_at) write_meta(out_dir / "meta.json", meta) # Simple statistics diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 151c4df..0218acf 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -8,7 +8,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List, Mapping, TypedDict +from typing import Dict, Iterable, List, Mapping, NotRequired, TypedDict from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile @@ -599,14 +599,24 @@ def _median_duration(results: Mapping[str, RunResult]) -> float | None: return (durations[mid - 1] + durations[mid]) / 2000 +def _coerce_int(value: object | None) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, (int, float)): + return int(value) + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + def _count_bad_from_summary(counts: Mapping[str, object], fail_on: str, require_assert: bool) -> int: bad = bad_statuses(fail_on, require_assert) total = 0 for status in bad: - try: - total += int(counts.get(status, 0) or 0) - except Exception: - continue + total += _coerce_int(counts.get(status, 0)) return 
total @@ -751,7 +761,8 @@ def __init__(self, path: Path | None, run_id: str): def emit(self, event: Dict[str, object]) -> None: if not self.path: return - payload = {"timestamp": datetime.datetime.utcnow().isoformat() + "Z", "run_id": self.run_id, **event} + now = datetime.datetime.now(datetime.timezone.utc) + payload = {"timestamp": now.isoformat().replace("+00:00", "Z"), "run_id": self.run_id, **event} with self.path.open("a", encoding="utf-8") as f: f.write(json.dumps(payload, ensure_ascii=False) + "\n") @@ -781,6 +792,7 @@ class DiffReport(TypedDict): new_bad_total: int fail_on: str require_assert: bool + baseline_path: NotRequired[str] __all__ = [ diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py index 7f14267..860483c 100644 --- a/examples/demo_qa/runs/case_history.py +++ b/examples/demo_qa/runs/case_history.py @@ -37,7 +37,7 @@ def _append_case_history( history_dir = artifacts_dir / "runs" / "cases" history_dir.mkdir(parents=True, exist_ok=True) payload = { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), "run_id": run_id, "tag": tag, "note": note, diff --git a/examples/demo_qa/runs/effective.py b/examples/demo_qa/runs/effective.py index e7c2b3a..ca2457d 100644 --- a/examples/demo_qa/runs/effective.py +++ b/examples/demo_qa/runs/effective.py @@ -89,7 +89,7 @@ def _build_effective_diff( else: other_changed.append(entry) return { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), "tag": tag, "note": note, "run_id": run_id, @@ -180,7 +180,7 @@ def _update_effective_snapshot( "executed_total": executed_total, "missed_total": missed_total, "counts": summary_counts, - "updated_at": datetime.datetime.utcnow().isoformat() + "Z", + "updated_at": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), "built_from_runs": sorted(built_from), "effective_results_path": str(effective_results_path), "scope": scope, From 7c3f753ffbc3b7ea77b18711e1dbe5755813c196 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:18:45 +0300 Subject: [PATCH 59/92] Improve batch typing safety --- examples/demo_qa/batch.py | 44 +++++++++++++++++++++++-------------- tests/test_demo_qa_batch.py | 10 ++++++--- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 246be54..ded52c1 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -7,7 +7,7 @@ import sys import uuid from pathlib import Path -from typing import Mapping, Optional +from typing import Mapping, Optional, cast from .llm.factory import build_llm from .logging_config import configure_logging @@ -223,6 +223,15 @@ def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_asser return diff_runs(base.values(), new.values(), fail_on=fail_on, require_assert=require_assert) +def _id_sort_key(row: Mapping[str, object]) -> str: + identifier = row.get("id") + if isinstance(identifier, str): + return identifier + if identifier is None: + return "" + return str(identifier) + + def render_markdown(compare: DiffReport, out_path: Optional[Path]) -> str: lines: list[str] = [] base_counts = compare["base_counts"] @@ -258,7 +267,7 @@ def table(title: str, rows: list[Mapping[str, object]]) -> None: return lines.append("| 
id | status | reason | artifacts |") lines.append("|---|---|---|---|") - for row in sorted(rows, key=lambda r: r.get("id", "")): + for row in sorted(rows, key=_id_sort_key): artifacts_val = row.get("artifacts", {}) artifacts = artifacts_val if isinstance(artifacts_val, Mapping) else {} links = ", ".join(f"[{k}]({v})" for k, v in sorted(artifacts.items())) @@ -290,7 +299,7 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: suite.set("failures", str(len(bad))) suite.set("errors", "0") - for row in sorted(bad, key=lambda r: r.get("id", "")): + for row in sorted(bad, key=_id_sort_key): tc = ET.SubElement(suite, "testcase", name=row["id"]) msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) @@ -299,7 +308,7 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) - for row in sorted(fixed, key=lambda r: r.get("id", "")): + for row in sorted(fixed, key=_id_sort_key): ET.SubElement(suite, "testcase", name=row["id"]) bad_ids = {row["id"] for row in bad} @@ -382,9 +391,10 @@ def handle_batch(args) -> int: ) scope_id = _scope_hash(scope) - baseline_filter_path = Path(args.only_failed_from) if args.only_failed_from else None + baseline_filter_path_arg = cast(Optional[Path], args.only_failed_from) + baseline_filter_path: Path | None = Path(baseline_filter_path_arg) if baseline_filter_path_arg else None only_failed_baseline_kind: str | None = None - if args.only_failed_from: + if baseline_filter_path_arg: only_failed_baseline_kind = "path" elif args.tag and args.only_failed: effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) @@ -415,7 +425,7 @@ def handle_batch(args) -> int: if candidate.exists(): baseline_filter_path = candidate only_failed_baseline_kind = "latest" - if baseline_filter_path and baseline_for_filter is None: + if baseline_filter_path is not None and baseline_for_filter is None: try: baseline_for_filter = load_results(baseline_filter_path) except Exception as exc: @@ -425,12 +435,13 @@ def handle_batch(args) -> int: print("No baseline found for --only-failed.", file=sys.stderr) return 2 - compare_path: Path | None = Path(args.compare_to) if args.compare_to else None + compare_to_arg = cast(Optional[Path], args.compare_to) + compare_path: Path | None = Path(compare_to_arg) if compare_to_arg else None if compare_path is None and args.only_failed and baseline_filter_path: compare_path = baseline_filter_path - if compare_path: + if compare_path is not None: try: - if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): + if baseline_filter_path is not None and compare_path.resolve() == baseline_filter_path.resolve(): baseline_for_compare = baseline_for_filter else: baseline_for_compare = load_results(compare_path) @@ -466,8 +477,9 @@ def handle_batch(args) -> int: missed_baseline_run: Path | None = None only_missed_baseline_kind: str | None = None if args.only_missed: - if args.only_missed_from: - missed_baseline_path = args.only_missed_from + only_missed_from_arg = cast(Optional[Path], args.only_missed_from) + if only_missed_from_arg: + missed_baseline_path = only_missed_from_arg only_missed_baseline_kind = "path" try: missed_baseline_results = load_results(missed_baseline_path) @@ -503,15 +515,15 @@ def handle_batch(args) -> int: ) baseline_planned_ids = set(suite_case_ids) else: - missed_baseline_path = args.only_missed_from or 
_load_latest_results(artifacts_dir, args.tag) - if args.only_missed_from: + missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) + if only_missed_from_arg: only_missed_baseline_kind = "path" - elif missed_baseline_path: + elif missed_baseline_path is not None: only_missed_baseline_kind = "latest" missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) if missed_baseline_run is None: missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) - if missed_baseline_path: + if missed_baseline_path is not None: try: missed_baseline_results = load_results(missed_baseline_path) except Exception as exc: diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 5efb13d..cdde099 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -5,13 +5,14 @@ import os import time from pathlib import Path +from typing import cast import pytest from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers -from examples.demo_qa.runner import RunResult, diff_runs +from examples.demo_qa.runner import DiffReport, RunResult, diff_runs @pytest.mark.parametrize( @@ -27,7 +28,9 @@ def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> def test_render_markdown_uses_fail_policy() -> None: - compare = { + compare = cast( + DiffReport, + { "base_counts": {"ok": 0, "mismatch": 2, "error": 1, "failed": 0}, "new_counts": {"ok": 1, "mismatch": 0, "error": 0, "failed": 0}, "base_bad_total": 1, @@ -38,7 +41,8 @@ def test_render_markdown_uses_fail_policy() -> None: "fixed": [], "still_fail": [], "all_ids": [], - } + }, + ) report = render_markdown(compare, None) assert "- Base OK: 0, Bad: 1" in report assert "- New OK: 1, Bad: 0" in report From 832edad06f0bf6f9d46ce46c835328ecaa182413 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:31:04 +0300 Subject: [PATCH 60/92] Improve typed diff structures for demo QA reports --- examples/demo_qa/batch.py | 11 ++++----- examples/demo_qa/runner.py | 47 ++++++++++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index ded52c1..afc1f4f 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -290,9 +290,9 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: import xml.etree.ElementTree as ET suite = ET.Element("testsuite", name="demo_qa_compare") - bad = compare["new_fail"] + compare["still_fail"] - fixed = compare["fixed"] - all_ids_list = list(compare.get("all_ids", []) or []) + bad: list[DiffCaseChange] = compare["new_fail"] + compare["still_fail"] + fixed: list[DiffCaseChange] = compare["fixed"] + all_ids_list: list[str] = list(compare.get("all_ids", []) or []) all_ids = sorted(all_ids_list) cases_total = len(all_ids) suite.set("tests", str(cases_total)) @@ -301,10 +301,9 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: for row in sorted(bad, key=_id_sort_key): tc = ET.SubElement(suite, "testcase", name=row["id"]) - msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" + msg: str = row["reason"] or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) - artifacts_val = row.get("artifacts", {}) - artifacts 
= artifacts_val if isinstance(artifacts_val, Mapping) else {} + artifacts = row.get("artifacts", {}) if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 0218acf..03906c9 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -636,19 +636,24 @@ def diff_runs( def _is_bad(res: RunResult | None) -> bool: return bool(res and res.status in bad) - def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult | None) -> dict[str, object]: + def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult | None) -> DiffCaseChange: + artifacts: dict[str, str] + if new_res is None: + artifacts = {} + else: + artifacts = _artifact_links(new_res) return { "id": case_id, "from": base_res.status if base_res else None, "to": new_res.status if new_res else "missing", "reason": _reason(new_res) if new_res else "missing in new results", - "artifacts": _artifact_links(new_res) if new_res else {}, + "artifacts": artifacts, } - new_fail: list[dict[str, object]] = [] - fixed: list[dict[str, object]] = [] - still_fail: list[dict[str, object]] = [] - changed_status: list[dict[str, str | None]] = [] + new_fail: list[DiffCaseChange] = [] + fixed: list[DiffCaseChange] = [] + still_fail: list[DiffCaseChange] = [] + changed_status: list[DiffStatusChange] = [] new_cases: list[str] = [] for case_id in all_ids: @@ -772,12 +777,34 @@ def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": return EventLogger(path, self.run_id) +DiffCaseChange = TypedDict( + "DiffCaseChange", + { + "id": str, + "from": str | None, + "to": str | None, + "reason": str, + "artifacts": Mapping[str, str], + }, +) + + +DiffStatusChange = TypedDict( + "DiffStatusChange", + { + "id": str, + "from": str | None, + "to": str | None, + }, +) + + class DiffReport(TypedDict): all_ids: list[str] - new_fail: list[dict[str, object]] - fixed: list[dict[str, object]] - still_fail: list[dict[str, object]] - changed_status: list[dict[str, str | None]] + new_fail: list[DiffCaseChange] + fixed: list[DiffCaseChange] + still_fail: list[DiffCaseChange] + changed_status: list[DiffStatusChange] new_cases: list[str] base_counts: Dict[str, object] new_counts: Dict[str, object] From c04903e45ffb598efa7010b943845327a44fffa1 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 27 Dec 2025 00:40:06 +0300 Subject: [PATCH 61/92] import fix --- examples/demo_qa/batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index afc1f4f..a3c7f27 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -15,6 +15,7 @@ from .runner import ( Case, DiffReport, + DiffCaseChange, EventLogger, RunResult, RunTimings, From 07cd3c03209b5bc992a2187844238f1eb24cca75 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:54:01 +0300 Subject: [PATCH 62/92] Fix history case to honor artifacts dir --- examples/demo_qa/cli.py | 1 + examples/demo_qa/commands/history.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index c8ae01c..87d940c 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -153,6 +153,7 @@ def build_parser() -> argparse.ArgumentParser: case_hist = history_sub.add_parser("case", help="Show history for a case id") case_hist.add_argument("case_id") case_hist.add_argument("--data", 
type=Path, required=True, help="Data dir containing .runs") + case_hist.add_argument("--artifacts-dir", type=Path, default=None, help="Base artifacts dir (default: /.runs)") case_hist.add_argument("--tag", type=str, default=None, help="Filter by tag") case_hist.add_argument("--limit", type=int, default=20, help="Limit rows") diff --git a/examples/demo_qa/commands/history.py b/examples/demo_qa/commands/history.py index 376f90d..a60f7c1 100644 --- a/examples/demo_qa/commands/history.py +++ b/examples/demo_qa/commands/history.py @@ -1,10 +1,12 @@ from __future__ import annotations +from pathlib import Path + from ..runs.case_history import _load_case_history def handle_history_case(args) -> int: - artifacts_dir = args.data / ".runs" + artifacts_dir = Path(args.artifacts_dir) if args.artifacts_dir else args.data / ".runs" path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" entries = _load_case_history(path) if args.tag: From d3b2863697ae68ce4be884d3202ea2503973dfce Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 04:48:52 +0300 Subject: [PATCH 63/92] Handle non-string expectations in demo QA runner --- examples/demo_qa/runner.py | 29 +++++++++++++++++++---------- tests/test_demo_qa_runner.py | 13 +++++++++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 03906c9..0a8029b 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -204,25 +204,34 @@ def save_status(result: RunResult) -> None: _save_json(status_path, result.to_json()) +def _stringify(value: object | None) -> str | None: + if value is None: + return None + return str(value) + + def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: if not case.has_asserts: return None - expected_value = case.expected or case.expected_regex or case.expected_contains or "" + expected_value = _stringify(case.expected) or _stringify(case.expected_regex) or _stringify(case.expected_contains) or "" if answer is None: return ExpectedCheck(mode="none", expected=expected_value, passed=False, detail="no answer") if case.expected is not None: - passed = answer.strip() == case.expected.strip() - detail = None if passed else f"expected={case.expected!r}, got={answer!r}" - return ExpectedCheck(mode="exact", expected=case.expected, passed=passed, detail=detail) + expected_str = _stringify(case.expected) or "" + passed = answer.strip() == expected_str.strip() + detail = None if passed else f"expected={expected_str!r}, got={answer!r}" + return ExpectedCheck(mode="exact", expected=expected_str, passed=passed, detail=detail) if case.expected_regex is not None: - pattern = re.compile(case.expected_regex) + expected_regex = _stringify(case.expected_regex) or "" + pattern = re.compile(expected_regex) passed = bool(pattern.search(answer)) - detail = None if passed else f"regex {case.expected_regex!r} not found" - return ExpectedCheck(mode="regex", expected=case.expected_regex, passed=passed, detail=detail) + detail = None if passed else f"regex {expected_regex!r} not found" + return ExpectedCheck(mode="regex", expected=expected_regex, passed=passed, detail=detail) if case.expected_contains is not None: - passed = case.expected_contains in answer - detail = None if passed else f"expected to contain {case.expected_contains!r}" - return ExpectedCheck(mode="contains", expected=case.expected_contains, passed=passed, detail=detail) + expected_contains = 
_stringify(case.expected_contains) or "" + passed = expected_contains in answer + detail = None if passed else f"expected to contain {expected_contains!r}" + return ExpectedCheck(mode="contains", expected=expected_contains, passed=passed, detail=detail) return None diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 31773cd..0888dfe 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -8,6 +8,19 @@ def test_match_expected_unchecked_when_no_expectations() -> None: assert _match_expected(case, "anything") is None +def test_match_expected_coerces_non_string_expected_values() -> None: + case = Case(id="c1", question="What is foo?", expected=42) + + mismatch = _match_expected(case, "43") + assert mismatch is not None + assert mismatch.passed is False + assert "expected='42'" in (mismatch.detail or "") + + match = _match_expected(case, "42") + assert match is not None + assert match.passed is True + + def test_match_expected_contains_pass_and_fail() -> None: case = Case(id="c2", question="Q", expected_contains="bar") From e4b697e3159f4d907ae9652082a4905d11841015 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 05:23:16 +0300 Subject: [PATCH 64/92] Default batch history to artifacts directory --- examples/demo_qa/batch.py | 2 +- examples/demo_qa/cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a3c7f27..bc3cf66 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -566,7 +566,7 @@ def handle_batch(args) -> int: results_path.parent.mkdir(parents=True, exist_ok=True) summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) - history_path = Path(args.history) if args.history else (data_dir / ".runs" / "history.jsonl") + history_path = Path(args.history) if args.history else (artifacts_dir / "history.jsonl") log_dir = Path(args.log_dir) if args.log_dir else data_dir / ".runs" / "logs" configure_logging( diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 87d940c..78cc697 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -95,7 +95,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") - batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") + batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /history.jsonl)") batch_p.add_argument("--include-tags", type=str, default=None, help="Comma-separated tags to include") batch_p.add_argument("--exclude-tags", type=str, default=None, help="Comma-separated tags to exclude") batch_p.add_argument("--include-ids", type=Path, default=None, help="Path to file with ids to include (one per line)") From 4f4aac0f706351fa66762e7550bb6dbaafed5db7 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 05:39:12 +0300 Subject: [PATCH 65/92] Update quick-start aliases for schema.json, allow default model usage --- README_demo_qa.md | 2 +- examples/demo_qa/settings.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README_demo_qa.md 
b/README_demo_qa.md index 23cea09..8a8cc07 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -106,7 +106,7 @@ python -m examples.demo_qa.cli batch \ ```bash # 1) Настройте свои дефолты под проект/датасет export DQ_DATA="./_demo_data/shop" -export DQ_SCHEMA="$DQ_DATA/schema.yaml" +export DQ_SCHEMA="$DQ_DATA/schema.json" export DQ_CASES="./examples/demo_qa/cases/retail_cases.json" export DQ_OUT="$DQ_DATA/.runs/results.jsonl" export DQ_TAG="retail-iter1" diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 064e179..be3f03a 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -20,8 +20,8 @@ class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) model: str | None = None - plan_model: str = "gpt-4o-mini" - synth_model: str = "gpt-4o-mini" + plan_model: str = "default" + synth_model: str = "default" plan_temperature: float = 0.0 synth_temperature: float = 0.2 timeout_s: float | None = None From b565b7a6bb281f5efcd27b67c90661c6f4f27669 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 06:08:17 +0300 Subject: [PATCH 66/92] Handle baseline overlay selection for partial runs --- examples/demo_qa/batch.py | 248 +++++++++++++++++++------------- examples/demo_qa/cli.py | 5 + examples/demo_qa/runs/layout.py | 124 +++++++++++----- tests/test_demo_qa_batch.py | 66 +++++++-- 4 files changed, 302 insertions(+), 141 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index bc3cf66..26e07ee 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -7,7 +7,7 @@ import sys import uuid from pathlib import Path -from typing import Mapping, Optional, cast +from typing import Iterable, Mapping, Optional, cast from .llm.factory import build_llm from .logging_config import configure_logging @@ -41,8 +41,10 @@ from .runs.io import write_results from .runs.layout import ( _load_latest_results, + _load_latest_any_results, _load_latest_run, _load_run_meta, + _resolve_results_path_for_run, _run_dir_from_results_path, _update_latest_markers, ) @@ -116,6 +118,44 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: return ids +def _only_failed_selection( + baseline_results: Mapping[str, RunResult] | None, + overlay_results: Mapping[str, RunResult] | None, + *, + fail_on: str, + require_assert: bool, +) -> tuple[set[str], dict[str, object]]: + baseline = baseline_results or {} + overlay = overlay_results or {} + bad = bad_statuses(fail_on, require_assert) + baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} + overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} + overlay_good = {cid for cid, res in overlay.items() if res.status not in bad} + + healed = baseline_bad & overlay_good + selection = (baseline_bad - healed) | overlay_bad + breakdown = { + "baseline_failures": baseline_bad, + "healed": healed, + "new_failures": overlay_bad, + } + return selection, breakdown + + +def _only_missed_selection( + selected_case_ids: Iterable[str], + baseline_results: Mapping[str, RunResult] | None, + overlay_results: Mapping[str, RunResult] | None, +) -> tuple[set[str], dict[str, object]]: + selected = set(selected_case_ids) + baseline_ids = set(baseline_results.keys()) if baseline_results else set() + overlay_executed = set(overlay_results.keys()) if overlay_results else set() + missed_base = selected - baseline_ids + missed_final = missed_base - 
overlay_executed + breakdown = {"missed_base": missed_base, "overlay_executed": overlay_executed} + return missed_final, breakdown + + def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: entries: list[dict] = [] total_bytes = 0 @@ -170,7 +210,7 @@ def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: if path is not None: return path - return _load_latest_run(artifacts_dir) + return _load_latest_run(artifacts_dir, kind="any") def handle_chat(args) -> int: @@ -322,10 +362,7 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: def _select_cases_for_rerun( cases: list[Case], - baseline_for_filter: Optional[Mapping[str, RunResult]], *, - require_assert: bool, - fail_on: str, include_tags: set[str] | None, exclude_tags: set[str] | None, include_ids: set[str] | None, @@ -343,12 +380,7 @@ def _select_cases_for_rerun( if exclude_ids and case.id in exclude_ids: continue filtered.append(case) - if not baseline_for_filter: - return filtered - target_ids = { - case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses(fail_on, require_assert) - } - return [case for case in filtered if case.id in target_ids] + return filtered def handle_batch(args) -> int: @@ -373,8 +405,14 @@ def handle_batch(args) -> int: print(f"Cases error: {exc}", file=sys.stderr) return 2 - baseline_for_filter: Optional[Mapping[str, RunResult]] = None baseline_for_compare: Optional[Mapping[str, RunResult]] = None + failed_baseline_results: Optional[Mapping[str, RunResult]] = None + failed_baseline_path: Path | None = None + missed_baseline_results: Optional[Mapping[str, RunResult]] = None + missed_baseline_path: Path | None = None + overlay_results: Optional[Mapping[str, RunResult]] = None + overlay_results_path: Path | None = None + overlay_run_path: Path | None = None artifacts_dir = Path(args.artifacts_dir) if args.artifacts_dir else data_dir / ".runs" @@ -410,28 +448,20 @@ def handle_batch(args) -> int: if effective_meta and effective_meta.get("scope_hash") not in (None, scope_id): print("Effective results scope does not match current selection; refusing to merge.", file=sys.stderr) return 2 - baseline_for_filter = effective_results + failed_baseline_results = effective_results baseline_filter_path = eff_path only_failed_baseline_kind = "effective" elif args.only_failed: - latest_results = _load_latest_results(artifacts_dir, args.tag) - if latest_results: - baseline_filter_path = latest_results - only_failed_baseline_kind = "latest" - else: - latest_run = _load_latest_run(artifacts_dir, args.tag) - if latest_run: - candidate = latest_run / "results.jsonl" - if candidate.exists(): - baseline_filter_path = candidate - only_failed_baseline_kind = "latest" - if baseline_filter_path is not None and baseline_for_filter is None: + baseline_filter_path = _load_latest_results(artifacts_dir, args.tag) + if baseline_filter_path: + only_failed_baseline_kind = "latest_complete" + if baseline_filter_path is not None and failed_baseline_results is None: try: - baseline_for_filter = load_results(baseline_filter_path) + failed_baseline_results = load_results(baseline_filter_path) except Exception as exc: print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) return 2 - if args.only_failed and baseline_for_filter is None: + if args.only_failed and failed_baseline_results is None: print("No baseline found for --only-failed.", file=sys.stderr) 
return 2 @@ -442,50 +472,61 @@ def handle_batch(args) -> int: if compare_path is not None: try: if baseline_filter_path is not None and compare_path.resolve() == baseline_filter_path.resolve(): - baseline_for_compare = baseline_for_filter + baseline_for_compare = failed_baseline_results else: baseline_for_compare = load_results(compare_path) except Exception as exc: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 + overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") + overlay_results_path = _resolve_results_path_for_run(overlay_run_path) or _load_latest_any_results( + artifacts_dir, args.tag + ) + if overlay_results_path and not args.no_overlay: + try: + overlay_results = load_results(overlay_results_path) + except Exception as exc: + print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) + overlay_results_path = None + overlay_results = None + filtered_cases = _select_cases_for_rerun( cases, - None, - require_assert=args.require_assert, - fail_on=args.fail_on, include_tags=include_tags, exclude_tags=exclude_tags, include_ids=include_ids, exclude_ids=exclude_ids, ) suite_case_ids = [case.id for case in filtered_cases] - cases = _select_cases_for_rerun( - filtered_cases, - baseline_for_filter, - require_assert=args.require_assert, - fail_on=args.fail_on, - include_tags=include_tags, - exclude_tags=exclude_tags, - include_ids=include_ids, - exclude_ids=exclude_ids, - ) + cases = filtered_cases + + if args.only_failed: + selection_ids, breakdown = _only_failed_selection( + failed_baseline_results, + overlay_results if not args.no_overlay else None, + fail_on=args.fail_on, + require_assert=args.require_assert, + ) + cases = [case for case in cases if case.id in selection_ids] + healed = breakdown.get("healed", set()) + baseline_fails = breakdown.get("baseline_failures", set()) + new_failures = breakdown.get("new_failures", set()) + baseline_label = str(_run_dir_from_results_path(baseline_filter_path) or baseline_filter_path or "n/a") + overlay_label = str(overlay_run_path or overlay_results_path or "n/a") + print(f"Baseline: {baseline_label}", file=sys.stderr) + print(f"Overlay: {overlay_label}", file=sys.stderr) + print(f"Baseline failures: {len(baseline_fails)}", file=sys.stderr) + print(f"Healed by overlay: {len(healed)}", file=sys.stderr) + print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) + print(f"Final only-failed selection: {len(selection_ids)}", file=sys.stderr) - baseline_planned_ids: set[str] | None = None - missed_baseline_results: Optional[Mapping[str, RunResult]] = None - missed_baseline_path: Path | None = None - missed_baseline_run: Path | None = None only_missed_baseline_kind: str | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) if only_missed_from_arg: missed_baseline_path = only_missed_from_arg only_missed_baseline_kind = "path" - try: - missed_baseline_results = load_results(missed_baseline_path) - except Exception as exc: - print(f"Failed to read baseline for --only-missed-from: {exc}", file=sys.stderr) - return 2 elif args.tag: effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) if not effective_results: @@ -503,62 +544,37 @@ def handle_batch(args) -> int: missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" - baseline_planned_ids = ( - {str(cid) for cid in effective_meta.get("planned_case_ids", [])} - if 
isinstance(effective_meta, dict) - else None - ) - if not baseline_planned_ids: - print( - "Effective results missing planned_case_ids; computing missed relative to current filtered cases.", - file=sys.stderr, - ) - baseline_planned_ids = set(suite_case_ids) else: missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) if only_missed_from_arg: only_missed_baseline_kind = "path" elif missed_baseline_path is not None: - only_missed_baseline_kind = "latest" - missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) - if missed_baseline_run is None: - missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) - if missed_baseline_path is not None: - try: - missed_baseline_results = load_results(missed_baseline_path) - except Exception as exc: - print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) - return 2 - else: - print( - "No baseline found for --only-missed. Provide --only-missed-from or run a tagged batch first.", - file=sys.stderr, - ) + only_missed_baseline_kind = "latest_complete" + if missed_baseline_path is not None and missed_baseline_results is None: + try: + missed_baseline_results = load_results(missed_baseline_path) + except Exception as exc: + print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 - baseline_meta = _load_run_meta(missed_baseline_run) - if isinstance(baseline_meta, dict): - planned_from_meta = baseline_meta.get("planned_case_ids") - if isinstance(planned_from_meta, list): - baseline_planned_ids = {str(cid) for cid in planned_from_meta} - else: - print( - "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", - file=sys.stderr, - ) - baseline_planned_ids = set(suite_case_ids) if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - - selected_case_ids = [case.id for case in cases] - if args.only_missed: - planned_pool = baseline_planned_ids or set(selected_case_ids) - missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) - cases = [case for case in cases if case.id in missed_ids] selected_case_ids = [case.id for case in cases] + missed_ids, missed_breakdown = _only_missed_selection( + selected_case_ids, + missed_baseline_results, + overlay_results if not args.no_overlay else None, + ) + cases = [case for case in cases if case.id in missed_ids] + print(f"Baseline (missed) results: {missed_baseline_path}", file=sys.stderr) + print(f"Overlay executed: {len(missed_breakdown.get('overlay_executed', set()))}", file=sys.stderr) + print(f"Missed in baseline: {len(missed_breakdown.get('missed_base', set()))}", file=sys.stderr) + print(f"Final only-missed selection: {len(missed_ids)}", file=sys.stderr) if not cases: print("0 missed cases selected.", file=sys.stderr) + selected_case_ids = [case.id for case in cases] + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_folder = artifacts_dir / "runs" / f"{timestamp}_{cases_path.stem}" results_path = Path(args.out) if args.out else (run_folder / "results.jsonl") @@ -661,6 +677,15 @@ def handle_batch(args) -> int: missed_total = len(_missed_case_ids(selected_case_ids, executed_results)) suite_planned_total = len(suite_case_ids) suite_missed_total = len(_missed_case_ids(suite_case_ids, executed_results)) + results_complete = (planned_total == executed_total) and not interrupted + if interrupted: + run_status = "INTERRUPTED" + elif not results_complete: + run_status = 
"ERROR" + elif bad_count: + run_status = "FAILED" + else: + run_status = "SUCCESS" summary = { "run_id": run_id, "started_at": _isoformat_utc(started_at), @@ -681,6 +706,10 @@ def handle_batch(args) -> int: "interrupted_at_case_id": interrupted_at_case_id, "tag": args.tag, "note": args.note, + "run_status": run_status, + "results_complete": results_complete, + "total_selected": planned_total, + "total_executed": executed_total, } if diff_block: summary["diff"] = diff_block @@ -704,10 +733,12 @@ def handle_batch(args) -> int: "planned_total": planned_total, "executed_total": executed_total, "missed_total": missed_total, + "run_status": run_status, + "results_complete": results_complete, } ) - _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag) + _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag, results_complete=results_complete) effective_path = None effective_meta_path = None if args.tag: @@ -763,6 +794,12 @@ def handle_batch(args) -> int: "suite_case_ids": suite_case_ids, "selected_case_ids": selected_case_ids, "planned_total": planned_total, + "executed_total": executed_total, + "run_status": run_status, + "results_complete": results_complete, + "exit_code": exit_code, + "total_selected": planned_total, + "total_executed": executed_total, "selected_filters": { "include_tags": sorted(include_tags) if include_tags else None, "exclude_tags": sorted(exclude_tags) if exclude_tags else None, @@ -774,6 +811,7 @@ def handle_batch(args) -> int: "only_missed": args.only_missed, "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, "only_missed_baseline_kind": only_missed_baseline_kind, + "overlay_results_path": str(overlay_results_path) if overlay_results_path else None, "baseline_tag": args.tag, "effective_path": str(effective_path) if effective_path else None, "scope_hash": scope_id, @@ -781,6 +819,7 @@ def handle_batch(args) -> int: "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, + "no_overlay": args.no_overlay, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -826,12 +865,17 @@ def handle_batch(args) -> int: "fail_count": bad_count, "planned_total": planned_total, "executed_total": executed_total, + "total_selected": planned_total, + "total_executed": executed_total, "missed_total": missed_total, "suite_planned_total": suite_planned_total, "suite_missed_total": suite_missed_total, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, "scope_hash": scope_id, + "run_status": run_status, + "results_complete": results_complete, + "exit_code": exit_code, } for res in results: _append_case_history( @@ -935,6 +979,10 @@ def handle_case_run(args) -> int: result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) write_results(results_path, [result]) counts = summarize([result]) + bad = bad_statuses("bad", False) + bad_count = sum(_coerce_int(counts.get(status)) for status in bad) + run_status = "FAILED" if bad_count else "SUCCESS" + exit_code = 1 if bad_count else 0 summary = { "run_id": run_folder.name, "timestamp": timestamp + "Z", @@ -942,17 +990,19 @@ def handle_case_run(args) -> int: "results_path": str(results_path), "fail_on": "bad", "require_assert": False, + "run_status": run_status, + "results_complete": True, + "total_selected": 1, + "total_executed": 1, + "exit_code": exit_code, } summary_path = write_summary(results_path, summary) - save_dir = run_folder.parent - save_dir.mkdir(parents=True, 
exist_ok=True) - (save_dir / "latest.txt").write_text(str(run_folder), encoding="utf-8") - (save_dir / "latest_results.txt").write_text(str(results_path), encoding="utf-8") + _update_latest_markers(run_folder, results_path, artifacts_dir, None, results_complete=True) print(format_status_line(result)) print(f"Artifacts: {result.artifacts_dir}") print(f"Summary: {summary_path}") - return 0 + return exit_code def handle_case_open(args) -> int: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 78cc697..6bcab36 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -91,6 +91,11 @@ def build_parser() -> argparse.ArgumentParser: help="Run only cases that failed/mismatched/errored in a previous results.jsonl", ) batch_p.add_argument("--only-failed", action="store_true", help="Use latest run for --only-failed-from automatically") + batch_p.add_argument( + "--no-overlay", + action="store_true", + help="Ignore latest partial run when selecting only-failed/only-missed (use baseline only)", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index 704f643..8c0d61e 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -2,7 +2,14 @@ import json from pathlib import Path -from typing import Optional +from typing import NamedTuple, Optional + + +class LatestMarkers(NamedTuple): + complete: Path + results: Path + any_run: Path + legacy_run: Path def _sanitize_tag(tag: str) -> str: @@ -15,43 +22,84 @@ def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: return base / "effective_results.jsonl", base / "effective_meta.json" -def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: +def _latest_markers(artifacts_dir: Path, tag: str | None) -> LatestMarkers: runs_dir = artifacts_dir / "runs" if tag: slug = _sanitize_tag(tag) - return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt" - return runs_dir / "latest.txt", runs_dir / "latest_results.txt" - - -def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - latest_file, _ = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() + return LatestMarkers( + runs_dir / f"tag-latest-complete-{slug}.txt", + runs_dir / f"tag-latest-results-{slug}.txt", + runs_dir / f"tag-latest-any-{slug}.txt", + runs_dir / f"tag-latest-{slug}.txt", + ) + return LatestMarkers( + runs_dir / "latest_complete.txt", + runs_dir / "latest_results.txt", + runs_dir / "latest_any.txt", + runs_dir / "latest.txt", + ) + + +def _read_marker(path: Path) -> Optional[Path]: + if path.exists(): + content = path.read_text(encoding="utf-8").strip() if content: return Path(content) return None -def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - _, latest_file = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - latest_run = _load_latest_run(artifacts_dir, tag) - if latest_run: - summary_path = latest_run / "summary.json" - if summary_path.exists(): - try: - summary = 
json.loads(summary_path.read_text(encoding="utf-8")) - results_path = summary.get("results_path") - if results_path: - return Path(results_path) - except Exception: - pass +def _load_latest_run(artifacts_dir: Path, tag: str | None = None, *, kind: str = "complete") -> Optional[Path]: + markers = _latest_markers(artifacts_dir, tag) + candidates: list[Path] = [] + if kind == "any": + candidates.append(markers.any_run) + candidates.append(markers.complete) + candidates.append(markers.legacy_run) + for marker in candidates: + resolved = _read_marker(marker) + if resolved: + return resolved return None +def _resolve_results_path_for_run(run_path: Path | None) -> Optional[Path]: + if run_path is None: + return None + summary_path = run_path / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + results_path = summary.get("results_path") + if results_path: + return Path(results_path) + except Exception: + pass + candidate = run_path / "results.jsonl" + if candidate.exists(): + return candidate + return None + + +def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + markers = _latest_markers(artifacts_dir, tag) + resolved = _read_marker(markers.results) + if resolved: + return resolved + latest_run = _load_latest_run(artifacts_dir, tag, kind="complete") + return _resolve_results_path_for_run(latest_run) + + +def _load_latest_any_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + latest_run = _load_latest_run(artifacts_dir, tag, kind="any") + if latest_run is None: + return None + results = _resolve_results_path_for_run(latest_run) + if results: + return results + markers = _latest_markers(artifacts_dir, tag) + return _read_marker(markers.results) + + def _load_run_meta(run_path: Path | None) -> Optional[dict]: if run_path is None: return None @@ -80,22 +128,30 @@ def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: return run_dir -def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: - marker_pairs = {_latest_markers(artifacts_dir, None)} +def _update_latest_markers( + run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None, *, results_complete: bool +) -> None: + marker_sets = {_latest_markers(artifacts_dir, None)} if tag: - marker_pairs.add(_latest_markers(artifacts_dir, tag)) - for latest_path, latest_results_path in marker_pairs: - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - latest_results_path.write_text(str(results_path), encoding="utf-8") + marker_sets.add(_latest_markers(artifacts_dir, tag)) + for markers in marker_sets: + markers.complete.parent.mkdir(parents=True, exist_ok=True) + markers.any_run.write_text(str(run_folder), encoding="utf-8") + markers.legacy_run.write_text(str(run_folder), encoding="utf-8") + if results_complete: + markers.complete.write_text(str(run_folder), encoding="utf-8") + markers.results.write_text(str(results_path), encoding="utf-8") __all__ = [ + "LatestMarkers", "_effective_paths", + "_load_latest_any_results", "_latest_markers", "_load_latest_results", "_load_latest_run", "_load_run_meta", + "_resolve_results_path_for_run", "_run_dir_from_results_path", "_sanitize_tag", "_update_latest_markers", diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index cdde099..6226fc4 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -9,7 
+9,15 @@ import pytest -from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +from examples.demo_qa.batch import ( + _fingerprint_dir, + _only_failed_selection, + _only_missed_selection, + bad_statuses, + is_failure, + render_markdown, + write_results, +) from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers from examples.demo_qa.runner import DiffReport, RunResult, diff_runs @@ -106,6 +114,29 @@ def test_missed_case_ids_diff_planned_and_executed() -> None: assert _missed_case_ids(planned, executed) == {"a", "c"} +def test_only_failed_selection_uses_overlay_and_baseline() -> None: + baseline = {"a": _mk_result("a", "failed"), "b": _mk_result("b", "failed")} + overlay = {"a": _mk_result("a", "ok"), "c": _mk_result("c", "failed")} + + selection, breakdown = _only_failed_selection(baseline, overlay, fail_on="bad", require_assert=False) + + assert selection == {"b", "c"} + assert breakdown["healed"] == {"a"} + assert breakdown["baseline_failures"] == {"a", "b"} + assert breakdown["new_failures"] == {"c"} + + +def test_only_missed_selection_uses_overlay_executed() -> None: + baseline = {"a": _mk_result("a", "ok")} + overlay = {"c": _mk_result("c", "ok")} + + missed, breakdown = _only_missed_selection(["a", "b", "c"], baseline, overlay) + + assert missed == {"b"} + assert breakdown["missed_base"] == {"b", "c"} + assert breakdown["overlay_executed"] == {"c"} + + def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: artifacts_dir = tmp_path / "data" / ".runs" run_dir = artifacts_dir / "runs" / "20240101_cases" @@ -113,12 +144,31 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: run_dir.mkdir(parents=True) results_path.write_text("{}", encoding="utf-8") - _update_latest_markers(run_dir, results_path, artifacts_dir, "feature/beta") + _update_latest_markers(run_dir, results_path, artifacts_dir, "feature/beta", results_complete=True) + + latest_default = _latest_markers(artifacts_dir, None) + assert latest_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_default.results.read_text(encoding="utf-8").strip() == str(results_path) + assert latest_default.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + + latest_tag = _latest_markers(artifacts_dir, "feature/beta") + assert latest_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_tag.results.read_text(encoding="utf-8").strip() == str(results_path) + assert latest_tag.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + + partial_dir = artifacts_dir / "runs" / "20240102_cases" + partial_results = partial_dir / "results.jsonl" + partial_dir.mkdir(parents=True) + partial_results.write_text("{}", encoding="utf-8") + + _update_latest_markers(partial_dir, partial_results, artifacts_dir, "feature/beta", results_complete=False) - latest_default, latest_results_default = _latest_markers(artifacts_dir, None) - assert latest_default.read_text(encoding="utf-8").strip() == str(run_dir) - assert latest_results_default.read_text(encoding="utf-8").strip() == str(results_path) + refreshed_default = _latest_markers(artifacts_dir, None) + assert refreshed_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert refreshed_default.results.read_text(encoding="utf-8").strip() == str(results_path) + assert refreshed_default.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) - 
latest_tag, latest_results_tag = _latest_markers(artifacts_dir, "feature/beta") - assert latest_tag.read_text(encoding="utf-8").strip() == str(run_dir) - assert latest_results_tag.read_text(encoding="utf-8").strip() == str(results_path) + refreshed_tag = _latest_markers(artifacts_dir, "feature/beta") + assert refreshed_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert refreshed_tag.results.read_text(encoding="utf-8").strip() == str(results_path) + assert refreshed_tag.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) From 958f2ae7ad17de1ff65985e3e68c95ac8feda5bd Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:23 +0300 Subject: [PATCH 67/92] Prevent partial runs from becoming latest baseline --- examples/demo_qa/runs/layout.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index 8c0d61e..c527d04 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -62,7 +62,7 @@ def _load_latest_run(artifacts_dir: Path, tag: str | None = None, *, kind: str = return None -def _resolve_results_path_for_run(run_path: Path | None) -> Optional[Path]: +def _resolve_results_path_for_run(run_path: Path | None, *, require_complete: bool = False) -> Optional[Path]: if run_path is None: return None summary_path = run_path / "summary.json" @@ -70,12 +70,14 @@ def _resolve_results_path_for_run(run_path: Path | None) -> Optional[Path]: try: summary = json.loads(summary_path.read_text(encoding="utf-8")) results_path = summary.get("results_path") + if require_complete and summary.get("results_complete") is False: + return None if results_path: return Path(results_path) except Exception: pass candidate = run_path / "results.jsonl" - if candidate.exists(): + if candidate.exists() and not require_complete: return candidate return None @@ -86,7 +88,7 @@ def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optiona if resolved: return resolved latest_run = _load_latest_run(artifacts_dir, tag, kind="complete") - return _resolve_results_path_for_run(latest_run) + return _resolve_results_path_for_run(latest_run, require_complete=True) def _load_latest_any_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: @@ -137,8 +139,8 @@ def _update_latest_markers( for markers in marker_sets: markers.complete.parent.mkdir(parents=True, exist_ok=True) markers.any_run.write_text(str(run_folder), encoding="utf-8") - markers.legacy_run.write_text(str(run_folder), encoding="utf-8") if results_complete: + markers.legacy_run.write_text(str(run_folder), encoding="utf-8") markers.complete.write_text(str(run_folder), encoding="utf-8") markers.results.write_text(str(results_path), encoding="utf-8") From 2995f016fcd639110d3d5a63abfe42beb1e74276 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:27 +0300 Subject: [PATCH 68/92] Fix overlay lookup and only-missed base selection --- examples/demo_qa/batch.py | 15 +++++++++++++-- examples/demo_qa/runs/layout.py | 3 +-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 26e07ee..494881e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -499,7 +499,9 @@ def handle_batch(args) -> int: exclude_ids=exclude_ids, ) suite_case_ids = [case.id for case in 
filtered_cases] + filtered_case_lookup = {case.id: case for case in filtered_cases} cases = filtered_cases + failed_selection_ids: set[str] | None = None if args.only_failed: selection_ids, breakdown = _only_failed_selection( @@ -509,6 +511,7 @@ def handle_batch(args) -> int: require_assert=args.require_assert, ) cases = [case for case in cases if case.id in selection_ids] + failed_selection_ids = selection_ids healed = breakdown.get("healed", set()) baseline_fails = breakdown.get("baseline_failures", set()) new_failures = breakdown.get("new_failures", set()) @@ -559,13 +562,21 @@ def handle_batch(args) -> int: if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - selected_case_ids = [case.id for case in cases] + selected_case_ids = suite_case_ids missed_ids, missed_breakdown = _only_missed_selection( selected_case_ids, missed_baseline_results, overlay_results if not args.no_overlay else None, ) - cases = [case for case in cases if case.id in missed_ids] + base_pool = filtered_case_lookup + target_ids = missed_ids + if args.only_failed and failed_selection_ids is not None: + target_ids = target_ids & failed_selection_ids + print( + f"Combining --only-failed and --only-missed via intersection: {len(target_ids)} cases remain.", + file=sys.stderr, + ) + cases = [case for cid, case in base_pool.items() if cid in target_ids] print(f"Baseline (missed) results: {missed_baseline_path}", file=sys.stderr) print(f"Overlay executed: {len(missed_breakdown.get('overlay_executed', set()))}", file=sys.stderr) print(f"Missed in baseline: {len(missed_breakdown.get('missed_base', set()))}", file=sys.stderr) diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index c527d04..022fc08 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -98,8 +98,7 @@ def _load_latest_any_results(artifacts_dir: Path, tag: str | None = None) -> Opt results = _resolve_results_path_for_run(latest_run) if results: return results - markers = _latest_markers(artifacts_dir, tag) - return _read_marker(markers.results) + return None def _load_run_meta(run_path: Path | None) -> Optional[dict]: From ce0d9c799199e2db7ae2646cd230d9ec236d0be6 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:32 +0300 Subject: [PATCH 69/92] Add anti-flake passes and update messaging --- examples/demo_qa/batch.py | 62 +++++++++++++++++++++++++++++++++++++-- examples/demo_qa/cli.py | 6 ++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 494881e..f42f9de 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -30,7 +30,7 @@ save_status, summarize, ) -from .runs.case_history import _append_case_history +from .runs.case_history import _append_case_history, _load_case_history from .runs.coverage import _missed_case_ids from .runs.effective import ( _append_effective_diff, @@ -118,19 +118,70 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: return ids +def _consecutive_passes( + case_id: str, + overlay_result: RunResult, + artifacts_dir: Path, + *, + tag: str | None, + scope_hash: str, + passes_required: int, + fail_on: str, + require_assert: bool, +) -> bool: + if passes_required <= 1: + return True + bad = bad_statuses(fail_on, require_assert) + if overlay_result.status in bad: + return False + count = 1 + history_path = artifacts_dir / "runs" / 
"cases" / f"{case_id}.jsonl" + entries = list(reversed(_load_case_history(history_path))) + for entry in entries: + if tag is not None and entry.get("tag") != tag: + continue + if scope_hash and entry.get("scope_hash") not in (None, scope_hash): + continue + status = str(entry.get("status", "")) + if status in bad: + break + count += 1 + if count >= passes_required: + return True + return count >= passes_required + + def _only_failed_selection( baseline_results: Mapping[str, RunResult] | None, overlay_results: Mapping[str, RunResult] | None, *, fail_on: str, require_assert: bool, + artifacts_dir: Path, + tag: str | None, + scope_hash: str, + anti_flake_passes: int, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} bad = bad_statuses(fail_on, require_assert) baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} - overlay_good = {cid for cid, res in overlay.items() if res.status not in bad} + overlay_good = { + cid + for cid, res in overlay.items() + if res.status not in bad + and _consecutive_passes( + cid, + res, + artifacts_dir, + tag=tag, + scope_hash=scope_hash, + passes_required=anti_flake_passes, + fail_on=fail_on, + require_assert=require_assert, + ) + } healed = baseline_bad & overlay_good selection = (baseline_bad - healed) | overlay_bad @@ -509,6 +560,10 @@ def handle_batch(args) -> int: overlay_results if not args.no_overlay else None, fail_on=args.fail_on, require_assert=args.require_assert, + artifacts_dir=artifacts_dir, + tag=args.tag, + scope_hash=scope_id, + anti_flake_passes=max(1, int(args.anti_flake_passes)), ) cases = [case for case in cases if case.id in selection_ids] failed_selection_ids = selection_ids @@ -831,6 +886,7 @@ def handle_batch(args) -> int: "fail_fast": args.fail_fast, "max_fails": args.max_fails, "no_overlay": args.no_overlay, + "anti_flake_passes": args.anti_flake_passes, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -1020,7 +1076,7 @@ def handle_case_open(args) -> int: artifacts_dir = args.artifacts_dir or (args.data / ".runs") run_path = _resolve_run_path(args.run, artifacts_dir) if not run_path: - print("No run found. Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) + print("No run found. 
Provide --run or ensure runs/latest_any.txt exists (run a batch first).", file=sys.stderr) return 2 case_dir = _find_case_artifact(run_path, args.case_id) if not case_dir: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 6bcab36..7acc050 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -96,6 +96,12 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Ignore latest partial run when selecting only-failed/only-missed (use baseline only)", ) + batch_p.add_argument( + "--anti-flake-passes", + type=int, + default=2, + help="Require N consecutive PASS results to consider a test healed (applies to --only-failed overlay logic)", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") From 6ec96a10ee0a76e42d6bac80f884ace07c8c4fa8 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:36 +0300 Subject: [PATCH 70/92] Improve overlay logging and baseline flag help --- examples/demo_qa/batch.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index f42f9de..4af12c2 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -485,6 +485,10 @@ def handle_batch(args) -> int: only_failed_baseline_kind: str | None = None if baseline_filter_path_arg: only_failed_baseline_kind = "path" + print( + "Using explicit baseline from --only-failed-from; overlay (latest any run) will still be considered unless --no-overlay is set.", + file=sys.stderr, + ) elif args.tag and args.only_failed: effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) if not effective_results: @@ -570,10 +574,23 @@ def handle_batch(args) -> int: healed = breakdown.get("healed", set()) baseline_fails = breakdown.get("baseline_failures", set()) new_failures = breakdown.get("new_failures", set()) - baseline_label = str(_run_dir_from_results_path(baseline_filter_path) or baseline_filter_path or "n/a") - overlay_label = str(overlay_run_path or overlay_results_path or "n/a") - print(f"Baseline: {baseline_label}", file=sys.stderr) - print(f"Overlay: {overlay_label}", file=sys.stderr) + baseline_meta = _load_run_meta(_run_dir_from_results_path(baseline_filter_path)) + baseline_label = baseline_meta.get("run_id") if isinstance(baseline_meta, dict) else None + baseline_status = baseline_meta.get("run_status") if isinstance(baseline_meta, dict) else None + overlay_meta = _load_run_meta(overlay_run_path) + overlay_label = overlay_meta.get("run_id") if isinstance(overlay_meta, dict) else None + overlay_status = overlay_meta.get("run_status") if isinstance(overlay_meta, dict) else None + baseline_complete = baseline_meta.get("results_complete") if isinstance(baseline_meta, dict) else None + overlay_complete = overlay_meta.get("results_complete") if isinstance(overlay_meta, dict) else None + scope_display = scope_id or "n/a" + print( + f"Baseline: run_id={baseline_label or 'n/a'} status={baseline_status or 'n/a'} complete={baseline_complete} scope={scope_display}", + file=sys.stderr, + ) + print( + f"Overlay: run_id={overlay_label or 'n/a'} status={overlay_status or 'n/a'} complete={overlay_complete} scope={scope_display}", 
+ file=sys.stderr, + ) print(f"Baseline failures: {len(baseline_fails)}", file=sys.stderr) print(f"Healed by overlay: {len(healed)}", file=sys.stderr) print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) From ef193309bcfc7bf591bf89beb1290c9603a6fe4a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:41 +0300 Subject: [PATCH 71/92] Harden selection API defaults and stabilize only-missed logging --- examples/demo_qa/batch.py | 35 +++++++++++++++++------------------ tests/test_demo_qa_batch.py | 9 ++++++++- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 4af12c2..d7a6012 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -121,16 +121,16 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: def _consecutive_passes( case_id: str, overlay_result: RunResult, - artifacts_dir: Path, + artifacts_dir: Path | None = None, *, - tag: str | None, - scope_hash: str, - passes_required: int, - fail_on: str, - require_assert: bool, + tag: str | None = None, + scope_hash: str = "", + passes_required: int = 1, + fail_on: str = "bad", + require_assert: bool = False, ) -> bool: - if passes_required <= 1: - return True + if passes_required <= 1 or artifacts_dir is None: + return overlay_result.status not in bad_statuses(fail_on, require_assert) bad = bad_statuses(fail_on, require_assert) if overlay_result.status in bad: return False @@ -155,12 +155,12 @@ def _only_failed_selection( baseline_results: Mapping[str, RunResult] | None, overlay_results: Mapping[str, RunResult] | None, *, - fail_on: str, - require_assert: bool, - artifacts_dir: Path, - tag: str | None, - scope_hash: str, - anti_flake_passes: int, + fail_on: str = "bad", + require_assert: bool = False, + artifacts_dir: Path | None = None, + tag: str | None = None, + scope_hash: str = "", + anti_flake_passes: int = 1, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} @@ -640,19 +640,18 @@ def handle_batch(args) -> int: missed_baseline_results, overlay_results if not args.no_overlay else None, ) - base_pool = filtered_case_lookup target_ids = missed_ids if args.only_failed and failed_selection_ids is not None: target_ids = target_ids & failed_selection_ids print( - f"Combining --only-failed and --only-missed via intersection: {len(target_ids)} cases remain.", + f"Combining --only-failed and --only-missed via intersection: {len(target_ids)} cases remain (missed={len(missed_ids)}).", file=sys.stderr, ) - cases = [case for cid, case in base_pool.items() if cid in target_ids] + cases = [case for case in filtered_cases if case.id in target_ids] print(f"Baseline (missed) results: {missed_baseline_path}", file=sys.stderr) print(f"Overlay executed: {len(missed_breakdown.get('overlay_executed', set()))}", file=sys.stderr) print(f"Missed in baseline: {len(missed_breakdown.get('missed_base', set()))}", file=sys.stderr) - print(f"Final only-missed selection: {len(missed_ids)}", file=sys.stderr) + print(f"Final only-missed selection: {len(target_ids)}", file=sys.stderr) if not cases: print("0 missed cases selected.", file=sys.stderr) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 6226fc4..ddf1b0f 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -118,7 +118,14 @@ def test_only_failed_selection_uses_overlay_and_baseline() -> None: baseline = {"a": 
_mk_result("a", "failed"), "b": _mk_result("b", "failed")} overlay = {"a": _mk_result("a", "ok"), "c": _mk_result("c", "failed")} - selection, breakdown = _only_failed_selection(baseline, overlay, fail_on="bad", require_assert=False) + selection, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=None, + anti_flake_passes=1, + ) assert selection == {"b", "c"} assert breakdown["healed"] == {"a"} From 59c1a3f18f4654097272bf30332216ff696f0638 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:46 +0300 Subject: [PATCH 72/92] Clarify scope handling and streamline overlay resolution --- examples/demo_qa/batch.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index d7a6012..a0f1281 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -140,6 +140,7 @@ def _consecutive_passes( for entry in entries: if tag is not None and entry.get("tag") != tag: continue + # Old history entries may not contain scope_hash; treat missing as compatible for migration. if scope_hash and entry.get("scope_hash") not in (None, scope_hash): continue status = str(entry.get("status", "")) @@ -535,9 +536,7 @@ def handle_batch(args) -> int: return 2 overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") - overlay_results_path = _resolve_results_path_for_run(overlay_run_path) or _load_latest_any_results( - artifacts_dir, args.tag - ) + overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) if overlay_results_path and not args.no_overlay: try: overlay_results = load_results(overlay_results_path) @@ -1092,7 +1091,10 @@ def handle_case_open(args) -> int: artifacts_dir = args.artifacts_dir or (args.data / ".runs") run_path = _resolve_run_path(args.run, artifacts_dir) if not run_path: - print("No run found. Provide --run or ensure runs/latest_any.txt exists (run a batch first).", file=sys.stderr) + print( + "No run found. 
Provide --run or ensure latest markers exist (latest_any/latest_complete); run a batch first.", + file=sys.stderr, + ) return 2 case_dir = _find_case_artifact(run_path, args.case_id) if not case_dir: From 17e8154dd8b014ab9ebf3a5f7ccf61cfbbf24de0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:50:24 +0300 Subject: [PATCH 73/92] Refine overlay logging and handling when disabled --- examples/demo_qa/batch.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a0f1281..49b8b54 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -535,8 +535,12 @@ def handle_batch(args) -> int: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 - overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") - overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) + overlay_run_path = None + overlay_results_path = None + overlay_disabled = args.no_overlay + if not overlay_disabled: + overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") + overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) if overlay_results_path and not args.no_overlay: try: overlay_results = load_results(overlay_results_path) @@ -586,10 +590,14 @@ def handle_batch(args) -> int: f"Baseline: run_id={baseline_label or 'n/a'} status={baseline_status or 'n/a'} complete={baseline_complete} scope={scope_display}", file=sys.stderr, ) - print( - f"Overlay: run_id={overlay_label or 'n/a'} status={overlay_status or 'n/a'} complete={overlay_complete} scope={scope_display}", - file=sys.stderr, + overlay_line = ( + "Overlay: disabled (--no-overlay)" + if args.no_overlay + else f"Overlay: run_id={overlay_label or 'n/a'} status={overlay_status or 'n/a'} complete={overlay_complete} scope={scope_display}" ) + if overlay_results_path is None and not args.no_overlay: + overlay_line = "Overlay: none (no latest_any run)" + print(overlay_line, file=sys.stderr) print(f"Baseline failures: {len(baseline_fails)}", file=sys.stderr) print(f"Healed by overlay: {len(healed)}", file=sys.stderr) print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) From 5036908d24917f42b91f36814a865f2e2900eb05 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:54:28 +0300 Subject: [PATCH 74/92] Tighten anti-flake scope handling and marker coverage --- examples/demo_qa/batch.py | 21 ++++++++++++++------- examples/demo_qa/cli.py | 5 +++++ tests/test_demo_qa_batch.py | 4 ++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 49b8b54..fba9d11 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -44,7 +44,6 @@ _load_latest_any_results, _load_latest_run, _load_run_meta, - _resolve_results_path_for_run, _run_dir_from_results_path, _update_latest_markers, ) @@ -128,6 +127,7 @@ def _consecutive_passes( passes_required: int = 1, fail_on: str = "bad", require_assert: bool = False, + strict_scope_history: bool = False, ) -> bool: if passes_required <= 1 or artifacts_dir is None: return overlay_result.status not in bad_statuses(fail_on, require_assert) @@ -140,9 +140,11 @@ def _consecutive_passes( for entry in entries: if tag is not None and entry.get("tag") != tag: continue - # Old history entries may not contain 
scope_hash; treat missing as compatible for migration. - if scope_hash and entry.get("scope_hash") not in (None, scope_hash): - continue + # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict. + if scope_hash: + entry_scope = entry.get("scope_hash") + if entry_scope != scope_hash and (strict_scope_history or entry_scope is not None): + continue status = str(entry.get("status", "")) if status in bad: break @@ -162,6 +164,7 @@ def _only_failed_selection( tag: str | None = None, scope_hash: str = "", anti_flake_passes: int = 1, + strict_scope_history: bool = False, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} @@ -181,6 +184,7 @@ def _only_failed_selection( passes_required=anti_flake_passes, fail_on=fail_on, require_assert=require_assert, + strict_scope_history=strict_scope_history, ) } @@ -459,7 +463,6 @@ def handle_batch(args) -> int: baseline_for_compare: Optional[Mapping[str, RunResult]] = None failed_baseline_results: Optional[Mapping[str, RunResult]] = None - failed_baseline_path: Path | None = None missed_baseline_results: Optional[Mapping[str, RunResult]] = None missed_baseline_path: Path | None = None overlay_results: Optional[Mapping[str, RunResult]] = None @@ -571,6 +574,7 @@ def handle_batch(args) -> int: tag=args.tag, scope_hash=scope_id, anti_flake_passes=max(1, int(args.anti_flake_passes)), + strict_scope_history=args.strict_scope_history, ) cases = [case for case in cases if case.id in selection_ids] failed_selection_ids = selection_ids @@ -910,6 +914,7 @@ def handle_batch(args) -> int: "max_fails": args.max_fails, "no_overlay": args.no_overlay, "anti_flake_passes": args.anti_flake_passes, + "strict_scope_history": args.strict_scope_history, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -1055,7 +1060,8 @@ def handle_case_run(args) -> int: artifacts_dir = args.artifacts_dir or (args.data / ".runs") timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + run_id = uuid.uuid4().hex[:8] + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}_{run_id}" artifacts_root = run_folder / "cases" results_path = run_folder / "results.jsonl" @@ -1074,7 +1080,7 @@ def handle_case_run(args) -> int: run_status = "FAILED" if bad_count else "SUCCESS" exit_code = 1 if bad_count else 0 summary = { - "run_id": run_folder.name, + "run_id": run_id, "timestamp": timestamp + "Z", "counts": counts, "results_path": str(results_path), @@ -1085,6 +1091,7 @@ def handle_case_run(args) -> int: "total_selected": 1, "total_executed": 1, "exit_code": exit_code, + "run_dir": str(run_folder), } summary_path = write_summary(results_path, summary) _update_latest_markers(run_folder, results_path, artifacts_dir, None, results_complete=True) diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 7acc050..cf66445 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -102,6 +102,11 @@ def build_parser() -> argparse.ArgumentParser: default=2, help="Require N consecutive PASS results to consider a test healed (applies to --only-failed overlay logic)", ) + batch_p.add_argument( + "--strict-scope-history", + action="store_true", + help="Require scope_hash match in history when counting consecutive passes (disable migration fallback)", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") 
batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index ddf1b0f..26791c5 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -157,11 +157,13 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: assert latest_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert latest_default.results.read_text(encoding="utf-8").strip() == str(results_path) assert latest_default.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_default.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) latest_tag = _latest_markers(artifacts_dir, "feature/beta") assert latest_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert latest_tag.results.read_text(encoding="utf-8").strip() == str(results_path) assert latest_tag.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_tag.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) partial_dir = artifacts_dir / "runs" / "20240102_cases" partial_results = partial_dir / "results.jsonl" @@ -174,8 +176,10 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: assert refreshed_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert refreshed_default.results.read_text(encoding="utf-8").strip() == str(results_path) assert refreshed_default.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) + assert refreshed_default.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) refreshed_tag = _latest_markers(artifacts_dir, "feature/beta") assert refreshed_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert refreshed_tag.results.read_text(encoding="utf-8").strip() == str(results_path) assert refreshed_tag.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) + assert refreshed_tag.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) From b894f8860d06e3f779a01db0ae2c498c175d7abd Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 11:02:54 +0300 Subject: [PATCH 75/92] =?UTF-8?q?=D0=BA=D0=BE=D1=81=D0=BC=D0=B5=D1=82?= =?UTF-8?q?=D0=B8=D0=BA=D0=B0:=20=D1=81=D0=BE=D0=BE=D0=B1=D1=89=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20=D0=BE=D0=B1=20=D0=BE=D1=88=D0=B8=D0=B1?= =?UTF-8?q?=D0=BA=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index bc3cf66..c8ad04d 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -618,7 +618,7 @@ def handle_batch(args) -> int: ) save_status(stub) results.append(stub) - print("Interrupted during case execution; saved partial status.", file=sys.stderr) + print("\nInterrupted during case execution; saved partial status.", file=sys.stderr) break results.append(result) if not args.quiet: From b27e9d0dda964d29c74aaadb95a1a1da1ef7eb4a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:03:55 +0300 Subject: [PATCH 76/92] Clean unused imports and tighten marker handling --- examples/demo_qa/batch.py | 1 - examples/demo_qa/runs/layout.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index fba9d11..2fbd41e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -560,7 +560,6 @@ def handle_batch(args) -> int: exclude_ids=exclude_ids, ) suite_case_ids = [case.id for case in filtered_cases] - filtered_case_lookup = {case.id: case for case in filtered_cases} cases = filtered_cases failed_selection_ids: set[str] | None = None diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index 022fc08..89bbc09 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -136,7 +136,8 @@ def _update_latest_markers( if tag: marker_sets.add(_latest_markers(artifacts_dir, tag)) for markers in marker_sets: - markers.complete.parent.mkdir(parents=True, exist_ok=True) + for path in [markers.complete, markers.results, markers.any_run, markers.legacy_run]: + path.parent.mkdir(parents=True, exist_ok=True) markers.any_run.write_text(str(run_folder), encoding="utf-8") if results_complete: markers.legacy_run.write_text(str(run_folder), encoding="utf-8") From 1e14d104d958a0b02ee7a9962624d038d86d6eeb Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:14:06 +0300 Subject: [PATCH 77/92] Document scope migration and include run_dir in case summaries --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 2fbd41e..fbb3918 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -140,7 +140,7 @@ def _consecutive_passes( for entry in entries: if tag is not None and entry.get("tag") != tag: continue - # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict. + # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict_scope_history is set. 
if scope_hash: entry_scope = entry.get("scope_hash") if entry_scope != scope_hash and (strict_scope_history or entry_scope is not None): From 9dfaec7b83aef7d16a6f23b2b7d91b9848e0a003 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:27:30 +0300 Subject: [PATCH 78/92] Avoid overlay logging when disabled and remove unused imports --- examples/demo_qa/batch.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index fbb3918..fac8c16 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -544,13 +544,13 @@ def handle_batch(args) -> int: if not overlay_disabled: overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) - if overlay_results_path and not args.no_overlay: - try: - overlay_results = load_results(overlay_results_path) - except Exception as exc: - print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) - overlay_results_path = None - overlay_results = None + if overlay_results_path: + try: + overlay_results = load_results(overlay_results_path) + except Exception as exc: + print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) + overlay_results_path = None + overlay_results = None filtered_cases = _select_cases_for_rerun( cases, From b65f81b3bdb01bc2f53eb32d5a4aca3d5e72d57d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:34:10 +0300 Subject: [PATCH 79/92] Fix anti-flake double counting and respect baseline planned pool --- examples/demo_qa/batch.py | 17 ++++++++++++++- tests/test_demo_qa_batch.py | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index fac8c16..0f1b9bd 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -137,7 +137,11 @@ def _consecutive_passes( count = 1 history_path = artifacts_dir / "runs" / "cases" / f"{case_id}.jsonl" entries = list(reversed(_load_case_history(history_path))) + skip_first = True # overlay_result already counted; skip the most recent history entry for entry in entries: + if skip_first: + skip_first = False + continue if tag is not None and entry.get("tag") != tag: continue # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict_scope_history is set. 
@@ -607,6 +611,7 @@ def handle_batch(args) -> int: print(f"Final only-failed selection: {len(selection_ids)}", file=sys.stderr) only_missed_baseline_kind: str | None = None + missed_planned_ids: set[str] | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) if only_missed_from_arg: @@ -629,6 +634,9 @@ def handle_batch(args) -> int: missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" + planned_ids_meta = effective_meta.get("planned_case_ids") if isinstance(effective_meta, dict) else None + if isinstance(planned_ids_meta, list): + missed_planned_ids = {str(cid) for cid in planned_ids_meta} else: missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) if only_missed_from_arg: @@ -641,10 +649,17 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 + if missed_baseline_path is not None and missed_planned_ids is None: + missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) + baseline_meta = _load_run_meta(missed_baseline_run) + if isinstance(baseline_meta, dict): + planned_from_meta = baseline_meta.get("planned_case_ids") or baseline_meta.get("selected_case_ids") + if isinstance(planned_from_meta, list): + missed_planned_ids = {str(cid) for cid in planned_from_meta} if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - selected_case_ids = suite_case_ids + selected_case_ids = missed_planned_ids or set(suite_case_ids) missed_ids, missed_breakdown = _only_missed_selection( selected_case_ids, missed_baseline_results, diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 26791c5..34669de 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -144,6 +144,49 @@ def test_only_missed_selection_uses_overlay_executed() -> None: assert breakdown["overlay_executed"] == {"c"} +def test_anti_flake_requires_two_passes_without_double_count(tmp_path: Path) -> None: + artifacts_dir = tmp_path + case_id = "x1" + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True) + history_file = history_dir / f"{case_id}.jsonl" + # most recent in history = overlay run we already count + history_file.write_text( + json.dumps({"status": "ok", "scope_hash": "s"}, ensure_ascii=False) + "\n" + + json.dumps({"status": "failed", "scope_hash": "s"}, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + overlay_res = _mk_result(case_id, "ok") + healed = _consecutive_passes( + case_id, + overlay_res, + artifacts_dir, + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + strict_scope_history=True, + ) + assert healed is False + + +def test_only_missed_uses_planned_pool_from_baseline_meta(tmp_path: Path) -> None: + artifacts_dir = tmp_path + run_dir = artifacts_dir / "runs" / "r1" + run_dir.mkdir(parents=True) + baseline_results = {"a": _mk_result("a", "ok")} + results_path = run_dir / "results.jsonl" + results_path.write_text("", encoding="utf-8") + meta = {"planned_case_ids": ["a", "b"], "selected_case_ids": ["a", "b"], "scope_hash": "s"} + (run_dir / "run_meta.json").write_text(json.dumps(meta), encoding="utf-8") + + overlay = {"c": _mk_result("c", "ok")} + planned_pool = {"a", "b"} + missed, _ = _only_missed_selection(planned_pool, baseline_results, overlay) + + assert missed == {"b"} + + def 
test_update_latest_markers_handles_tag(tmp_path: Path) -> None: artifacts_dir = tmp_path / "data" / ".runs" run_dir = artifacts_dir / "runs" / "20240101_cases" From be56c6b61fb6a82d424e34981ecc515878a5e578 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 12:00:28 +0300 Subject: [PATCH 80/92] Add planned-pool helper and anti-flake regression tests --- examples/demo_qa/batch.py | 32 +++++++++++++++++++++----------- tests/test_demo_qa_batch.py | 9 +++------ 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 0f1b9bd..da379da 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -216,6 +216,24 @@ def _only_missed_selection( return missed_final, breakdown +def _planned_pool_from_meta( + effective_meta: Mapping[str, object] | None, baseline_results_path: Path | None, suite_case_ids: Iterable[str] +) -> set[str]: + planned: set[str] | None = None + if effective_meta: + planned_from_eff = effective_meta.get("planned_case_ids") + if isinstance(planned_from_eff, list): + planned = {str(cid) for cid in planned_from_eff} + if planned is None and baseline_results_path is not None: + run_dir = _run_dir_from_results_path(baseline_results_path) + meta = _load_run_meta(run_dir) + if isinstance(meta, dict): + planned_from_meta = meta.get("planned_case_ids") or meta.get("selected_case_ids") + if isinstance(planned_from_meta, list): + planned = {str(cid) for cid in planned_from_meta} + return planned or set(suite_case_ids) + + def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: entries: list[dict] = [] total_bytes = 0 @@ -612,6 +630,7 @@ def handle_batch(args) -> int: only_missed_baseline_kind: str | None = None missed_planned_ids: set[str] | None = None + missed_effective_meta: Mapping[str, object] | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) if only_missed_from_arg: @@ -634,9 +653,7 @@ def handle_batch(args) -> int: missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" - planned_ids_meta = effective_meta.get("planned_case_ids") if isinstance(effective_meta, dict) else None - if isinstance(planned_ids_meta, list): - missed_planned_ids = {str(cid) for cid in planned_ids_meta} + missed_effective_meta = effective_meta else: missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) if only_missed_from_arg: @@ -649,17 +666,10 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 - if missed_baseline_path is not None and missed_planned_ids is None: - missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) - baseline_meta = _load_run_meta(missed_baseline_run) - if isinstance(baseline_meta, dict): - planned_from_meta = baseline_meta.get("planned_case_ids") or baseline_meta.get("selected_case_ids") - if isinstance(planned_from_meta, list): - missed_planned_ids = {str(cid) for cid in planned_from_meta} if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - selected_case_ids = missed_planned_ids or set(suite_case_ids) + selected_case_ids = _planned_pool_from_meta(missed_effective_meta, missed_baseline_path, suite_case_ids) missed_ids, missed_breakdown = _only_missed_selection( 
selected_case_ids, missed_baseline_results, diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 34669de..61eab1d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -13,6 +13,7 @@ _fingerprint_dir, _only_failed_selection, _only_missed_selection, + _planned_pool_from_meta, bad_statuses, is_failure, render_markdown, @@ -174,17 +175,13 @@ def test_only_missed_uses_planned_pool_from_baseline_meta(tmp_path: Path) -> Non artifacts_dir = tmp_path run_dir = artifacts_dir / "runs" / "r1" run_dir.mkdir(parents=True) - baseline_results = {"a": _mk_result("a", "ok")} results_path = run_dir / "results.jsonl" results_path.write_text("", encoding="utf-8") meta = {"planned_case_ids": ["a", "b"], "selected_case_ids": ["a", "b"], "scope_hash": "s"} (run_dir / "run_meta.json").write_text(json.dumps(meta), encoding="utf-8") - overlay = {"c": _mk_result("c", "ok")} - planned_pool = {"a", "b"} - missed, _ = _only_missed_selection(planned_pool, baseline_results, overlay) - - assert missed == {"b"} + planned_pool = _planned_pool_from_meta(None, results_path, ["x", "y"]) + assert planned_pool == {"a", "b"} def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: From 9062c690decc8b33f55ac3ada0f6be3d7d019eba Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 13:16:52 +0300 Subject: [PATCH 81/92] Improve anti-flake history handling (#92) * Improve anti-flake history handling * Fix anti-flake history filtering and ts handling --- examples/demo_qa/batch.py | 152 +++++++++++++++++++------- examples/demo_qa/cli.py | 11 ++ examples/demo_qa/runs/case_history.py | 113 ++++++++++++++++++- tests/test_demo_qa_batch.py | 99 +++++++++++++++-- 4 files changed, 327 insertions(+), 48 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index da379da..b1d39a1 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -30,7 +30,7 @@ save_status, summarize, ) -from .runs.case_history import _append_case_history, _load_case_history +from .runs.case_history import _append_case_history, _iter_case_entries_newest_first from .runs.coverage import _missed_case_ids from .runs.effective import ( _append_effective_diff, @@ -119,8 +119,8 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: def _consecutive_passes( case_id: str, - overlay_result: RunResult, - artifacts_dir: Path | None = None, + overlay_entry: Mapping[str, object] | None, + history_path: Path | None, *, tag: str | None = None, scope_hash: str = "", @@ -128,34 +128,38 @@ def _consecutive_passes( fail_on: str = "bad", require_assert: bool = False, strict_scope_history: bool = False, -) -> bool: - if passes_required <= 1 or artifacts_dir is None: - return overlay_result.status not in bad_statuses(fail_on, require_assert) + max_entries: int | None = None, +) -> tuple[bool, list[dict]]: bad = bad_statuses(fail_on, require_assert) - if overlay_result.status in bad: - return False - count = 1 - history_path = artifacts_dir / "runs" / "cases" / f"{case_id}.jsonl" - entries = list(reversed(_load_case_history(history_path))) - skip_first = True # overlay_result already counted; skip the most recent history entry - for entry in entries: - if skip_first: - skip_first = False - continue - if tag is not None and entry.get("tag") != tag: - continue - # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict_scope_history is set. 
- if scope_hash: - entry_scope = entry.get("scope_hash") - if entry_scope != scope_hash and (strict_scope_history or entry_scope is not None): - continue + if overlay_entry is None: + return False, [] + if passes_required <= 1: + return (overlay_entry.get("status") not in bad, [dict(overlay_entry)]) + if history_path is None: + return False, [dict(overlay_entry)] + entries: list[dict] = [] + passes_needed = max(passes_required, 1) + iterator = _iter_case_entries_newest_first( + history_path, + case_id, + tag, + scope_hash or None, + strict_scope=strict_scope_history, + fail_on=fail_on, + require_assert=require_assert, + overlay_entry=dict(overlay_entry) if overlay_entry else None, + max_entries=max_entries or (passes_needed + 5), + ) + consecutive = 0 + for entry in iterator: + entries.append(entry) status = str(entry.get("status", "")) if status in bad: - break - count += 1 - if count >= passes_required: - return True - return count >= passes_required + return False, entries + consecutive += 1 + if consecutive >= passes_needed: + return True, entries + return False, entries def _only_failed_selection( @@ -169,20 +173,45 @@ def _only_failed_selection( scope_hash: str = "", anti_flake_passes: int = 1, strict_scope_history: bool = False, + overlay_run_meta: Mapping[str, object] | None = None, + overlay_run_path: Path | None = None, + explain_selection: bool = False, + explain_limit: int = 20, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} bad = bad_statuses(fail_on, require_assert) baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} - overlay_good = { - cid - for cid, res in overlay.items() - if res.status not in bad - and _consecutive_passes( + overlay_run_id = cast(Optional[str], overlay_run_meta.get("run_id") if isinstance(overlay_run_meta, Mapping) else None) + overlay_ts: Optional[object] = None + if isinstance(overlay_run_meta, Mapping): + overlay_ts = overlay_run_meta.get("ended_at") or overlay_run_meta.get("timestamp") or overlay_run_meta.get("started_at") + if overlay_ts is None and overlay_run_path and overlay_run_path.exists(): + try: + overlay_ts = overlay_run_path.stat().st_mtime + except OSError: + overlay_ts = None + overlay_entries: dict[str, dict] = {} + for cid, res in overlay.items(): + entry = { + "run_id": overlay_run_id or (str(overlay_run_path) if overlay_run_path else "overlay"), + "ts": overlay_ts, + "timestamp": overlay_ts, + "status": res.status, + "scope_hash": scope_hash, + "tag": tag, + "run_dir": str(overlay_run_path) if overlay_run_path else None, + } + overlay_entries[cid] = {k: v for k, v in entry.items() if v is not None} + + overlay_good: set[str] = set() + healed_details: dict[str, list[dict]] = {} + for cid, res in overlay.items(): + ok, history_entries = _consecutive_passes( cid, - res, - artifacts_dir, + overlay_entries.get(cid), + artifacts_dir / "runs" / "cases" / f"{cid}.jsonl" if artifacts_dir else None, tag=tag, scope_hash=scope_hash, passes_required=anti_flake_passes, @@ -190,7 +219,10 @@ def _only_failed_selection( require_assert=require_assert, strict_scope_history=strict_scope_history, ) - } + if ok: + overlay_good.add(cid) + if explain_selection: + healed_details[cid] = history_entries healed = baseline_bad & overlay_good selection = (baseline_bad - healed) | overlay_bad @@ -199,9 +231,36 @@ def _only_failed_selection( "healed": healed, "new_failures": overlay_bad, } + if explain_selection 
and healed_details: + limit = max(1, explain_limit) + breakdown["healed_details"] = {cid: healed_details[cid] for cid in list(sorted(healed_details))[:limit]} return selection, breakdown +def _format_healed_explain( + healed: Iterable[str], + healed_details: Mapping[str, list[dict]] | None, + *, + anti_flake_passes: int, + limit: int, +) -> list[str]: + details = healed_details or {} + max_entries = max(1, limit) + lines: list[str] = [] + healed_list = sorted(set(healed)) + for cid in healed_list[:max_entries]: + entries = details.get(cid, []) + lines.append(f"Healed because last {anti_flake_passes} results are PASS for case {cid}") + for entry in entries[:anti_flake_passes]: + rid = entry.get("run_id") + ts = entry.get("ts") or entry.get("timestamp") + status = entry.get("status") + lines.append(f" - run_id={rid} ts={ts} status={status}") + if len(healed_list) > max_entries: + lines.append(f"... {len(healed_list) - max_entries} more healed cases not shown (limit={max_entries})") + return lines + + def _only_missed_selection( selected_case_ids: Iterable[str], baseline_results: Mapping[str, RunResult] | None, @@ -563,6 +622,7 @@ def handle_batch(args) -> int: overlay_run_path = None overlay_results_path = None overlay_disabled = args.no_overlay + overlay_run_meta: Optional[Mapping[str, object]] = None if not overlay_disabled: overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) @@ -573,6 +633,8 @@ def handle_batch(args) -> int: print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) overlay_results_path = None overlay_results = None + if overlay_run_path: + overlay_run_meta = _load_run_meta(overlay_run_path) filtered_cases = _select_cases_for_rerun( cases, @@ -596,6 +658,10 @@ def handle_batch(args) -> int: scope_hash=scope_id, anti_flake_passes=max(1, int(args.anti_flake_passes)), strict_scope_history=args.strict_scope_history, + overlay_run_meta=overlay_run_meta, + overlay_run_path=overlay_run_path, + explain_selection=args.explain_selection, + explain_limit=args.explain_limit, ) cases = [case for case in cases if case.id in selection_ids] failed_selection_ids = selection_ids @@ -605,7 +671,7 @@ def handle_batch(args) -> int: baseline_meta = _load_run_meta(_run_dir_from_results_path(baseline_filter_path)) baseline_label = baseline_meta.get("run_id") if isinstance(baseline_meta, dict) else None baseline_status = baseline_meta.get("run_status") if isinstance(baseline_meta, dict) else None - overlay_meta = _load_run_meta(overlay_run_path) + overlay_meta = overlay_run_meta if overlay_run_meta is not None else _load_run_meta(overlay_run_path) overlay_label = overlay_meta.get("run_id") if isinstance(overlay_meta, dict) else None overlay_status = overlay_meta.get("run_status") if isinstance(overlay_meta, dict) else None baseline_complete = baseline_meta.get("results_complete") if isinstance(baseline_meta, dict) else None @@ -627,6 +693,15 @@ def handle_batch(args) -> int: print(f"Healed by overlay: {len(healed)}", file=sys.stderr) print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) print(f"Final only-failed selection: {len(selection_ids)}", file=sys.stderr) + if args.explain_selection and healed: + healed_lines = _format_healed_explain( + healed, + breakdown.get("healed_details"), + anti_flake_passes=args.anti_flake_passes, + limit=args.explain_limit, + ) + for line in healed_lines: + print(line, file=sys.stderr) only_missed_baseline_kind: str | None = None 
missed_planned_ids: set[str] | None = None @@ -939,6 +1014,8 @@ def handle_batch(args) -> int: "no_overlay": args.no_overlay, "anti_flake_passes": args.anti_flake_passes, "strict_scope_history": args.strict_scope_history, + "explain_selection": args.explain_selection, + "explain_limit": args.explain_limit, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -1010,6 +1087,7 @@ def handle_batch(args) -> int: git_sha=git_sha, run_dir=run_folder, results_path=results_path, + run_ts=_isoformat_utc(ended_at), ) history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index cf66445..5938355 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -107,6 +107,17 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Require scope_hash match in history when counting consecutive passes (disable migration fallback)", ) + batch_p.add_argument( + "--explain-selection", + action="store_true", + help="Explain why cases were selected/healed when using --only-failed/--only-missed", + ) + batch_p.add_argument( + "--explain-limit", + type=int, + default=20, + help="Maximum number of cases to include in explain output", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py index 860483c..55ac7fb 100644 --- a/examples/demo_qa/runs/case_history.py +++ b/examples/demo_qa/runs/case_history.py @@ -2,10 +2,15 @@ import datetime import json +import logging from pathlib import Path -from typing import Optional +from typing import Iterable, Mapping, Optional from ..runner import RunResult +from .layout import _load_run_meta + + +logger = logging.getLogger(__name__) def _reason_text(res: RunResult) -> str: @@ -33,11 +38,14 @@ def _append_case_history( git_sha: str | None, run_dir: Path, results_path: Path, + run_ts: str | None, ) -> None: history_dir = artifacts_dir / "runs" / "cases" history_dir.mkdir(parents=True, exist_ok=True) + ts = run_ts or datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z") payload = { - "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), + "timestamp": ts, + "ts": ts, "run_id": run_id, "tag": tag, "note": note, @@ -74,4 +82,103 @@ def _load_case_history(path: Path) -> list[dict]: return entries -__all__ = ["_append_case_history", "_load_case_history"] +def _parse_ts(value: object | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return datetime.datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp() + except Exception: + try: + return float(value) + except Exception: + return None + return None + + +def _entry_ts(entry: Mapping[str, object], *, run_dir: Path | None) -> tuple[float | None, str | None]: + ts = _parse_ts(entry.get("ts")) or _parse_ts(entry.get("timestamp")) + if ts is not None: + return ts, None + meta_ts: float | None = None + if run_dir: + meta = _load_run_meta(run_dir) + if isinstance(meta, dict): + meta_ts = _parse_ts(meta.get("ended_at") or meta.get("timestamp") or 
meta.get("started_at")) + if meta_ts is None: + try: + meta_ts = run_dir.stat().st_mtime + except OSError: + meta_ts = None + return meta_ts, "history order fallback used" + + +def _iter_case_entries_newest_first( + history_path: Path, + case_id: str, + tag: str | None, + scope_hash: str | None, + *, + strict_scope: bool, + fail_on: str, + require_assert: bool, + overlay_entry: dict | None, + max_entries: int, +) -> Iterable[dict]: + entries = list(_load_case_history(history_path)) if history_path.exists() else [] + overlay_index = None + if overlay_entry: + overlay_index = len(entries) + entries.append(dict(overlay_entry)) + + accepted: dict[str, dict] = {} + ts_map: dict[str, float | None] = {} + is_overlay_map: dict[str, bool] = {} + warnings_emitted = False + for idx, entry in enumerate(entries): + if tag is not None and entry.get("tag") != tag: + continue + entry_scope = entry.get("scope_hash") + if scope_hash: + if entry_scope != scope_hash and (strict_scope or entry_scope is not None): + continue + run_id = str(entry.get("run_id")) if entry.get("run_id") is not None else None + if not run_id: + continue + run_dir = None + if entry.get("run_dir"): + run_dir = Path(str(entry["run_dir"])) + ts_value, warn = _entry_ts(entry, run_dir=run_dir) + if ts_value is None and warn: + warnings_emitted = True + is_overlay = overlay_entry is not None and idx == overlay_index + current_ts = ts_map.get(run_id) + current_is_overlay = is_overlay_map.get(run_id, False) + candidate_ts = ts_value + should_replace = False + if run_id not in accepted: + should_replace = True + else: + if candidate_ts is not None: + if current_ts is None or candidate_ts > current_ts or ( + candidate_ts == current_ts and is_overlay and not current_is_overlay + ): + should_replace = True + else: + if current_ts is None and is_overlay and not current_is_overlay: + should_replace = True + if should_replace: + accepted[run_id] = entry + ts_map[run_id] = candidate_ts + is_overlay_map[run_id] = is_overlay + if warnings_emitted: + logger.warning("ts missing; history order fallback used for case %s", case_id) + + sorted_entries = sorted(accepted.items(), key=lambda kv: ts_map.get(kv[0], 0), reverse=True) + for _, entry in sorted_entries[:max_entries]: + yield entry + + +__all__ = ["_append_case_history", "_iter_case_entries_newest_first", "_load_case_history"] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 61eab1d..0c46705 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -10,6 +10,8 @@ import pytest from examples.demo_qa.batch import ( + _consecutive_passes, + _format_healed_explain, _fingerprint_dir, _only_failed_selection, _only_missed_selection, @@ -151,24 +153,92 @@ def test_anti_flake_requires_two_passes_without_double_count(tmp_path: Path) -> history_dir = artifacts_dir / "runs" / "cases" history_dir.mkdir(parents=True) history_file = history_dir / f"{case_id}.jsonl" - # most recent in history = overlay run we already count + now = "2024-01-02T00:00:00Z" + history_entries = [ + {"status": "ok", "scope_hash": "s", "run_id": "r1", "ts": now, "timestamp": now}, + ] + history_file.write_text("\n".join(json.dumps(e, ensure_ascii=False) for e in history_entries), encoding="utf-8") + overlay_entry = {"status": "ok", "scope_hash": "s", "run_id": "r1", "ts": now, "timestamp": now} + healed, _ = _consecutive_passes( + case_id, + overlay_entry, + artifacts_dir / "runs" / "cases" / f"{case_id}.jsonl", + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + 
strict_scope_history=True, + ) + assert healed is False + + +def test_anti_flake_order_independent(tmp_path: Path) -> None: + artifacts_dir = tmp_path + case_id = "case-1" + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True) + history_file = history_dir / f"{case_id}.jsonl" + entries = [ + {"status": "ok", "scope_hash": "s", "run_id": "r2", "ts": "2024-01-03T00:00:00Z"}, + {"status": "failed", "scope_hash": "s", "run_id": "r1", "ts": "2024-01-01T00:00:00Z"}, + {"status": "ok", "scope_hash": "s", "run_id": "r3", "ts": "2024-01-02T00:00:00Z"}, + ] + history_file.write_text("\n".join(json.dumps(e, ensure_ascii=False) for e in entries), encoding="utf-8") + overlay_entry = {"status": "ok", "scope_hash": "s", "run_id": "r4", "ts": "2024-01-04T00:00:00Z"} + + healed, used_entries = _consecutive_passes( + case_id, + overlay_entry, + history_file, + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + strict_scope_history=True, + ) + assert healed is True + assert used_entries[0]["run_id"] == "r4" + + +def test_anti_flake_respects_legacy_scope_when_not_strict(tmp_path: Path) -> None: + artifacts_dir = tmp_path + case_id = "case-legacy" + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True) + history_file = history_dir / f"{case_id}.jsonl" history_file.write_text( - json.dumps({"status": "ok", "scope_hash": "s"}, ensure_ascii=False) + "\n" - + json.dumps({"status": "failed", "scope_hash": "s"}, ensure_ascii=False) + "\n", + "\n".join( + json.dumps(e, ensure_ascii=False) + for e in [ + {"status": "ok", "scope_hash": None, "run_id": "r1", "ts": "2024-01-01T00:00:00Z"}, + ] + ), encoding="utf-8", ) - overlay_res = _mk_result(case_id, "ok") - healed = _consecutive_passes( + overlay_entry = {"status": "ok", "scope_hash": "s", "run_id": "r2", "ts": "2024-01-02T00:00:00Z"} + + healed_strict, _ = _consecutive_passes( case_id, - overlay_res, - artifacts_dir, + overlay_entry, + history_file, scope_hash="s", passes_required=2, fail_on="bad", require_assert=False, strict_scope_history=True, ) - assert healed is False + healed_migrating, _ = _consecutive_passes( + case_id, + overlay_entry, + history_file, + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + strict_scope_history=False, + ) + assert healed_strict is False + assert healed_migrating is True def test_only_missed_uses_planned_pool_from_baseline_meta(tmp_path: Path) -> None: @@ -223,3 +293,16 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: assert refreshed_tag.results.read_text(encoding="utf-8").strip() == str(results_path) assert refreshed_tag.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) assert refreshed_tag.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) + + +def test_format_healed_explain_includes_key_lines() -> None: + healed = {"a", "b"} + healed_details = { + "a": [ + {"run_id": "r2", "ts": "2024-01-02T00:00:00Z", "status": "ok"}, + {"run_id": "r1", "ts": "2024-01-01T00:00:00Z", "status": "ok"}, + ] + } + lines = _format_healed_explain(healed, healed_details, anti_flake_passes=2, limit=2) + assert any("Healed because last 2 results are PASS for case a" in line for line in lines) + assert any("run_id=r2" in line and "status=ok" in line for line in lines) From b58a4ce6c65e58e9da2b9fb558f52b3eda5bc63b Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 14:22:40 +0300 Subject: [PATCH 82/92] ruff fixes --- examples/demo_qa/batch.py | 4 ++-- tests/test_demo_qa_batch.py | 4 ++-- 
2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index b1d39a1..bd8fe39 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -14,8 +14,8 @@ from .provider_factory import build_provider from .runner import ( Case, - DiffReport, DiffCaseChange, + DiffReport, EventLogger, RunResult, RunTimings, @@ -40,8 +40,8 @@ ) from .runs.io import write_results from .runs.layout import ( - _load_latest_results, _load_latest_any_results, + _load_latest_results, _load_latest_run, _load_run_meta, _run_dir_from_results_path, diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 0c46705..f294848 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -11,8 +11,8 @@ from examples.demo_qa.batch import ( _consecutive_passes, - _format_healed_explain, _fingerprint_dir, + _format_healed_explain, _only_failed_selection, _only_missed_selection, _planned_pool_from_meta, @@ -21,9 +21,9 @@ render_markdown, write_results, ) +from examples.demo_qa.runner import DiffReport, RunResult, diff_runs from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers -from examples.demo_qa.runner import DiffReport, RunResult, diff_runs @pytest.mark.parametrize( From cdd6f35b1e70ed4fa20c2095d98f92e0d6a99726 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 14:49:50 +0300 Subject: [PATCH 83/92] Fix overlay scope handling for strict scope healing (#93) * Fix overlay scope handling for strict scope healing * Fix pyright typing issues in tests --- examples/demo_qa/batch.py | 50 +++++++++++++++++++++--- tests/test_demo_qa_batch.py | 74 ++++++++++++++++++++++++++++++++++++ tests/test_demo_qa_runner.py | 4 +- 3 files changed, 122 insertions(+), 6 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index bd8fe39..f92d9a2 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -184,14 +184,39 @@ def _only_failed_selection( baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} overlay_run_id = cast(Optional[str], overlay_run_meta.get("run_id") if isinstance(overlay_run_meta, Mapping) else None) + overlay_scope_hash = cast(Optional[str], overlay_run_meta.get("scope_hash") if isinstance(overlay_run_meta, Mapping) else None) + overlay_tag = cast(Optional[str], overlay_run_meta.get("tag") if isinstance(overlay_run_meta, Mapping) else None) overlay_ts: Optional[object] = None if isinstance(overlay_run_meta, Mapping): - overlay_ts = overlay_run_meta.get("ended_at") or overlay_run_meta.get("timestamp") or overlay_run_meta.get("started_at") + overlay_ts = ( + overlay_run_meta.get("ended_at") + or overlay_run_meta.get("started_at") + or overlay_run_meta.get("ts") + or overlay_run_meta.get("timestamp") + ) if overlay_ts is None and overlay_run_path and overlay_run_path.exists(): try: overlay_ts = overlay_run_path.stat().st_mtime except OSError: overlay_ts = None + + current_scope_hash = scope_hash or None + overlay_scope_matches_current = True + if strict_scope_history and current_scope_hash: + overlay_scope_matches_current = overlay_scope_hash == current_scope_hash + + explain_lines: list[str] = [] + if explain_selection: + explain_lines.append( + f"current_scope_hash={current_scope_hash} overlay_scope_hash={overlay_scope_hash} " + 
f"overlay_scope_matches_current={overlay_scope_matches_current}" + ) + if overlay_tag is None and tag is not None: + explain_lines.append(f"Overlay tag missing; using current tag={tag} for overlay entries") + elif overlay_tag is not None and tag is not None and overlay_tag != tag: + explain_lines.append( + f"Overlay tag differs from current selection; using overlay tag={overlay_tag} (current tag={tag})" + ) overlay_entries: dict[str, dict] = {} for cid, res in overlay.items(): entry = { @@ -199,18 +224,20 @@ def _only_failed_selection( "ts": overlay_ts, "timestamp": overlay_ts, "status": res.status, - "scope_hash": scope_hash, - "tag": tag, + "scope_hash": overlay_scope_hash, + "tag": overlay_tag if overlay_tag is not None else tag, "run_dir": str(overlay_run_path) if overlay_run_path else None, } overlay_entries[cid] = {k: v for k, v in entry.items() if v is not None} overlay_good: set[str] = set() healed_details: dict[str, list[dict]] = {} + scope_mismatch_warned = False for cid, res in overlay.items(): + overlay_entry_for_history = overlay_entries.get(cid) if overlay_scope_matches_current else None ok, history_entries = _consecutive_passes( cid, - overlay_entries.get(cid), + overlay_entry_for_history, artifacts_dir / "runs" / "cases" / f"{cid}.jsonl" if artifacts_dir else None, tag=tag, scope_hash=scope_hash, @@ -219,6 +246,18 @@ def _only_failed_selection( require_assert=require_assert, strict_scope_history=strict_scope_history, ) + if explain_selection and strict_scope_history and not overlay_scope_matches_current: + if res.status not in bad: + explain_lines.append( + f"Overlay PASS for case {cid} ignored due to strict scope mismatch " + f"(overlay_scope_hash={overlay_scope_hash}, current_scope_hash={current_scope_hash})" + ) + elif not scope_mismatch_warned: + explain_lines.append( + f"Overlay scope mismatch; overlay failures still counted (overlay_scope_hash={overlay_scope_hash}, " + f"current_scope_hash={current_scope_hash})" + ) + scope_mismatch_warned = True if ok: overlay_good.add(cid) if explain_selection: @@ -234,6 +273,8 @@ def _only_failed_selection( if explain_selection and healed_details: limit = max(1, explain_limit) breakdown["healed_details"] = {cid: healed_details[cid] for cid in list(sorted(healed_details))[:limit]} + if explain_selection and explain_lines: + breakdown["explain"] = explain_lines return selection, breakdown @@ -704,7 +745,6 @@ def handle_batch(args) -> int: print(line, file=sys.stderr) only_missed_baseline_kind: str | None = None - missed_planned_ids: set[str] | None = None missed_effective_meta: Mapping[str, object] | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index f294848..832d3f1 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -147,6 +147,80 @@ def test_only_missed_selection_uses_overlay_executed() -> None: assert breakdown["overlay_executed"] == {"c"} +def test_only_failed_strict_scope_ignores_overlay_pass(tmp_path: Path) -> None: + baseline = {"A": _mk_result("A", "failed")} + overlay = {"A": _mk_result("A", "ok")} + overlay_meta = {"run_id": "overlay", "scope_hash": "scope_overlay", "ended_at": "2024-01-01T00:00:00Z"} + + selection, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=tmp_path, + tag="t1", + scope_hash="scope_current", + anti_flake_passes=1, + strict_scope_history=True, + overlay_run_meta=overlay_meta, + 
overlay_run_path=tmp_path, + explain_selection=True, + ) + + assert selection == {"A"} + assert breakdown["healed"] == set() + explain_lines = cast(list[str], breakdown.get("explain", []) or []) + assert any("overlay_scope_matches_current=False" in line for line in explain_lines) + + +def test_only_failed_strict_scope_allows_overlay_pass_when_scope_matches(tmp_path: Path) -> None: + baseline = {"A": _mk_result("A", "failed")} + overlay = {"A": _mk_result("A", "ok")} + overlay_meta = {"run_id": "overlay", "scope_hash": "scope_current", "ended_at": "2024-01-01T00:00:00Z"} + + selection, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=tmp_path, + tag="t1", + scope_hash="scope_current", + anti_flake_passes=1, + strict_scope_history=True, + overlay_run_meta=overlay_meta, + overlay_run_path=tmp_path, + explain_selection=True, + ) + + assert selection == set() + assert breakdown["healed"] == {"A"} + + +def test_only_failed_explain_notes_scope_mismatch(tmp_path: Path) -> None: + baseline = {"A": _mk_result("A", "failed")} + overlay = {"A": _mk_result("A", "ok")} + overlay_meta = {"run_id": "overlay", "scope_hash": "scope_other", "ended_at": "2024-01-01T00:00:00Z"} + + _, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=tmp_path, + tag="t1", + scope_hash="scope_current", + anti_flake_passes=1, + strict_scope_history=True, + overlay_run_meta=overlay_meta, + overlay_run_path=tmp_path, + explain_selection=True, + ) + + explain_lines = cast(list[str], breakdown.get("explain", []) or []) + assert any("ignored due to strict scope mismatch" in line for line in explain_lines) + + def test_anti_flake_requires_two_passes_without_double_count(tmp_path: Path) -> None: artifacts_dir = tmp_path case_id = "x1" diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 0888dfe..eafa3fc 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import cast + from examples.demo_qa.runner import Case, RunResult, _match_expected, diff_runs, summarize @@ -9,7 +11,7 @@ def test_match_expected_unchecked_when_no_expectations() -> None: def test_match_expected_coerces_non_string_expected_values() -> None: - case = Case(id="c1", question="What is foo?", expected=42) + case = Case(id="c1", question="What is foo?", expected=cast(str, 42)) mismatch = _match_expected(case, "43") assert mismatch is not None From b91be9863eca28990a475b32f1b262a2b108e388 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 14:52:28 +0300 Subject: [PATCH 84/92] ruff fixes --- examples/demo_qa/runs/case_history.py | 1 - tests/test_demo_qa_runner.py | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py index 55ac7fb..783589f 100644 --- a/examples/demo_qa/runs/case_history.py +++ b/examples/demo_qa/runs/case_history.py @@ -9,7 +9,6 @@ from ..runner import RunResult from .layout import _load_run_meta - logger = logging.getLogger(__name__) diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index eafa3fc..67fc66f 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -2,7 +2,13 @@ from typing import cast -from examples.demo_qa.runner import Case, RunResult, _match_expected, diff_runs, summarize +from examples.demo_qa.runner import ( + Case, + RunResult, + _match_expected, + 
diff_runs, + summarize, +) def test_match_expected_unchecked_when_no_expectations() -> None: From 9da728f403a336f7691900ce5fd467576c7c57d4 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 16:04:07 +0300 Subject: [PATCH 85/92] Handle only-missed overlay scope checks (#94) * Handle only-missed overlay scope checks * Respect scope/tag when applying only-missed overlay * Surface tag match flag when selection tag present --- examples/demo_qa/batch.py | 47 +++++++++++++++++++++++++++++-- tests/test_demo_qa_batch.py | 56 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index f92d9a2..62ca4e9 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -306,13 +306,49 @@ def _only_missed_selection( selected_case_ids: Iterable[str], baseline_results: Mapping[str, RunResult] | None, overlay_results: Mapping[str, RunResult] | None, + *, + overlay_scope_hash: str | None = None, + selection_scope_hash: str | None = None, + overlay_tag: str | None = None, + selection_tag: str | None = None, + overlay_disabled_reason: str | None = None, + overlay_ignored_reason: str | None = None, ) -> tuple[set[str], dict[str, object]]: selected = set(selected_case_ids) baseline_ids = set(baseline_results.keys()) if baseline_results else set() - overlay_executed = set(overlay_results.keys()) if overlay_results else set() + overlay_scope_matches_current: bool | None = None + overlay_tag_matches_current: bool | None = None + overlay_results_for_calc: Mapping[str, RunResult] | None = None + ignored_reason = overlay_ignored_reason or overlay_disabled_reason + if overlay_results is not None and overlay_disabled_reason is None: + overlay_scope_matches_current = ( + overlay_scope_hash == selection_scope_hash if overlay_scope_hash is not None and selection_scope_hash is not None else None + ) + overlay_tag_matches_current = ( + overlay_tag == selection_tag if overlay_tag is not None and selection_tag is not None else None + ) + overlay_results_for_calc = overlay_results + if overlay_scope_matches_current is False: + overlay_results_for_calc = None + ignored_reason = "scope_mismatch" + elif overlay_tag_matches_current is False: + overlay_results_for_calc = None + ignored_reason = "tag_mismatch" + overlay_executed = set(overlay_results_for_calc.keys()) if overlay_results_for_calc else set() missed_base = selected - baseline_ids missed_final = missed_base - overlay_executed - breakdown = {"missed_base": missed_base, "overlay_executed": overlay_executed} + breakdown: dict[str, object] = { + "missed_base": missed_base, + "overlay_executed": overlay_executed, + "overlay_scope_hash": overlay_scope_hash, + "overlay_scope_matches_current": overlay_scope_matches_current, + } + if overlay_tag is not None: + breakdown["overlay_tag"] = overlay_tag + if selection_tag is not None: + breakdown["overlay_tag_matches_current"] = overlay_tag_matches_current + if ignored_reason: + breakdown["overlay_ignored_reason"] = ignored_reason return missed_final, breakdown @@ -784,11 +820,18 @@ def handle_batch(args) -> int: if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 + overlay_scope_hash = cast(Optional[str], overlay_run_meta.get("scope_hash") if isinstance(overlay_run_meta, Mapping) else None) + overlay_tag = cast(Optional[str], overlay_run_meta.get("tag") if 
isinstance(overlay_run_meta, Mapping) else None) selected_case_ids = _planned_pool_from_meta(missed_effective_meta, missed_baseline_path, suite_case_ids) missed_ids, missed_breakdown = _only_missed_selection( selected_case_ids, missed_baseline_results, overlay_results if not args.no_overlay else None, + overlay_scope_hash=overlay_scope_hash, + selection_scope_hash=scope_id, + overlay_tag=overlay_tag, + selection_tag=args.tag, + overlay_disabled_reason="no_overlay" if args.no_overlay else None, ) target_ids = missed_ids if args.only_failed and failed_selection_ids is not None: diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 832d3f1..0d625d9 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -147,6 +147,62 @@ def test_only_missed_selection_uses_overlay_executed() -> None: assert breakdown["overlay_executed"] == {"c"} +def test_only_missed_ignores_overlay_when_scope_mismatches() -> None: + baseline = {"A": _mk_result("A", "ok")} + overlay = {"B": _mk_result("B", "ok")} + + missed, breakdown = _only_missed_selection( + ["A", "B", "C"], + baseline, + overlay, + overlay_scope_hash="overlay_scope", + selection_scope_hash="current_scope", + ) + + assert missed == {"B", "C"} + assert breakdown["missed_base"] == {"B", "C"} + assert breakdown["overlay_executed"] == set() + assert breakdown["overlay_scope_hash"] == "overlay_scope" + assert breakdown["overlay_scope_matches_current"] is False + assert breakdown["overlay_ignored_reason"] == "scope_mismatch" + + +def test_only_missed_applies_overlay_when_scope_matches() -> None: + baseline = {"A": _mk_result("A", "ok")} + overlay = {"B": _mk_result("B", "ok")} + + missed, breakdown = _only_missed_selection( + ["A", "B", "C"], + baseline, + overlay, + overlay_scope_hash="scope_current", + selection_scope_hash="scope_current", + ) + + assert missed == {"C"} + assert breakdown["missed_base"] == {"B", "C"} + assert breakdown["overlay_executed"] == {"B"} + assert breakdown["overlay_scope_hash"] == "scope_current" + assert breakdown["overlay_scope_matches_current"] is True + assert "overlay_ignored_reason" not in breakdown + + +def test_only_missed_exposes_tag_match_flag_even_when_overlay_tag_missing() -> None: + baseline = {"A": _mk_result("A", "ok")} + overlay = {"B": _mk_result("B", "ok")} + + _, breakdown = _only_missed_selection( + ["A", "B"], + baseline, + overlay, + selection_tag="current-tag", + ) + + assert breakdown["overlay_executed"] == {"B"} + assert "overlay_tag_matches_current" in breakdown + assert breakdown["overlay_tag_matches_current"] is None + + def test_only_failed_strict_scope_ignores_overlay_pass(tmp_path: Path) -> None: baseline = {"A": _mk_result("A", "failed")} overlay = {"A": _mk_result("A", "ok")} From 33816036fb0de0c3d6ede9e3339cbd1d89c6efd0 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 16:16:31 +0300 Subject: [PATCH 86/92] ruff fixes --- examples/demo_qa/chat_repl.py | 9 ++++++++- examples/demo_qa/data_gen.py | 7 ++++++- examples/demo_qa/llm/openai_adapter.py | 2 +- examples/demo_qa/runs/effective.py | 2 +- examples/demo_qa/schema_io.py | 10 +++++++--- examples/demo_qa/settings.py | 3 ++- examples/retail_orders/demo_agent.py | 2 +- examples/retail_orders/demo_agent_sql.py | 2 +- examples/retail_orders/schema.py | 1 - 9 files changed, 27 insertions(+), 11 deletions(-) diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 2ebd14c..5d3feb6 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -8,7 +8,14 @@ from 
typing import Optional, Sequence from .provider_factory import build_provider -from .runner import Case, EventLogger, RunArtifacts, build_agent, run_one, save_artifacts +from .runner import ( + Case, + EventLogger, + RunArtifacts, + build_agent, + run_one, + save_artifacts, +) def _load_json(path: Path) -> object | None: diff --git a/examples/demo_qa/data_gen.py b/examples/demo_qa/data_gen.py index 4732ee1..3faf0db 100644 --- a/examples/demo_qa/data_gen.py +++ b/examples/demo_qa/data_gen.py @@ -9,7 +9,12 @@ import pandas as pd -from fetchgraph.relational.schema import ColumnConfig, EntityConfig, RelationConfig, SchemaConfig +from fetchgraph.relational.schema import ( + ColumnConfig, + EntityConfig, + RelationConfig, + SchemaConfig, +) @dataclass diff --git a/examples/demo_qa/llm/openai_adapter.py b/examples/demo_qa/llm/openai_adapter.py index 810c752..e298a1f 100644 --- a/examples/demo_qa/llm/openai_adapter.py +++ b/examples/demo_qa/llm/openai_adapter.py @@ -1,7 +1,7 @@ from __future__ import annotations -import os import logging +import os from typing import Any, Dict, Tuple from urllib.parse import urlparse diff --git a/examples/demo_qa/runs/effective.py b/examples/demo_qa/runs/effective.py index ca2457d..444f086 100644 --- a/examples/demo_qa/runs/effective.py +++ b/examples/demo_qa/runs/effective.py @@ -8,8 +8,8 @@ from ..runner import RunResult, bad_statuses, load_results, summarize from ..utils import dump_json from .coverage import _missed_case_ids -from .layout import _effective_paths from .io import write_results +from .layout import _effective_paths def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: diff --git a/examples/demo_qa/schema_io.py b/examples/demo_qa/schema_io.py index 3525ad3..febcd70 100644 --- a/examples/demo_qa/schema_io.py +++ b/examples/demo_qa/schema_io.py @@ -1,11 +1,15 @@ from __future__ import annotations +import json from pathlib import Path from typing import Any, Dict -import json - -from fetchgraph.relational.schema import ColumnConfig, EntityConfig, RelationConfig, SchemaConfig +from fetchgraph.relational.schema import ( + ColumnConfig, + EntityConfig, + RelationConfig, + SchemaConfig, +) def _entity_from_dict(data: Dict[str, Any]) -> EntityConfig: diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index be3f03a..3d2e34e 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -8,8 +8,9 @@ from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator try: - from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings.sources import TomlConfigSettingsSource + + from pydantic_settings import BaseSettings, SettingsConfigDict except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. " diff --git a/examples/retail_orders/demo_agent.py b/examples/retail_orders/demo_agent.py index 5ed9b29..20b96a0 100644 --- a/examples/retail_orders/demo_agent.py +++ b/examples/retail_orders/demo_agent.py @@ -8,7 +8,7 @@ from fetchgraph.core import TaskProfile, create_generic_agent from fetchgraph.relational.schema import SchemaConfig # только для типа, не обязательно -from .schema import build_retail_provider, RETAIL_SCHEMA +from .schema import RETAIL_SCHEMA, build_retail_provider # Простейшая заглушка LLM, чтобы пример запускался без внешних зависимостей. 
diff --git a/examples/retail_orders/demo_agent_sql.py b/examples/retail_orders/demo_agent_sql.py index 5fdeab1..1014933 100644 --- a/examples/retail_orders/demo_agent_sql.py +++ b/examples/retail_orders/demo_agent_sql.py @@ -1,7 +1,7 @@ from __future__ import annotations -from pathlib import Path import sqlite3 +from pathlib import Path import pandas as pd diff --git a/examples/retail_orders/schema.py b/examples/retail_orders/schema.py index b61297b..46d434f 100644 --- a/examples/retail_orders/schema.py +++ b/examples/retail_orders/schema.py @@ -11,7 +11,6 @@ build_pandas_provider_from_schema, ) - RETAIL_SCHEMA = SchemaConfig( name="retail_orders", label="Интернет-магазин: клиенты, заказы и товары", From b71662f440f341912cf7016a4b0d1d267c3ebe7a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:31:36 +0300 Subject: [PATCH 87/92] Capture resolved demo_qa config metadata --- examples/demo_qa/batch.py | 12 +-- examples/demo_qa/settings.py | 4 +- tests/test_demo_qa_batch.py | 116 ++++++++++++++++++++++++- tests/test_demo_qa_settings.py | 9 +- tests/test_demo_qa_settings_sources.py | 3 +- 5 files changed, 131 insertions(+), 13 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 2877485..79e1ec1 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -429,7 +429,7 @@ def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: def handle_chat(args) -> int: try: - settings = load_settings(config_path=args.config, data_dir=args.data) + settings, _ = load_settings(config_path=args.config, data_dir=args.data) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 @@ -605,10 +605,10 @@ def handle_batch(args) -> int: data_dir = Path(args.data) schema_path = Path(args.schema) cases_path = Path(args.cases) - config_path = Path(args.config) if args.config else None + cli_config_path = Path(args.config) if args.config else None try: - settings = load_settings(config_path=config_path, data_dir=data_dir) + settings, resolved_config_path = load_settings(config_path=cli_config_path, data_dir=data_dir) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 @@ -1047,7 +1047,7 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) - config_hash = _hash_file(config_path) if config_path else None + config_hash = _hash_file(resolved_config_path) if resolved_config_path else None schema_hash = _hash_file(schema_path) data_fingerprint = _fingerprint_dir(data_dir, verbose=args.fingerprint_verbose) git_sha = _git_sha() @@ -1060,7 +1060,7 @@ def handle_batch(args) -> int: "inputs": { "cases_path": str(cases_path), "cases_hash": cases_hash, - "config_path": str(config_path) if config_path else None, + "config_path": str(resolved_config_path) if resolved_config_path else None, "config_hash": config_hash, "schema_path": str(schema_path), "schema_hash": schema_hash, @@ -1230,7 +1230,7 @@ def handle_batch(args) -> int: def handle_case_run(args) -> int: try: - settings = load_settings(config_path=args.config, data_dir=args.data) + settings, _ = load_settings(config_path=args.config, data_dir=args.data) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 3d2e34e..e471728 100644 --- 
a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -118,7 +118,7 @@ def load_settings( config_path: Path | None = None, data_dir: Path | None = None, overrides: Dict[str, Any] | None = None, -) -> DemoQASettings: +) -> tuple[DemoQASettings, Path | None]: resolved = resolve_config_path(config_path, data_dir) DemoQASettings._toml_path = resolved try: @@ -127,7 +127,7 @@ def load_settings( DemoQASettings._toml_path = None raise DemoQASettings._toml_path = None - return settings + return settings, resolved __all__ = ["DemoQASettings", "LLMSettings", "resolve_config_path", "load_settings"] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 0d625d9..c46f589 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -5,10 +5,13 @@ import os import time from pathlib import Path +from types import SimpleNamespace from typing import cast import pytest +import examples.demo_qa.batch as batch +from examples.demo_qa.cli import build_parser from examples.demo_qa.batch import ( _consecutive_passes, _fingerprint_dir, @@ -21,7 +24,7 @@ render_markdown, write_results, ) -from examples.demo_qa.runner import DiffReport, RunResult, diff_runs +from examples.demo_qa.runner import DiffReport, RunResult, RunTimings, diff_runs from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers @@ -436,3 +439,114 @@ def test_format_healed_explain_includes_key_lines() -> None: lines = _format_healed_explain(healed, healed_details, anti_flake_passes=2, limit=2) assert any("Healed because last 2 results are PASS for case a" in line for line in lines) assert any("run_id=r2" in line and "status=ok" in line for line in lines) + + +def _stubbed_run_one(case, runner, artifacts_root, *, plan_only=False, event_logger=None): + run_dir = artifacts_root / f"{case.id}_stub" + run_dir.mkdir(parents=True, exist_ok=True) + return RunResult( + id=case.id, + question=case.question, + status="ok", + checked=case.has_asserts, + reason=None, + details=None, + artifacts_dir=str(run_dir), + duration_ms=1, + tags=list(case.tags), + answer="ok", + error=None, + plan_path=str(run_dir / "plan.json"), + timings=RunTimings(), + expected_check=None, + ) + + +def _prepare_batch_inputs(tmp_path: Path) -> tuple[Path, Path, Path, Path]: + data_dir = tmp_path / "data" + data_dir.mkdir(exist_ok=True) + schema_path = tmp_path / "schema.json" + schema_path.write_text("{}", encoding="utf-8") + cases_path = tmp_path / "cases.jsonl" + cases_path.write_text('[{"id":"c1","question":"Q?"}]', encoding="utf-8") + artifacts_dir = tmp_path / "artifacts" + return data_dir, schema_path, cases_path, artifacts_dir + + +def _run_batch_and_meta( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + *, + cli_config: Path | None = None, + env_api_key: str | None = None, +) -> dict: + data_dir, schema_path, cases_path, artifacts_dir = _prepare_batch_inputs(tmp_path) + + monkeypatch.setattr( + batch, + "build_provider", + lambda data_dir, schema_path, enable_semantic=False, embedding_model=None: (SimpleNamespace(name="dummy"), None), + ) + monkeypatch.setattr(batch, "build_llm", lambda settings: SimpleNamespace()) + monkeypatch.setattr(batch, "build_agent", lambda llm, provider: SimpleNamespace()) + monkeypatch.setattr(batch, "run_one", _stubbed_run_one) + monkeypatch.setattr(batch, "configure_logging", lambda **kwargs: None) + + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + if env_api_key is not None: + 
monkeypatch.setenv("OPENAI_API_KEY", env_api_key) + + args_list = [ + "batch", + "--data", + str(data_dir), + "--schema", + str(schema_path), + "--cases", + str(cases_path), + "--artifacts-dir", + str(artifacts_dir), + "--events", + "off", + "--quiet", + ] + if cli_config is not None: + args_list.extend(["--config", str(cli_config)]) + args = build_parser().parse_args(args_list) + exit_code = batch.handle_batch(args) + assert exit_code == 0 + + run_meta_path = next((artifacts_dir / "runs").rglob("run_meta.json")) + return json.loads(run_meta_path.read_text(encoding="utf-8")) + + +def test_default_config_is_discovered_and_hashed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + data_dir, _, _, _ = _prepare_batch_inputs(tmp_path) + default_config = data_dir / "demo_qa.toml" + default_config.write_text('[llm]\napi_key="sk-default"\n', encoding="utf-8") + + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key=None) + + assert run_meta["inputs"]["config_path"] == str(default_config) + assert run_meta["inputs"]["config_hash"] == batch._hash_file(default_config) + + +def test_explicit_config_path_wins(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + data_dir, _, _, _ = _prepare_batch_inputs(tmp_path) + # Default config exists but should be ignored when CLI is provided. + default_config = data_dir / "demo_qa.toml" + default_config.write_text('[llm]\napi_key="sk-default"\n', encoding="utf-8") + explicit_config = tmp_path / "custom.toml" + explicit_config.write_text('[llm]\napi_key="sk-explicit"\n', encoding="utf-8") + + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, cli_config=explicit_config, env_api_key=None) + + assert run_meta["inputs"]["config_path"] == str(explicit_config) + assert run_meta["inputs"]["config_hash"] == batch._hash_file(explicit_config) + + +def test_no_config_available_sets_none_in_meta(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key="sk-env") + + assert run_meta["inputs"]["config_path"] is None + assert run_meta["inputs"]["config_hash"] is None diff --git a/tests/test_demo_qa_settings.py b/tests/test_demo_qa_settings.py index 16a1f30..c7ced7b 100644 --- a/tests/test_demo_qa_settings.py +++ b/tests/test_demo_qa_settings.py @@ -29,7 +29,8 @@ def test_env_overrides_toml(tmp_path, monkeypatch): monkeypatch.setenv("DEMO_QA_LLM__API_KEY", "sk-from-env") monkeypatch.setenv("DEMO_QA_LLM__PLAN_MODEL", "env-plan") - settings = load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path assert settings.llm.api_key == "sk-from-env" assert settings.llm.base_url == "http://localhost:1234/v1" assert settings.llm.plan_model == "env-plan" @@ -61,7 +62,8 @@ def test_openai_key_from_global_env(tmp_path, monkeypatch): ) monkeypatch.setenv("OPENAI_API_KEY", "sk-global") - settings = load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path assert settings.llm.api_key == "sk-global" @@ -96,7 +98,8 @@ def _store_and_return(kwargs): monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=FakeOpenAI)) - settings = load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path llm = build_llm(settings) result = llm("hello", sender="generic_plan") diff --git a/tests/test_demo_qa_settings_sources.py b/tests/test_demo_qa_settings_sources.py index 
f828329..c8ec38d 100644 --- a/tests/test_demo_qa_settings_sources.py +++ b/tests/test_demo_qa_settings_sources.py @@ -25,8 +25,9 @@ def test_source_priorities(tmp_path, monkeypatch): monkeypatch.setenv("DEMO_QA_LLM__API_KEY", "sk-env") monkeypatch.setenv("DEMO_QA_LLM__PLAN_MODEL", "env-plan") - settings = load_settings(config_path=config_path, overrides={"llm": {"plan_model": "override-plan"}}) + settings, resolved = load_settings(config_path=config_path, overrides={"llm": {"plan_model": "override-plan"}}) + assert resolved == config_path assert settings.llm.api_key == "sk-env" assert settings.llm.plan_model == "override-plan" assert settings.llm.synth_model == "toml-synth" From 066228a874e79dabc4e7b8483eca81dbf1eadf3d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:40:20 +0300 Subject: [PATCH 88/92] Fix demo QA batch test config inputs --- tests/test_demo_qa_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index c46f589..031d7b1 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -468,7 +468,7 @@ def _prepare_batch_inputs(tmp_path: Path) -> tuple[Path, Path, Path, Path]: schema_path = tmp_path / "schema.json" schema_path.write_text("{}", encoding="utf-8") cases_path = tmp_path / "cases.jsonl" - cases_path.write_text('[{"id":"c1","question":"Q?"}]', encoding="utf-8") + cases_path.write_text('{"id":"c1","question":"Q?"}\n', encoding="utf-8") artifacts_dir = tmp_path / "artifacts" return data_dir, schema_path, cases_path, artifacts_dir From 98879f0f22299c4a8be989019a3d394cbb885ab7 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:44:01 +0300 Subject: [PATCH 89/92] Write demo QA batch test case in jsonl format From b6796b8ed007b58e3a5d7026c86b5596cdf341a7 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:51:11 +0300 Subject: [PATCH 90/92] Expect packaged default demo QA config --- .gitignore | 1 + examples/demo_qa/demo_qa.toml | 9 +++++++++ tests/test_demo_qa_batch.py | 10 +++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 examples/demo_qa/demo_qa.toml diff --git a/.gitignore b/.gitignore index 2ca3c0a..4a87019 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,6 @@ build/ .pytest_cache/ examples/demo_qa/demo_qa.toml **/demo_qa.toml +!examples/demo_qa/demo_qa.toml .env.demo_qa _demo_data/*/.runs/* diff --git a/examples/demo_qa/demo_qa.toml b/examples/demo_qa/demo_qa.toml new file mode 100644 index 0000000..7dd49e9 --- /dev/null +++ b/examples/demo_qa/demo_qa.toml @@ -0,0 +1,9 @@ +[llm] +api_key = "unused" +base_url = "http://localhost:8000/v1" +plan_model = "gpt-4o-mini" +synth_model = "gpt-4o-mini" +plan_temperature = 0.0 +synth_temperature = 0.2 +timeout_s = 900 +retries = 2 diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 031d7b1..692cd85 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -545,8 +545,12 @@ def test_explicit_config_path_wins(tmp_path: Path, monkeypatch: pytest.MonkeyPat assert run_meta["inputs"]["config_hash"] == batch._hash_file(explicit_config) -def test_no_config_available_sets_none_in_meta(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_packaged_default_config_used_when_no_cli_or_data_dir(tmp_path: Path, 
monkeypatch: pytest.MonkeyPatch) -> None: run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key="sk-env") - assert run_meta["inputs"]["config_path"] is None - assert run_meta["inputs"]["config_hash"] is None + config_path = run_meta["inputs"]["config_path"] + assert config_path is not None + + expected_default = Path(batch.__file__).resolve().parent / "demo_qa.toml" + assert Path(config_path) == expected_default + assert run_meta["inputs"]["config_hash"] == batch._hash_file(expected_default) From f5ad53920543563131fcbadc38cd991455484e15 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 19:30:53 +0300 Subject: [PATCH 91/92] Make api_key optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 - README_demo_qa.md | 4 +- examples/demo_qa/demo_qa.toml | 9 --- examples/demo_qa/demo_qa.toml.example | 5 +- examples/demo_qa/llm/openai_adapter.py | 18 ++--- examples/demo_qa/settings.py | 15 +--- tests/test_demo_qa_batch.py | 4 +- tests/test_demo_qa_settings.py | 96 ++++++++++++++++++++------ 8 files changed, 94 insertions(+), 59 deletions(-) delete mode 100644 examples/demo_qa/demo_qa.toml diff --git a/.gitignore b/.gitignore index 4a87019..c874fb3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,6 @@ __pycache__/ dist/ build/ .pytest_cache/ -examples/demo_qa/demo_qa.toml **/demo_qa.toml -!examples/demo_qa/demo_qa.toml .env.demo_qa _demo_data/*/.runs/* diff --git a/README_demo_qa.md b/README_demo_qa.md index 8a8cc07..3b453c6 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -17,6 +17,7 @@ python -m examples.demo_qa.cli gen --out demo_data --rows 1000 --seed 42 ### Файл demo_qa.toml См. шаблон `examples/demo_qa/demo_qa.toml.example`. Автопоиск: `--config`, затем `/demo_qa.toml`, затем `examples/demo_qa/demo_qa.toml`. +`llm.api_key` можно опустить: при инициализации LLM используется `OPENAI_API_KEY`, а при его отсутствии — строка `"unused"`. ### .env.demo_qa Пример: @@ -30,6 +31,7 @@ DEMO_QA_LLM__BASE_URL=http://localhost:8000/v1 export DEMO_QA_LLM__API_KEY=sk-... export DEMO_QA_LLM__BASE_URL=http://localhost:8000/v1 ``` +Если не задавать `DEMO_QA_LLM__API_KEY` и не выставлять `OPENAI_API_KEY`, LLM-клиент подставит `"unused"` и не упадёт. ### Зависимости демо * Требуется Python 3.11+ (используется стандартный `tomllib`). @@ -44,7 +46,7 @@ pip install -r examples/demo_qa/requirements.txt ### OpenAI / совместимый прокси 1. Скопируйте `examples/demo_qa/demo_qa.toml.example` в удобное место и укажите - `llm.api_key` (можно `env:OPENAI_API_KEY` или любое значение, если прокси не проверяет ключ), + при необходимости `llm.api_key` (можно `env:OPENAI_API_KEY`; если не указать, возьмётся `OPENAI_API_KEY` или `"unused"`), `base_url` (формат `http://host:port/v1`), модели и температуры. 2.
Запустите чат с указанием конфига: ```bash diff --git a/examples/demo_qa/demo_qa.toml b/examples/demo_qa/demo_qa.toml deleted file mode 100644 index 7dd49e9..0000000 --- a/examples/demo_qa/demo_qa.toml +++ /dev/null @@ -1,9 +0,0 @@ -[llm] -api_key = "unused" -base_url = "http://localhost:8000/v1" -plan_model = "gpt-4o-mini" -synth_model = "gpt-4o-mini" -plan_temperature = 0.0 -synth_temperature = 0.2 -timeout_s = 900 -retries = 2 diff --git a/examples/demo_qa/demo_qa.toml.example b/examples/demo_qa/demo_qa.toml.example index 7dd49e9..4b12c2c 100644 --- a/examples/demo_qa/demo_qa.toml.example +++ b/examples/demo_qa/demo_qa.toml.example @@ -1,8 +1,7 @@ [llm] -api_key = "unused" base_url = "http://localhost:8000/v1" -plan_model = "gpt-4o-mini" -synth_model = "gpt-4o-mini" +plan_model = "default" +synth_model = "default" plan_temperature = 0.0 synth_temperature = 0.2 timeout_s = 900 diff --git a/examples/demo_qa/llm/openai_adapter.py b/examples/demo_qa/llm/openai_adapter.py index e298a1f..80bc7af 100644 --- a/examples/demo_qa/llm/openai_adapter.py +++ b/examples/demo_qa/llm/openai_adapter.py @@ -46,15 +46,15 @@ def __init__( self.logger.info("OpenAILLM using endpoint %s", endpoint) def _resolve_api_key(self, api_key: str | None) -> str: - if api_key is None: - raise RuntimeError("OpenAI provider selected but llm.api_key is missing.") - if api_key.startswith("env:"): - env_var = api_key.split(":", 1)[1] - value = os.getenv(env_var) - if not value: - raise RuntimeError(f"Environment variable {env_var} referenced in config but not set.") - return value - return api_key + if api_key: + if api_key.startswith("env:"): + env_var = api_key.split(":", 1)[1] + value = os.getenv(env_var) + return value or "unused" + return api_key + + env_key = os.getenv("OPENAI_API_KEY") + return env_key or "unused" def _validate_base_url(self, base_url: str | None) -> str | None: if base_url in (None, ""): diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index e471728..546cfb4 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os from pathlib import Path from typing import Any, ClassVar, Dict from urllib.parse import urlparse @@ -17,6 +16,7 @@ "Install demo extras via `pip install -e .[demo]` or `pip install -r examples/demo_qa/requirements.txt`." ) from exc + class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) @@ -87,16 +87,6 @@ def settings_customise_sources( sources.append(file_secret_settings) return tuple(sources) - @model_validator(mode="after") - def require_api_key(self) -> "DemoQASettings": - if not self.llm.api_key: - env_key = os.getenv("OPENAI_API_KEY") - if env_key: - self.llm.api_key = env_key - if not self.llm.api_key: - raise ValueError("llm.api_key is required. 
Provide it in config or set OPENAI_API_KEY.") - return self - def resolve_config_path(config: Path | None, data_dir: Path | None) -> Path | None: if config is not None: @@ -107,7 +97,8 @@ def resolve_config_path(config: Path | None, data_dir: Path | None) -> Path | No candidate = data_dir / "demo_qa.toml" if candidate.exists(): return candidate - default = Path(__file__).resolve().parent / "demo_qa.toml" + root = Path(__file__).resolve().parent + default = root / "demo_qa.toml" if default.exists(): return default return None diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 692cd85..fa163a8 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -11,7 +11,6 @@ import pytest import examples.demo_qa.batch as batch -from examples.demo_qa.cli import build_parser from examples.demo_qa.batch import ( _consecutive_passes, _fingerprint_dir, @@ -24,6 +23,7 @@ render_markdown, write_results, ) +from examples.demo_qa.cli import build_parser from examples.demo_qa.runner import DiffReport, RunResult, RunTimings, diff_runs from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers @@ -546,7 +546,7 @@ def test_explicit_config_path_wins(tmp_path: Path, monkeypatch: pytest.MonkeyPat def test_packaged_default_config_used_when_no_cli_or_data_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key="sk-env") + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key=None) config_path = run_meta["inputs"]["config_path"] assert config_path is not None diff --git a/tests/test_demo_qa_settings.py b/tests/test_demo_qa_settings.py index c7ced7b..bd0f9f3 100644 --- a/tests/test_demo_qa_settings.py +++ b/tests/test_demo_qa_settings.py @@ -4,8 +4,6 @@ from pathlib import Path from types import SimpleNamespace -import pytest - from examples.demo_qa.llm.factory import build_llm from examples.demo_qa.llm.openai_adapter import OpenAILLM from examples.demo_qa.settings import load_settings @@ -15,6 +13,20 @@ def write_toml(path: Path, content: str) -> None: path.write_text(content, encoding="utf-8") +def _install_fake_openai(monkeypatch, created: dict): + def _store_and_return(kwargs): + created["chat_kwargs"] = kwargs + return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content="ok"))]) + + class FakeOpenAI: + def __init__(self, api_key=None, base_url=None, **kwargs): + created["api_key"] = api_key + created["base_url"] = base_url + self.chat = SimpleNamespace(completions=SimpleNamespace(create=lambda **kwargs: _store_and_return(kwargs))) + + monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=FakeOpenAI)) + + def test_env_overrides_toml(tmp_path, monkeypatch): config_path = tmp_path / "demo_qa.toml" write_toml( @@ -36,7 +48,9 @@ def test_env_overrides_toml(tmp_path, monkeypatch): assert settings.llm.plan_model == "env-plan" -def test_openai_requires_api_key(tmp_path): + + +def test_allow_missing_api_key_when_disabled(tmp_path): config_path = tmp_path / "demo_qa.toml" write_toml( config_path, @@ -47,8 +61,9 @@ def test_openai_requires_api_key(tmp_path): """, ) - with pytest.raises(ValueError): - load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path + assert settings.llm.api_key is None def test_openai_key_from_global_env(tmp_path, monkeypatch): @@ -61,10 +76,15 @@ def 
test_openai_key_from_global_env(tmp_path, monkeypatch): """, ) monkeypatch.setenv("OPENAI_API_KEY", "sk-global") + created = {} + _install_fake_openai(monkeypatch, created) settings, resolved = load_settings(config_path=config_path) assert resolved == config_path - assert settings.llm.api_key == "sk-global" + llm = build_llm(settings) + + llm("hello", sender="generic_plan") + assert created["api_key"] == "sk-global" def test_base_url_passed_to_openai_client(tmp_path, monkeypatch): @@ -82,21 +102,7 @@ def test_base_url_passed_to_openai_client(tmp_path, monkeypatch): created = {} - class FakeOpenAI: - def __init__(self, api_key=None, base_url=None, **kwargs): - created["api_key"] = api_key - created["base_url"] = base_url - self.chat = SimpleNamespace( - completions=SimpleNamespace( - create=lambda **kwargs: _store_and_return(kwargs) - ) - ) - - def _store_and_return(kwargs): - created["chat_kwargs"] = kwargs - return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content="ok"))]) - - monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=FakeOpenAI)) + _install_fake_openai(monkeypatch, created) settings, resolved = load_settings(config_path=config_path) assert resolved == config_path @@ -193,3 +199,51 @@ def with_options(self, **kwargs): "messages": [{"role": "user", "content": "question"}], "temperature": 0.2, } + + +def test_missing_api_key_uses_unused(monkeypatch): + created: dict = {} + _install_fake_openai(monkeypatch, created) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + llm = OpenAILLM( + api_key=None, + base_url=None, + plan_model="demo-plan", + synth_model="demo-synth", + ) + llm("hello", sender="generic_plan") + + assert created["api_key"] == "unused" + + +def test_env_reference_uses_openai_api_key(monkeypatch): + created: dict = {} + _install_fake_openai(monkeypatch, created) + monkeypatch.setenv("OPENAI_API_KEY", "sk-env") + + llm = OpenAILLM( + api_key="env:OPENAI_API_KEY", + base_url=None, + plan_model="demo-plan", + synth_model="demo-synth", + ) + llm("hello", sender="generic_plan") + + assert created["api_key"] == "sk-env" + + +def test_env_reference_defaults_to_unused_when_missing(monkeypatch): + created: dict = {} + _install_fake_openai(monkeypatch, created) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + llm = OpenAILLM( + api_key="env:OPENAI_API_KEY", + base_url=None, + plan_model="demo-plan", + synth_model="demo-synth", + ) + llm("hello", sender="generic_plan") + + assert created["api_key"] == "unused" From a93bd7c7d31dbaf27bc8b89f4ea5e822a95b7a04 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 19:47:52 +0300 Subject: [PATCH 92/92] demo_qa.toml --- examples/demo_qa/demo_qa.toml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/demo_qa/demo_qa.toml diff --git a/examples/demo_qa/demo_qa.toml b/examples/demo_qa/demo_qa.toml new file mode 100644 index 0000000..a1d9ede --- /dev/null +++ b/examples/demo_qa/demo_qa.toml @@ -0,0 +1,8 @@ +[llm] +base_url = "http://localhost:8000/v1" +plan_model = "default" +synth_model = "default" +plan_temperature = 0.0 +synth_temperature = 0.2 +timeout_s = 900 +retries = 2 \ No newline at end of file
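
The change that makes `load_settings` return a `(settings, resolved_config_path)` tuple is easiest to see at a call site. A minimal sketch, assuming the demo extras are installed and the script runs from the repository root; `demo_data` is only a placeholder directory name:

```python
from pathlib import Path

from examples.demo_qa.settings import load_settings

# The second tuple element is the demo_qa.toml that was actually used (or None);
# this is the path the batch code can record and hash into run_meta.json.
settings, config_path = load_settings(config_path=None, data_dir=Path("demo_data"))

print("resolved config:", config_path)
print("plan model:", settings.llm.plan_model)
```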
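
The settings tests rely on the `DEMO_QA_` prefix and the `__` nested delimiter for environment overrides. A standalone sketch of that naming convention with pydantic-settings; `DemoSettingsSketch` and its fields are illustrative stand-ins, not the real `DemoQASettings`, which also layers in the TOML source:

```python
import os

from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class LLMSketch(BaseModel):
    api_key: str | None = None
    plan_model: str = "default"


class DemoSettingsSketch(BaseSettings):
    # DEMO_QA_ prefix plus "__" between nesting levels gives variable names like
    # DEMO_QA_LLM__PLAN_MODEL, matching the ones set in the tests above.
    model_config = SettingsConfigDict(env_prefix="DEMO_QA_", env_nested_delimiter="__")
    llm: LLMSketch = Field(default_factory=LLMSketch)


os.environ["DEMO_QA_LLM__PLAN_MODEL"] = "env-plan"
assert DemoSettingsSketch().llm.plan_model == "env-plan"
```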
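
PATCH 88/92 and 89/92 exist because the test fixture originally wrote a JSON array into `cases.jsonl`; the file is JSON Lines, one object per line. A small sketch of writing and reading such a file; the case ids and questions here are made up:

```python
import json
from pathlib import Path

cases = [
    {"id": "c1", "question": "Q?"},
    {"id": "c2", "question": "Another question?"},
]

path = Path("cases.jsonl")
# One JSON object per line, no surrounding array and no trailing commas.
path.write_text("".join(json.dumps(c, ensure_ascii=False) + "\n" for c in cases), encoding="utf-8")

# Reading mirrors what a JSONL loader has to do: parse line by line, skipping blanks.
loaded = [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()]
assert [c["id"] for c in loaded] == ["c1", "c2"]
```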
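
PATCH 90/92 and the final PATCH 92/92 hinge on the `demo_qa.toml` lookup order in `resolve_config_path`: an explicit `--config` path, then `demo_qa.toml` inside the data directory, then the packaged `examples/demo_qa/demo_qa.toml`. A standalone sketch of that order; the `package_dir` parameter replaces the real function's use of `Path(__file__)`:

```python
from pathlib import Path


def resolve_config_sketch(config: Path | None, data_dir: Path | None, package_dir: Path) -> Path | None:
    if config is not None:
        # An explicit --config always wins, even when a default config also exists.
        return config
    if data_dir is not None:
        candidate = data_dir / "demo_qa.toml"
        if candidate.exists():
            return candidate
    # Fall back to the config shipped next to the demo package, if any.
    default = package_dir / "demo_qa.toml"
    return default if default.exists() else None
```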
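
`run_meta.json` stores both `config_path` and `config_hash`, the latter produced by `batch._hash_file`. The patches do not show the hashing algorithm, so the SHA-256 implementation below is only an assumption about how such a helper could look:

```python
import hashlib
from pathlib import Path


def hash_file_sketch(path: Path) -> str:
    # Assumed behaviour: digest over the raw file bytes, streamed in chunks.
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        for chunk in iter(lambda: fh.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


# e.g. comparing a recorded hash against a freshly computed one:
# assert run_meta["inputs"]["config_hash"] == hash_file_sketch(Path(run_meta["inputs"]["config_path"]))
```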
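
PATCH 91/92 changes `_resolve_api_key` so that a missing key no longer raises. The fallback chain, extracted into a standalone function for illustration (the real code lives inside `OpenAILLM`):

```python
import os


def resolve_api_key_sketch(api_key: str | None) -> str:
    if api_key:
        if api_key.startswith("env:"):
            # "env:SOME_VAR" defers to that environment variable; an unset
            # variable now degrades to "unused" instead of raising.
            env_var = api_key.split(":", 1)[1]
            return os.getenv(env_var) or "unused"
        return api_key
    # No key in the config: try OPENAI_API_KEY, then fall back to the
    # "unused" placeholder that key-agnostic local proxies accept.
    return os.getenv("OPENAI_API_KEY") or "unused"
```

This is the behaviour the tests `test_missing_api_key_uses_unused`, `test_env_reference_uses_openai_api_key` and `test_env_reference_defaults_to_unused_when_missing` pin down.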