From 0f079de4e76fe2c1783d33c5ec85eb0cc732a7c0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 14:56:44 +0300 Subject: [PATCH 01/92] Delete unused LLM cache module --- README_demo_qa.md | 15 ++ examples/demo_qa/chat_repl.py | 92 +---------- examples/demo_qa/cli.py | 190 +++++++++++++++++----- examples/demo_qa/runner.py | 296 ++++++++++++++++++++++++++++++++++ 4 files changed, 469 insertions(+), 124 deletions(-) create mode 100644 examples/demo_qa/runner.py diff --git a/README_demo_qa.md b/README_demo_qa.md index 86c9b0d..006da3f 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -52,6 +52,21 @@ python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.y Флаг `--enable-semantic` строит семантический индекс, если передана модель эмбеддингов. +## Batch + +Запустить пакетный прогон вопросов из `cases.jsonl` (по одному JSON на строку, поля `id`, `question`, опционально `expected`/`expected_regex`/`expected_contains` и `skip`): + +```bash +python -m examples.demo_qa.cli batch \ + --data demo_data \ + --schema demo_data/schema.yaml \ + --cases cases.jsonl \ + --out results.jsonl +``` + +* Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). +* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов. +* Флаги `--fail-on (error|mismatch|any)`, `--max-fails` и `--fail-fast` управляют остановкой и кодом выхода (0/1/2). ## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 10f8b74..678a78b 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -1,92 +1,16 @@ from __future__ import annotations import datetime -import json import sys import uuid -from dataclasses import dataclass from pathlib import Path -from typing import Callable, Dict, Optional, Sequence +from typing import Optional, Sequence import readline - -from fetchgraph.core import create_generic_agent -from fetchgraph.core.models import TaskProfile -from fetchgraph.utils import set_run_id +import json from .provider_factory import build_provider - - -@dataclass -class RunArtifacts: - run_id: str - run_dir: Path - plan: str | None = None - context: Dict[str, object] | None = None - answer: str | None = None - error: str | None = None - - -def build_agent(llm, provider) -> Callable[[str, str, Path], RunArtifacts]: - def saver(feature_name: str, parsed: object) -> None: - # Placeholder to satisfy BaseGraphAgent.saver; artifacts captured elsewhere. 
- return None - - task_profile = TaskProfile( - task_name="Demo QA", - goal="Answer analytics questions over the demo dataset", - output_format="Plain text answer", - focus_hints=[ - "Prefer aggregates", - "Use concise answers", - ], - ) - - agent = create_generic_agent( - llm_invoke=llm, - providers={provider.name: provider}, - saver=saver, - task_profile=task_profile, - ) - - def run_question(question: str, run_id: str, run_dir: Path) -> RunArtifacts: - set_run_id(run_id) - artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir) - plan = agent._plan(question) # type: ignore[attr-defined] - artifacts.plan = json.dumps(plan.model_dump(), ensure_ascii=False, indent=2) - try: - ctx = agent._fetch(question, plan) # type: ignore[attr-defined] - artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} - except Exception as exc: # pragma: no cover - demo fallback - artifacts.error = str(exc) - artifacts.context = {"error": str(exc)} - ctx = None - draft = agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] - parsed = agent.domain_parser(draft) - artifacts.answer = str(parsed) - return artifacts - - return run_question - - -def _save_text(path: Path, content: str) -> None: - path.write_text(content, encoding="utf-8") - - -def _save_json(path: Path, payload: object) -> None: - path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") - - -def _save_artifacts(artifacts: RunArtifacts) -> None: - artifacts.run_dir.mkdir(parents=True, exist_ok=True) - if artifacts.plan is not None: - _save_text(artifacts.run_dir / "plan.json", artifacts.plan) - if artifacts.context is not None: - _save_json(artifacts.run_dir / "context.json", artifacts.context) - if artifacts.answer is not None: - _save_text(artifacts.run_dir / "answer.txt", artifacts.answer) - if artifacts.error is not None: - _save_text(artifacts.run_dir / "error.txt", artifacts.error) +from .runner import RunArtifacts, build_agent, save_artifacts def _maybe_add_history(entry: str) -> None: @@ -173,18 +97,18 @@ def start_repl( artifacts: RunArtifacts | None = None try: - artifacts = runner(line, run_id, run_dir) + artifacts = runner.run_question(line, run_id, run_dir) last_artifacts = artifacts - _save_artifacts(artifacts) + save_artifacts(artifacts) if plan_debug_mode in {"on", "once"} and artifacts.plan: print("--- PLAN ---") - print(artifacts.plan) + print(json.dumps(artifacts.plan, ensure_ascii=False, indent=2)) print(artifacts.answer or "") except Exception as exc: # pragma: no cover - REPL resilience - error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir) + error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts - _save_artifacts(error_artifacts) + save_artifacts(error_artifacts) print(f"Error during run {run_id}: {exc}", file=sys.stderr) finally: if plan_debug_mode == "once": diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 5c02c4c..f5763de 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -1,8 +1,11 @@ from __future__ import annotations import argparse +import datetime +import json import sys from pathlib import Path +from typing import Iterable ROOT = Path(__file__).resolve().parents[2] SRC = ROOT / "src" @@ -13,9 +16,128 @@ from .data_gen import generate_and_save from .llm.factory import build_llm from .logging_config import configure_logging +from .provider_factory import build_provider 
+from .runner import RunResult, build_agent, format_status_line, load_cases, run_one, summarize from .settings import load_settings +def write_results(out_path: Path, results: Iterable[RunResult]) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open("w", encoding="utf-8") as f: + for res in results: + f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") + + +def write_summary(out_path: Path, summary: dict) -> Path: + summary_path = out_path.with_name("summary.json") + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return summary_path + + +def is_failure(status: str, fail_on: str) -> bool: + if fail_on == "error": + return status == "error" + if fail_on == "mismatch": + return status in {"error", "mismatch"} + return status in {"error", "mismatch", "skipped"} + + +def handle_chat(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + + log_dir = args.log_dir or args.data / ".runs" / "logs" + log_file = configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + llm_settings = settings.llm + llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" + diagnostics = [ + f"LLM endpoint: {llm_endpoint}", + f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", + f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", + f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " + f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", + ] + if args.enable_semantic: + diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") + else: + diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") + + llm = build_llm(settings) + + start_repl( + args.data, + args.schema, + llm, + enable_semantic=args.enable_semantic, + log_file=log_file, + diagnostics=diagnostics, + ) + return 0 + + +def handle_batch(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + try: + cases = load_cases(args.cases) + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + + artifacts_dir = args.artifacts_dir + if artifacts_dir is None: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + artifacts_dir = args.data / ".runs" / f"batch_{timestamp}" + artifacts_dir.mkdir(parents=True, exist_ok=True) + + log_dir = args.log_dir or args.data / ".runs" / "logs" + configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + results: list[RunResult] = [] + failures = 0 + for case in cases: + result = run_one(case, runner, artifacts_dir) + results.append(result) + print(format_status_line(result)) + if is_failure(result.status, args.fail_on): + failures += 1 + if args.fail_fast or (args.max_fails and failures >= args.max_fails): + break + + write_results(args.out, results) + summary = summarize(results) + summary_path = write_summary(args.out, 
summary) + print(f"Summary: {json.dumps(summary, ensure_ascii=False)}") + print(f"Results written to: {args.out}") + print(f"Summary written to: {summary_path}") + + failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on)) + return 1 if failure_count else 0 + + def main() -> None: parser = argparse.ArgumentParser(description="Demo QA harness for fetchgraph") sub = parser.add_subparsers(dest="command", required=True) @@ -36,53 +158,41 @@ def main() -> None: chat_p.add_argument("--log-stderr", action="store_true", help="Also stream logs to stderr") chat_p.add_argument("--log-jsonl", action="store_true", help="Write logs as JSONL") + batch_p = sub.add_parser("batch", help="Run a batch of questions from a JSONL file") + batch_p.add_argument("--data", type=Path, required=True) + batch_p.add_argument("--schema", type=Path, required=True) + batch_p.add_argument("--config", type=Path, default=None, help="Path to demo_qa.toml") + batch_p.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") + batch_p.add_argument("--out", type=Path, required=True, help="Path to results jsonl") + batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") + batch_p.add_argument("--enable-semantic", action="store_true") + batch_p.add_argument("--log-level", default="INFO", help="Logging level (INFO, DEBUG, etc.)") + batch_p.add_argument("--log-dir", type=Path, default=None, help="Directory for log files") + batch_p.add_argument("--log-stderr", action="store_true", help="Also stream logs to stderr") + batch_p.add_argument("--log-jsonl", action="store_true", help="Write logs as JSONL") + batch_p.add_argument("--max-fails", type=int, default=None, help="Maximum allowed failures before stopping") + batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") + batch_p.add_argument( + "--fail-on", + choices=["error", "mismatch", "any"], + default="mismatch", + help="Which statuses should cause a failing exit code", + ) + args = parser.parse_args() if args.command == "gen": generate_and_save(args.out, rows=args.rows, seed=args.seed, enable_semantic=args.enable_semantic) print(f"Generated data in {args.out}") - return + raise SystemExit(0) if args.command == "chat": - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - raise SystemExit(f"Configuration error: {exc}") - - log_dir = args.log_dir or args.data / ".runs" / "logs" - log_file = configure_logging( - level=args.log_level, - log_dir=log_dir, - to_stderr=args.log_stderr, - jsonl=args.log_jsonl, - run_id=None, - ) - - llm_settings = settings.llm - llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" - diagnostics = [ - f"LLM endpoint: {llm_endpoint}", - f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", - f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", - f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " - f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", - ] - if args.enable_semantic: - diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") - else: - diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") - - llm = build_llm(settings) - - start_repl( - args.data, - args.schema, - llm, - enable_semantic=args.enable_semantic, - log_file=log_file, - 
diagnostics=diagnostics, - ) - return + code = handle_chat(args) + elif args.command == "batch": + code = handle_batch(args) + else: + code = 0 + raise SystemExit(code) if __name__ == "__main__": diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py new file mode 100644 index 0000000..2fc17af --- /dev/null +++ b/examples/demo_qa/runner.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import json +import re +import statistics +import time +import uuid +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, Iterable, List + +from fetchgraph.core import create_generic_agent +from fetchgraph.core.models import TaskProfile +from fetchgraph.utils import set_run_id + + +@dataclass +class RunTimings: + plan_s: float | None = None + fetch_s: float | None = None + synth_s: float | None = None + total_s: float | None = None + + +@dataclass +class ExpectedCheck: + mode: str + expected: str + passed: bool + detail: str | None = None + + +@dataclass +class RunArtifacts: + run_id: str + run_dir: Path + question: str + plan: Dict[str, object] | None = None + context: Dict[str, object] | None = None + answer: str | None = None + raw_synth: str | None = None + error: str | None = None + timings: RunTimings = field(default_factory=RunTimings) + + +@dataclass +class RunResult: + id: str + question: str + status: str + answer: str | None + error: str | None + plan_path: str | None + artifacts_dir: str + timings: RunTimings + expected_check: ExpectedCheck | None = None + + def to_json(self) -> Dict[str, object]: + payload: Dict[str, object] = { + "id": self.id, + "question": self.question, + "status": self.status, + "answer": self.answer, + "error": self.error, + "plan_path": self.plan_path, + "artifacts_dir": self.artifacts_dir, + "timings": self.timings.__dict__, + } + if self.expected_check: + payload["expected_check"] = self.expected_check.__dict__ + return payload + + +@dataclass +class Case: + id: str + question: str + expected: str | None = None + expected_regex: str | None = None + expected_contains: str | None = None + tags: List[str] = field(default_factory=list) + skip: bool = False + + +class AgentRunner: + def __init__(self, llm, provider) -> None: + def saver(feature_name: str, parsed: object) -> None: + # Placeholder to satisfy BaseGraphAgent.saver; artifacts captured elsewhere. 
+ return None + + task_profile = TaskProfile( + task_name="Demo QA", + goal="Answer analytics questions over the demo dataset", + output_format="Plain text answer", + focus_hints=[ + "Prefer aggregates", + "Use concise answers", + ], + ) + + self.agent = create_generic_agent( + llm_invoke=llm, + providers={provider.name: provider}, + saver=saver, + task_profile=task_profile, + ) + + def run_question(self, question: str, run_id: str, run_dir: Path) -> RunArtifacts: + set_run_id(run_id) + artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question) + + started = time.perf_counter() + try: + plan_started = time.perf_counter() + plan = self.agent._plan(question) # type: ignore[attr-defined] + artifacts.timings.plan_s = time.perf_counter() - plan_started + artifacts.plan = plan.model_dump() + + fetch_started = time.perf_counter() + ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] + artifacts.timings.fetch_s = time.perf_counter() - fetch_started + artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} + + synth_started = time.perf_counter() + draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] + artifacts.timings.synth_s = time.perf_counter() - synth_started + artifacts.raw_synth = str(draft) + parsed = self.agent.domain_parser(draft) + artifacts.answer = str(parsed) + except Exception as exc: # pragma: no cover - demo fallback + artifacts.error = str(exc) + finally: + artifacts.timings.total_s = time.perf_counter() - started + + return artifacts + + +def build_agent(llm, provider) -> AgentRunner: + return AgentRunner(llm, provider) + + +def _save_text(path: Path, content: str) -> None: + path.write_text(content, encoding="utf-8") + + +def _save_json(path: Path, payload: object) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def save_artifacts(artifacts: RunArtifacts) -> None: + artifacts.run_dir.mkdir(parents=True, exist_ok=True) + if artifacts.plan is not None: + _save_json(artifacts.run_dir / "plan.json", artifacts.plan) + if artifacts.context is not None: + _save_json(artifacts.run_dir / "context.json", artifacts.context) + if artifacts.answer is not None: + _save_text(artifacts.run_dir / "answer.txt", artifacts.answer) + if artifacts.raw_synth is not None: + _save_text(artifacts.run_dir / "raw_synth.txt", artifacts.raw_synth) + if artifacts.error is not None: + _save_text(artifacts.run_dir / "error.txt", artifacts.error) + + +def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: + if answer is None: + return ExpectedCheck(mode="none", expected="", passed=False, detail="no answer") + if case.expected is not None: + passed = answer.strip() == case.expected.strip() + detail = None if passed else f"expected={case.expected!r}, got={answer!r}" + return ExpectedCheck(mode="exact", expected=case.expected, passed=passed, detail=detail) + if case.expected_regex is not None: + pattern = re.compile(case.expected_regex) + passed = bool(pattern.search(answer)) + detail = None if passed else f"regex {case.expected_regex!r} not found" + return ExpectedCheck(mode="regex", expected=case.expected_regex, passed=passed, detail=detail) + if case.expected_contains is not None: + passed = case.expected_contains in answer + detail = None if passed else f"expected to contain {case.expected_contains!r}" + return ExpectedCheck(mode="contains", expected=case.expected_contains, passed=passed, detail=detail) + return None + + +def run_one(case: Case, runner: 
AgentRunner, artifacts_root: Path) -> RunResult: + run_id = uuid.uuid4().hex[:8] + run_dir = artifacts_root / f"{case.id}_{run_id}" + if case.skip: + run_dir.mkdir(parents=True, exist_ok=True) + _save_text(run_dir / "skipped.txt", "Skipped by request") + return RunResult( + id=case.id, + question=case.question, + status="skipped", + answer=None, + error=None, + plan_path=None, + artifacts_dir=str(run_dir), + timings=RunTimings(), + expected_check=None, + ) + artifacts = runner.run_question(case.question, run_id, run_dir) + save_artifacts(artifacts) + + expected_check = _match_expected(case, artifacts.answer) + status = "ok" + if artifacts.error: + status = "error" + elif expected_check and not expected_check.passed: + status = "mismatch" + + plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None + result = RunResult( + id=case.id, + question=case.question, + status=status, + answer=artifacts.answer, + error=artifacts.error, + plan_path=plan_path, + artifacts_dir=str(run_dir), + timings=artifacts.timings, + expected_check=expected_check, + ) + return result + + +def summarize(results: Iterable[RunResult]) -> Dict[str, object]: + totals = {"ok": 0, "error": 0, "mismatch": 0, "skipped": 0} + total_times: List[float] = [] + for res in results: + totals[res.status] = totals.get(res.status, 0) + 1 + if res.timings.total_s is not None: + total_times.append(res.timings.total_s) + + summary: Dict[str, object] = { + "total": sum(totals.values()), + **totals, + } + if total_times: + summary["avg_total_s"] = statistics.fmean(total_times) + summary["median_total_s"] = statistics.median(total_times) + else: + summary["avg_total_s"] = None + summary["median_total_s"] = None + return summary + + +def load_cases(path: Path) -> List[Case]: + if not path.exists(): + raise FileNotFoundError(f"Cases file not found: {path}") + cases: List[Case] = [] + with path.open("r", encoding="utf-8") as f: + for lineno, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + payload = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc + if "id" not in payload or "question" not in payload: + raise ValueError(f"Case on line {lineno} missing required fields 'id' and 'question'") + case = Case( + id=str(payload["id"]), + question=str(payload["question"]), + expected=payload.get("expected"), + expected_regex=payload.get("expected_regex"), + expected_contains=payload.get("expected_contains"), + tags=list(payload.get("tags", []) or []), + skip=bool(payload.get("skip", False)), + ) + cases.append(case) + return cases + + +def format_status_line(result: RunResult) -> str: + timing = f"{result.timings.total_s:.2f}s" if result.timings.total_s is not None else "n/a" + if result.status == "ok": + return f"OK {result.id} {timing}" + if result.status == "skipped": + return f"SKIP {result.id}" + reason = result.error or (result.expected_check.detail if result.expected_check else "") + return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" + + +__all__ = [ + "AgentRunner", + "Case", + "ExpectedCheck", + "RunArtifacts", + "RunResult", + "build_agent", + "format_status_line", + "load_cases", + "run_one", + "save_artifacts", + "summarize", +] From ff05569dc6b200fdaaf30968c5a9ad42bb0f0ec8 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 15:47:28 +0300 Subject: [PATCH 02/92] Fix batch logging path and update CLI 
defaults --- README_demo_qa.md | 5 +- examples/demo_qa/cli.py | 192 +++++++++++++++++++++--- examples/demo_qa/runner.py | 273 ++++++++++++++++++++++++++++++----- tests/test_demo_qa_runner.py | 111 ++++++++++++++ 4 files changed, 526 insertions(+), 55 deletions(-) create mode 100644 tests/test_demo_qa_runner.py diff --git a/README_demo_qa.md b/README_demo_qa.md index 006da3f..c654c01 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -65,8 +65,9 @@ python -m examples.demo_qa.cli batch \ ``` * Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). -* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов. -* Флаги `--fail-on (error|mismatch|any)`, `--max-fails` и `--fail-fast` управляют остановкой и кодом выхода (0/1/2). +* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. +* Флаги `--fail-on (error|mismatch|any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to` и `--only-failed-from` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). +* Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. ## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index f5763de..b49e632 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -2,10 +2,12 @@ import argparse import datetime +import hashlib import json import sys +import uuid from pathlib import Path -from typing import Iterable +from typing import Iterable, Mapping, Optional ROOT = Path(__file__).resolve().parents[2] SRC = ROOT / "src" @@ -17,7 +19,16 @@ from .llm.factory import build_llm from .logging_config import configure_logging from .provider_factory import build_provider -from .runner import RunResult, build_agent, format_status_line, load_cases, run_one, summarize +from .runner import ( + RunResult, + build_agent, + compare_results, + format_status_line, + load_cases, + load_results, + run_one, + summarize, +) from .settings import load_settings @@ -34,12 +45,32 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path -def is_failure(status: str, fail_on: str) -> bool: +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + failure_statuses = {"error", "mismatch", "failed"} if fail_on == "error": - return status == "error" - if fail_on == "mismatch": - return status in {"error", "mismatch"} - return status in {"error", "mismatch", "skipped"} + failure_statuses = {"error"} + elif fail_on == "mismatch": + failure_statuses = {"error", "mismatch", "failed"} + else: + failure_statuses = {"error", "mismatch", "failed", "unchecked"} + if require_assert and status == "unchecked": + return True + return status in failure_statuses + + +def _hash_file(path: Path) -> str: + data = path.read_bytes() + return hashlib.sha256(data).hexdigest() + + +def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: + llm_settings = settings.llm + return { + "base_url": llm_settings.base_url or "https://api.openai.com/v1", + "plan_model": llm_settings.plan_model, + "synth_model": llm_settings.synth_model, + "cases_hash": _hash_file(cases_path), + } def handle_chat(args) -> int: @@ -86,6 +117,9 @@ def handle_chat(args) -> int: def handle_batch(args) -> int: + 
started_at = datetime.datetime.utcnow() + run_id = uuid.uuid4().hex[:8] + try: settings = load_settings(config_path=args.config, data_dir=args.data) except Exception as exc: @@ -97,10 +131,42 @@ def handle_batch(args) -> int: print(f"Cases error: {exc}", file=sys.stderr) return 2 + baseline_for_filter: Optional[Mapping[str, RunResult]] = None + baseline_for_compare: Optional[Mapping[str, RunResult]] = None + + if args.only_failed_from: + try: + baseline_for_filter = load_results(args.only_failed_from) + except Exception as exc: + print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) + return 2 + + if args.compare_to: + try: + if args.only_failed_from and args.compare_to.resolve() == args.only_failed_from.resolve(): + baseline_for_compare = baseline_for_filter + else: + baseline_for_compare = load_results(args.compare_to) + except Exception as exc: + print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) + return 2 + + if baseline_for_filter: + bad_statuses = {"mismatch", "failed", "error"} + if args.require_assert: + bad_statuses.add("unchecked") + target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} + cases = [case for case in cases if case.id in target_ids] + artifacts_dir = args.artifacts_dir if artifacts_dir is None: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - artifacts_dir = args.data / ".runs" / f"batch_{timestamp}" + artifacts_dir = args.data / ".runs" + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + results_path = args.out or (run_folder / "results.jsonl") + artifacts_root = run_folder / "cases" + results_path.parent.mkdir(parents=True, exist_ok=True) + summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) log_dir = args.log_dir or args.data / ".runs" / "logs" @@ -119,23 +185,99 @@ def handle_batch(args) -> int: results: list[RunResult] = [] failures = 0 for case in cases: - result = run_one(case, runner, artifacts_dir) + result = run_one(case, runner, artifacts_root) results.append(result) - print(format_status_line(result)) - if is_failure(result.status, args.fail_on): + if not args.quiet: + print(format_status_line(result)) + if is_failure(result.status, args.fail_on, args.require_assert): failures += 1 if args.fail_fast or (args.max_fails and failures >= args.max_fails): break - write_results(args.out, results) - summary = summarize(results) - summary_path = write_summary(args.out, summary) - print(f"Summary: {json.dumps(summary, ensure_ascii=False)}") - print(f"Results written to: {args.out}") + write_results(results_path, results) + counts = summarize(results) + + results_by_id = {r.id: r for r in results} + diff_block: dict | None = None + baseline_path: Path | None = None + if baseline_for_compare: + baseline_path = args.compare_to or args.only_failed_from + diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) + if baseline_path: + diff["baseline_path"] = str(baseline_path) + diff_block = diff + + failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) + exit_code = 1 if failure_count else 0 + + ended_at = datetime.datetime.utcnow() + duration_ms = int((ended_at - started_at).total_seconds() * 1000) + summary = { + "run_id": run_id, + "started_at": started_at.isoformat() + "Z", + "ended_at": ended_at.isoformat() + "Z", + 
"duration_ms": duration_ms, + "counts": counts, + "exit_code": exit_code, + "config_fingerprint": build_config_fingerprint(settings, args.cases), + "results_path": str(results_path), + "require_assert": args.require_assert, + "fail_on": args.fail_on, + } + if diff_block: + summary["diff"] = diff_block + + summary_path = write_summary(results_path, summary) + + latest_path = run_folder.parent / "latest.txt" + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + + bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) + unchecked = counts.get("unchecked", 0) + if args.require_assert: + bad_count += unchecked + summary_line = ( + f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " + f"OK: {counts.get('ok', 0)} | BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" + ) + + if args.quiet: + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + return exit_code + + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + + failures_list: dict[str, RunResult] = {} + for res in results: + if is_failure(res.status, args.fail_on, args.require_assert) or ( + args.require_assert and res.status == "unchecked" + ): + failures_list[res.id] = res + if failures_list: + print(f"Failures (top {args.show_failures}):") + for res in list(failures_list.values())[: args.show_failures]: + reason = res.reason or res.error or "" + print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") + + print(f"Results written to: {results_path}") print(f"Summary written to: {summary_path}") - failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on)) - return 1 if failure_count else 0 + return exit_code def main() -> None: @@ -163,7 +305,7 @@ def main() -> None: batch_p.add_argument("--schema", type=Path, required=True) batch_p.add_argument("--config", type=Path, default=None, help="Path to demo_qa.toml") batch_p.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") - batch_p.add_argument("--out", type=Path, required=True, help="Path to results jsonl") + batch_p.add_argument("--out", type=Path, required=False, default=None, help="Path to results jsonl") batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") batch_p.add_argument("--enable-semantic", action="store_true") batch_p.add_argument("--log-level", default="INFO", help="Logging level (INFO, DEBUG, etc.)") @@ -178,6 +320,16 @@ def main() -> None: default="mismatch", help="Which statuses should cause a failing exit code", ) + batch_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures") + batch_p.add_argument("--compare-to", type=Path, default=None, help="Path to previous results.jsonl for diff") + batch_p.add_argument( + "--only-failed-from", + type=Path, + default=None, + help="Run only cases that failed/mismatched/errored in a previous results.jsonl", + ) + batch_p.add_argument("--quiet", 
action="store_true", help="Print only summary and exit code") + batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") args = parser.parse_args() diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 2fc17af..a6a9778 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -7,7 +7,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Mapping, Optional from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile @@ -48,11 +48,15 @@ class RunResult: id: str question: str status: str - answer: str | None - error: str | None - plan_path: str | None + checked: bool + reason: str | None + details: Dict[str, object] | None artifacts_dir: str - timings: RunTimings + duration_ms: int + answer: str | None = None + error: str | None = None + plan_path: str | None = None + timings: RunTimings | None = None expected_check: ExpectedCheck | None = None def to_json(self) -> Dict[str, object]: @@ -60,11 +64,15 @@ def to_json(self) -> Dict[str, object]: "id": self.id, "question": self.question, "status": self.status, + "checked": self.checked, + "reason": self.reason, + "details": self.details, + "artifacts_dir": self.artifacts_dir, + "duration_ms": self.duration_ms, "answer": self.answer, "error": self.error, "plan_path": self.plan_path, - "artifacts_dir": self.artifacts_dir, - "timings": self.timings.__dict__, + "timings": self.timings.__dict__ if self.timings else None, } if self.expected_check: payload["expected_check"] = self.expected_check.__dict__ @@ -81,6 +89,10 @@ class Case: tags: List[str] = field(default_factory=list) skip: bool = False + @property + def has_asserts(self) -> bool: + return any([self.expected, self.expected_regex, self.expected_contains]) + class AgentRunner: def __init__(self, llm, provider) -> None: @@ -161,9 +173,18 @@ def save_artifacts(artifacts: RunArtifacts) -> None: _save_text(artifacts.run_dir / "error.txt", artifacts.error) +def save_status(result: RunResult) -> None: + status_path = Path(result.artifacts_dir) / "status.json" + status_path.parent.mkdir(parents=True, exist_ok=True) + _save_json(status_path, result.to_json()) + + def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: + if not case.has_asserts: + return None + expected_value = case.expected or case.expected_regex or case.expected_contains or "" if answer is None: - return ExpectedCheck(mode="none", expected="", passed=False, detail="no answer") + return ExpectedCheck(mode="none", expected=expected_value, passed=False, detail="no answer") if case.expected is not None: passed = answer.strip() == case.expected.strip() detail = None if passed else f"expected={case.expected!r}, got={answer!r}" @@ -180,58 +201,91 @@ def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: return None +def _build_result( + case: Case, artifacts: RunArtifacts, run_dir: Path, expected_check: ExpectedCheck | None +) -> RunResult: + status = "unchecked" + reason: str | None = None + details: Dict[str, object] | None = None + + if artifacts.error: + status = "error" + reason = artifacts.error + details = {"error": artifacts.error} + elif expected_check: + status = "ok" if expected_check.passed else "mismatch" + reason = expected_check.detail + details = {"expected_check": expected_check.__dict__} + else: + reason = "no expectations provided" + details = {"note": 
"no expectations provided"} + + plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None + duration_ms = int((artifacts.timings.total_s or 0.0) * 1000) + return RunResult( + id=case.id, + question=case.question, + status=status, + checked=case.has_asserts, + reason=reason, + details=details, + artifacts_dir=str(run_dir), + duration_ms=duration_ms, + answer=artifacts.answer, + error=artifacts.error, + plan_path=plan_path, + timings=artifacts.timings, + expected_check=expected_check, + ) + + def run_one(case: Case, runner: AgentRunner, artifacts_root: Path) -> RunResult: run_id = uuid.uuid4().hex[:8] run_dir = artifacts_root / f"{case.id}_{run_id}" if case.skip: run_dir.mkdir(parents=True, exist_ok=True) _save_text(run_dir / "skipped.txt", "Skipped by request") - return RunResult( + result = RunResult( id=case.id, question=case.question, status="skipped", + checked=False, + reason="skipped", + details=None, + artifacts_dir=str(run_dir), + duration_ms=0, answer=None, error=None, plan_path=None, - artifacts_dir=str(run_dir), timings=RunTimings(), expected_check=None, ) + save_status(result) + return result + artifacts = runner.run_question(case.question, run_id, run_dir) save_artifacts(artifacts) expected_check = _match_expected(case, artifacts.answer) - status = "ok" - if artifacts.error: - status = "error" - elif expected_check and not expected_check.passed: - status = "mismatch" - - plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None - result = RunResult( - id=case.id, - question=case.question, - status=status, - answer=artifacts.answer, - error=artifacts.error, - plan_path=plan_path, - artifacts_dir=str(run_dir), - timings=artifacts.timings, - expected_check=expected_check, - ) + result = _build_result(case, artifacts, run_dir, expected_check) + save_status(result) return result def summarize(results: Iterable[RunResult]) -> Dict[str, object]: - totals = {"ok": 0, "error": 0, "mismatch": 0, "skipped": 0} + totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0} total_times: List[float] = [] + checked_total = 0 for res in results: totals[res.status] = totals.get(res.status, 0) + 1 - if res.timings.total_s is not None: - total_times.append(res.timings.total_s) + if res.duration_ms is not None: + total_times.append(res.duration_ms / 1000) + if res.checked and res.status in {"ok", "mismatch", "failed", "error"}: + checked_total += 1 summary: Dict[str, object] = { "total": sum(totals.values()), + "checked_total": checked_total, **totals, } if total_times: @@ -271,13 +325,162 @@ def load_cases(path: Path) -> List[Case]: return cases +def _build_timings(payload: Mapping[str, object] | None) -> RunTimings | None: + if not payload: + return None + return RunTimings( + plan_s=payload.get("plan_s"), # type: ignore[arg-type] + fetch_s=payload.get("fetch_s"), # type: ignore[arg-type] + synth_s=payload.get("synth_s"), # type: ignore[arg-type] + total_s=payload.get("total_s"), # type: ignore[arg-type] + ) + + +def _build_expected_check(payload: Mapping[str, object] | None) -> ExpectedCheck | None: + if not payload: + return None + return ExpectedCheck( + mode=str(payload.get("mode", "")), + expected=str(payload.get("expected", "")), + passed=bool(payload.get("passed", False)), + detail=payload.get("detail"), # type: ignore[arg-type] + ) + + +def _duration_from_payload(payload: Mapping[str, object]) -> int: + if "duration_ms" in payload and payload["duration_ms"] is not None: + try: + return int(payload["duration_ms"]) # 
type: ignore[arg-type] + except Exception: + pass + timings = payload.get("timings") + if isinstance(timings, Mapping) and timings.get("total_s") is not None: + try: + return int(float(timings["total_s"]) * 1000) # type: ignore[arg-type] + except Exception: + return 0 + return 0 + + +def _run_result_from_payload(payload: Mapping[str, object]) -> RunResult: + expected_check = _build_expected_check(payload.get("expected_check") if isinstance(payload, Mapping) else None) + timings = _build_timings(payload.get("timings") if isinstance(payload, Mapping) else None) + checked = bool(payload.get("checked", False)) + if expected_check and not checked: + checked = True + status = str(payload.get("status", "error")) + duration_ms = _duration_from_payload(payload) + reason = payload.get("reason") # type: ignore[arg-type] + details = payload.get("details") if isinstance(payload.get("details"), dict) else None + artifacts_dir = str(payload.get("artifacts_dir", "")) + if not artifacts_dir: + raise ValueError("artifacts_dir missing in result payload") + return RunResult( + id=str(payload.get("id", "")), + question=str(payload.get("question", "")), + status=status, + checked=checked, + reason=reason, + details=details, + artifacts_dir=artifacts_dir, + duration_ms=duration_ms, + answer=payload.get("answer"), # type: ignore[arg-type] + error=payload.get("error"), # type: ignore[arg-type] + plan_path=payload.get("plan_path"), # type: ignore[arg-type] + timings=timings, + expected_check=expected_check, + ) + + +def load_results(path: Path) -> Dict[str, RunResult]: + results: Dict[str, RunResult] = {} + if not path.exists(): + raise FileNotFoundError(f"Results file not found: {path}") + with path.open("r", encoding="utf-8") as f: + for lineno, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + payload = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid result JSON on line {lineno}: {exc}") from exc + result = _run_result_from_payload(payload) + results[result.id] = result + return results + + +def _bucket(status: str, checked: bool, require_assert: bool) -> str: + if status == "ok": + return "OK" + if status in {"mismatch", "failed", "error"}: + return "BAD" + if status == "unchecked": + return "BAD" if require_assert else "UNCHECKED" + return "NEUTRAL" + + +def compare_results( + baseline: Mapping[str, RunResult], + current: Mapping[str, RunResult], + *, + require_assert: bool, +) -> Dict[str, object]: + new_ok: List[str] = [] + regressed: List[str] = [] + still_ok: List[str] = [] + still_bad: List[str] = [] + new_unchecked: List[str] = [] + status_changes: Dict[str, Dict[str, str]] = {} + new_cases: List[str] = [] + + for case_id, res in current.items(): + base_res = baseline.get(case_id) + new_bucket = _bucket(res.status, res.checked, require_assert) + if base_res is None: + new_cases.append(case_id) + if new_bucket == "OK": + new_ok.append(case_id) + elif new_bucket == "BAD": + still_bad.append(case_id) + status_changes[case_id] = {"from": "new", "to": res.status} + continue + + base_bucket = _bucket(base_res.status, base_res.checked, require_assert) + if base_res.checked and res.status == "unchecked": + new_unchecked.append(case_id) + if base_bucket == "OK" and new_bucket in {"BAD", "UNCHECKED"}: + regressed.append(case_id) + elif base_bucket in {"BAD", "UNCHECKED"} and new_bucket == "OK": + new_ok.append(case_id) + elif base_bucket == "OK" and new_bucket == "OK": + still_ok.append(case_id) + elif base_bucket in {"BAD", "UNCHECKED"} and 
new_bucket in {"BAD", "UNCHECKED"}: + still_bad.append(case_id) + + if base_res.status != res.status: + status_changes[case_id] = {"from": base_res.status, "to": res.status} + + return { + "new_ok": new_ok, + "regressed": regressed, + "still_ok": still_ok, + "still_bad": still_bad, + "new_unchecked": new_unchecked, + "status_changes": status_changes, + "new_cases": new_cases, + } + + def format_status_line(result: RunResult) -> str: - timing = f"{result.timings.total_s:.2f}s" if result.timings.total_s is not None else "n/a" + timing = f"{result.duration_ms / 1000:.2f}s" if result.status == "ok": return f"OK {result.id} {timing}" if result.status == "skipped": return f"SKIP {result.id}" - reason = result.error or (result.expected_check.detail if result.expected_check else "") + if result.status == "unchecked": + return f"UNCHECKED {result.id} {timing}" + reason = result.reason or "" return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" @@ -288,9 +491,13 @@ def format_status_line(result: RunResult) -> str: "RunArtifacts", "RunResult", "build_agent", + "compare_results", "format_status_line", + "load_results", "load_cases", "run_one", "save_artifacts", + "save_status", "summarize", + "_match_expected", ] diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py new file mode 100644 index 0000000..6bdb95e --- /dev/null +++ b/tests/test_demo_qa_runner.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results + + +def test_match_expected_unchecked_when_no_expectations() -> None: + case = Case(id="c1", question="What is foo?") + assert _match_expected(case, "anything") is None + + +def test_match_expected_contains_pass_and_fail() -> None: + case = Case(id="c2", question="Q", expected_contains="bar") + + match = _match_expected(case, "value bar baz") + assert match is not None + assert match.passed is True + + mismatch = _match_expected(case, "value baz") + assert mismatch is not None + assert mismatch.passed is False + assert "bar" in (mismatch.detail or "") + + missing_answer = _match_expected(case, None) + assert missing_answer is not None + assert missing_answer.passed is False + assert missing_answer.detail == "no answer" + + +def test_compare_results_tracks_regressions_and_improvements() -> None: + baseline = { + "ok_to_bad": RunResult( + id="ok_to_bad", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/ok", + duration_ms=10, + ), + "err_to_ok": RunResult( + id="err_to_ok", + question="", + status="error", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/err", + duration_ms=10, + ), + "checked_to_unchecked": RunResult( + id="checked_to_unchecked", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/ok2", + duration_ms=10, + ), + } + + current = { + "ok_to_bad": RunResult( + id="ok_to_bad", + question="", + status="mismatch", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/ok", + duration_ms=10, + ), + "err_to_ok": RunResult( + id="err_to_ok", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/err", + duration_ms=10, + ), + "checked_to_unchecked": RunResult( + id="checked_to_unchecked", + question="", + status="unchecked", + checked=False, + reason=None, + details=None, + artifacts_dir="/tmp/ok2", + duration_ms=10, + ), + "new_ok": RunResult( + id="new_ok", + question="", + status="ok", + 
checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/new", + duration_ms=10, + ), + } + + diff = compare_results(baseline, current, require_assert=True) + + assert "ok_to_bad" in diff["regressed"] + assert "err_to_ok" in diff["new_ok"] + assert "checked_to_unchecked" in diff["new_unchecked"] + assert "new_ok" in diff["new_ok"] From 80b02390e1e9d74d453475f7da3613f4230c3d19 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 15:47:33 +0300 Subject: [PATCH 03/92] Add regression-friendly batch controls and case helpers --- README_demo_qa.md | 4 +- examples/demo_qa/cli.py | 149 ++++++++++++++++++++++++++++++++--- examples/demo_qa/runner.py | 47 ++++++----- tests/test_demo_qa_runner.py | 42 +++++++++- 4 files changed, 212 insertions(+), 30 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index c654c01..4036a23 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -66,8 +66,10 @@ python -m examples.demo_qa.cli batch \ * Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). * `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. -* Флаги `--fail-on (error|mismatch|any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to` и `--only-failed-from` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). +* Флаги `--fail-on (error|mismatch/unchecked/any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). * Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. +* Быстрый фокус на упавших: `--only-failed` возьмёт `runs/latest/results.jsonl`, `--show-artifacts` печатает пути, репро-команды выводятся для каждого FAIL. +* Команды уровня кейса: `demo_qa case run --cases ...` и `demo_qa case open --run runs/latest` для быстрого воспроизведения. 
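
A minimal sketch of the flow the bullets above describe, assuming the default paths from this patch; `<case_id>` is a placeholder for a real case id:

```bash
# First pass: without --out the results land under demo_data/.runs/runs/<timestamp>_cases/
python -m examples.demo_qa.cli batch \
  --data demo_data \
  --schema demo_data/schema.yaml \
  --cases cases.jsonl \
  --require-assert

# Focused re-run: only the cases that failed in the latest run,
# diffed against that same baseline automatically
python -m examples.demo_qa.cli batch \
  --data demo_data \
  --schema demo_data/schema.yaml \
  --cases cases.jsonl \
  --only-failed --show-artifacts

# Reproduce and inspect a single failing case
python -m examples.demo_qa.cli case run <case_id> --cases cases.jsonl --data demo_data --schema demo_data/schema.yaml
python -m examples.demo_qa.cli case open <case_id> --artifacts-dir demo_data/.runs
```
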
## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index b49e632..f2ae789 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -51,8 +51,10 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: failure_statuses = {"error"} elif fail_on == "mismatch": failure_statuses = {"error", "mismatch", "failed"} - else: + elif fail_on == "unchecked": failure_statuses = {"error", "mismatch", "failed", "unchecked"} + else: + failure_statuses = {"error", "mismatch", "failed", "unchecked", "skipped"} if require_assert and status == "unchecked": return True return status in failure_statuses @@ -73,6 +75,25 @@ def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object] } +def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: + latest_file = artifacts_dir / "runs" / "latest.txt" + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + +def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: + cases_dir = run_path / "cases" + if not cases_dir.exists(): + return None + matches = sorted(cases_dir.glob(f"{case_id}_*")) + if matches: + return matches[-1] + return None + + def handle_chat(args) -> int: try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -116,6 +137,71 @@ def handle_chat(args) -> int: return 0 +def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: + if path is not None: + return path + return _load_latest_run(artifacts_dir) + + +def handle_case_run(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + try: + cases = {c.id: c for c in load_cases(args.cases)} + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + if args.case_id not in cases: + print(f"Case {args.case_id} not found in {args.cases}", file=sys.stderr) + return 2 + + artifacts_dir = args.artifacts_dir or (args.data / ".runs") + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + artifacts_root = run_folder / "cases" + results_path = run_folder / "results.jsonl" + + log_dir = artifacts_dir / "logs" + configure_logging(level="INFO", log_dir=log_dir, to_stderr=True, jsonl=False, run_id=None) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) + write_results(results_path, [result]) + save_path = run_folder.parent / "latest.txt" + save_path.parent.mkdir(parents=True, exist_ok=True) + save_path.write_text(str(run_folder), encoding="utf-8") + + print(format_status_line(result)) + print(f"Artifacts: {result.artifacts_dir}") + return 0 + + +def handle_case_open(args) -> int: + artifacts_dir = args.artifacts_dir or Path(".") / ".runs" + run_path = _resolve_run_path(args.run, artifacts_dir) + if not run_path: + print("No run found. 
Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) + return 2 + case_dir = _find_case_artifact(run_path, args.case_id) + if not case_dir: + print(f"Case {args.case_id} not found under {run_path}", file=sys.stderr) + return 2 + print(f"Case {args.case_id} artifacts: {case_dir}") + plan = case_dir / "plan.json" + answer = case_dir / "answer.txt" + status = case_dir / "status.json" + for path in [plan, answer, status]: + if path.exists(): + print(f"- {path}") + return 0 + + def handle_batch(args) -> int: started_at = datetime.datetime.utcnow() run_id = uuid.uuid4().hex[:8] @@ -134,26 +220,34 @@ def handle_batch(args) -> int: baseline_for_filter: Optional[Mapping[str, RunResult]] = None baseline_for_compare: Optional[Mapping[str, RunResult]] = None - if args.only_failed_from: + baseline_filter_path = args.only_failed_from + if args.only_failed and not baseline_filter_path: + latest = _load_latest_run(args.artifacts_dir or args.data / ".runs") + if latest: + baseline_filter_path = latest / "results.jsonl" + if baseline_filter_path: try: - baseline_for_filter = load_results(args.only_failed_from) + baseline_for_filter = load_results(baseline_filter_path) except Exception as exc: print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) return 2 - if args.compare_to: + compare_path = args.compare_to + if compare_path is None and args.only_failed and baseline_filter_path: + compare_path = baseline_filter_path + if compare_path: try: - if args.only_failed_from and args.compare_to.resolve() == args.only_failed_from.resolve(): + if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): baseline_for_compare = baseline_for_filter else: - baseline_for_compare = load_results(args.compare_to) + baseline_for_compare = load_results(compare_path) except Exception as exc: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 if baseline_for_filter: bad_statuses = {"mismatch", "failed", "error"} - if args.require_assert: + if args.require_assert or args.fail_on == "unchecked": bad_statuses.add("unchecked") target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} cases = [case for case in cases if case.id in target_ids] @@ -185,7 +279,7 @@ def handle_batch(args) -> int: results: list[RunResult] = [] failures = 0 for case in cases: - result = run_one(case, runner, artifacts_root) + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) results.append(result) if not args.quiet: print(format_status_line(result)) @@ -239,7 +333,8 @@ def handle_batch(args) -> int: bad_count += unchecked summary_line = ( f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " - f"OK: {counts.get('ok', 0)} | BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" + f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked OK: {counts.get('unchecked_ok', 0)} | " + f"BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" ) if args.quiet: @@ -272,7 +367,14 @@ def handle_batch(args) -> int: print(f"Failures (top {args.show_failures}):") for res in list(failures_list.values())[: args.show_failures]: reason = res.reason or res.error or "" + repro = ( + f"demo_qa case run {res.id} --cases {args.cases} --data {args.data} " + f"--schema {args.schema}" + (" --plan-only" if args.plan_only else "") + ) print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") + if args.show_artifacts: + print(f" 
artifacts: {res.artifacts_dir}") + print(f" repro: {repro}") print(f"Results written to: {results_path}") print(f"Summary written to: {summary_path}") @@ -316,7 +418,7 @@ def main() -> None: batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") batch_p.add_argument( "--fail-on", - choices=["error", "mismatch", "any"], + choices=["error", "mismatch", "unchecked", "any"], default="mismatch", help="Which statuses should cause a failing exit code", ) @@ -328,8 +430,28 @@ def main() -> None: default=None, help="Run only cases that failed/mismatched/errored in a previous results.jsonl", ) + batch_p.add_argument("--only-failed", action="store_true", help="Use latest run for --only-failed-from automatically") + batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") + batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") + + case_p = sub.add_parser("case", help="Single-case utilities") + case_sub = case_p.add_subparsers(dest="case_command", required=True) + case_run = case_sub.add_parser("run", help="Run a single case by id") + case_run.add_argument("case_id") + case_run.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") + case_run.add_argument("--data", type=Path, required=True) + case_run.add_argument("--schema", type=Path, required=True) + case_run.add_argument("--config", type=Path, default=None) + case_run.add_argument("--enable-semantic", action="store_true") + case_run.add_argument("--artifacts-dir", type=Path, default=None) + case_run.add_argument("--plan-only", action="store_true") + + case_open = case_sub.add_parser("open", help="Show artifacts for a case in a run folder") + case_open.add_argument("case_id") + case_open.add_argument("--run", type=Path, default=None, help="Run folder (defaults to latest)") + case_open.add_argument("--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup") args = parser.parse_args() @@ -342,6 +464,13 @@ def main() -> None: code = handle_chat(args) elif args.command == "batch": code = handle_batch(args) + elif args.command == "case": + if args.case_command == "run": + code = handle_case_run(args) + elif args.case_command == "open": + code = handle_case_open(args) + else: + code = 1 else: code = 0 raise SystemExit(code) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index a6a9778..d9ad5f5 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -41,6 +41,7 @@ class RunArtifacts: raw_synth: str | None = None error: str | None = None timings: RunTimings = field(default_factory=RunTimings) + plan_only: bool = False @dataclass @@ -117,9 +118,9 @@ def saver(feature_name: str, parsed: object) -> None: task_profile=task_profile, ) - def run_question(self, question: str, run_id: str, run_dir: Path) -> RunArtifacts: + def run_question(self, question: str, run_id: str, run_dir: Path, *, plan_only: bool = False) -> RunArtifacts: set_run_id(run_id) - artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question) + artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question, plan_only=plan_only) started = time.perf_counter() try: @@ -128,17 +129,18 @@ def run_question(self, question: str, run_id: str, run_dir: Path) -> 
RunArtifact artifacts.timings.plan_s = time.perf_counter() - plan_started artifacts.plan = plan.model_dump() - fetch_started = time.perf_counter() - ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] - artifacts.timings.fetch_s = time.perf_counter() - fetch_started - artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} - - synth_started = time.perf_counter() - draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] - artifacts.timings.synth_s = time.perf_counter() - synth_started - artifacts.raw_synth = str(draft) - parsed = self.agent.domain_parser(draft) - artifacts.answer = str(parsed) + if not plan_only: + fetch_started = time.perf_counter() + ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] + artifacts.timings.fetch_s = time.perf_counter() - fetch_started + artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} + + synth_started = time.perf_counter() + draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] + artifacts.timings.synth_s = time.perf_counter() - synth_started + artifacts.raw_synth = str(draft) + parsed = self.agent.domain_parser(draft) + artifacts.answer = str(parsed) except Exception as exc: # pragma: no cover - demo fallback artifacts.error = str(exc) finally: @@ -217,8 +219,9 @@ def _build_result( reason = expected_check.detail details = {"expected_check": expected_check.__dict__} else: - reason = "no expectations provided" - details = {"note": "no expectations provided"} + status = "unchecked" + reason = "plan-only" if artifacts.plan_only else "no expectations provided" + details = {"note": reason} plan_path = str(run_dir / "plan.json") if artifacts.plan is not None else None duration_ms = int((artifacts.timings.total_s or 0.0) * 1000) @@ -239,7 +242,7 @@ def _build_result( ) -def run_one(case: Case, runner: AgentRunner, artifacts_root: Path) -> RunResult: +def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: bool = False) -> RunResult: run_id = uuid.uuid4().hex[:8] run_dir = artifacts_root / f"{case.id}_{run_id}" if case.skip: @@ -263,7 +266,7 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path) -> RunResult: save_status(result) return result - artifacts = runner.run_question(case.question, run_id, run_dir) + artifacts = runner.run_question(case.question, run_id, run_dir, plan_only=plan_only) save_artifacts(artifacts) expected_check = _match_expected(case, artifacts.answer) @@ -276,16 +279,24 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0} total_times: List[float] = [] checked_total = 0 + checked_ok = 0 + unchecked_ok = 0 for res in results: totals[res.status] = totals.get(res.status, 0) + 1 if res.duration_ms is not None: total_times.append(res.duration_ms / 1000) if res.checked and res.status in {"ok", "mismatch", "failed", "error"}: checked_total += 1 + if res.status == "ok" and res.checked: + checked_ok += 1 + if res.status == "unchecked": + unchecked_ok += 1 summary: Dict[str, object] = { "total": sum(totals.values()), "checked_total": checked_total, + "checked_ok": checked_ok, + "unchecked_ok": unchecked_ok, **totals, } if total_times: @@ -412,7 +423,7 @@ def load_results(path: Path) -> Dict[str, RunResult]: def _bucket(status: str, checked: bool, require_assert: bool) -> str: if status == "ok": - return "OK" + return "OK" if checked else "UNCHECKED" if status in 
{"mismatch", "failed", "error"}: return "BAD" if status == "unchecked": diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 6bdb95e..d83c562 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -1,6 +1,6 @@ from __future__ import annotations -from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results +from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results, summarize def test_match_expected_unchecked_when_no_expectations() -> None: @@ -109,3 +109,43 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: assert "err_to_ok" in diff["new_ok"] assert "checked_to_unchecked" in diff["new_unchecked"] assert "new_ok" in diff["new_ok"] + + +def test_summarize_counts_checked_and_unchecked() -> None: + results = [ + RunResult( + id="c1", + question="", + status="ok", + checked=True, + reason=None, + details=None, + artifacts_dir="/a", + duration_ms=10, + ), + RunResult( + id="c2", + question="", + status="unchecked", + checked=False, + reason=None, + details=None, + artifacts_dir="/b", + duration_ms=5, + ), + RunResult( + id="c3", + question="", + status="mismatch", + checked=True, + reason=None, + details=None, + artifacts_dir="/c", + duration_ms=7, + ), + ] + + summary = summarize(results) + assert summary["checked_ok"] == 1 + assert summary["unchecked_ok"] == 1 # counts unchecked separately + assert summary["checked_total"] == 2 From f3268695e7267bf073063409951eaefac7b6b8ff Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 15:56:09 +0300 Subject: [PATCH 04/92] Refactor batch CLI into module and tighten regression semantics --- examples/demo_qa/batch.py | 411 +++++++++++++++++++++++++++++++++++ examples/demo_qa/cli.py | 398 ++------------------------------- examples/demo_qa/runner.py | 43 +++- tests/test_demo_qa_runner.py | 2 +- 4 files changed, 466 insertions(+), 388 deletions(-) create mode 100644 examples/demo_qa/batch.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py new file mode 100644 index 0000000..0f002a6 --- /dev/null +++ b/examples/demo_qa/batch.py @@ -0,0 +1,411 @@ +from __future__ import annotations + +import datetime +import hashlib +import json +import sys +import uuid +from pathlib import Path +from typing import Iterable, Mapping, Optional + +from .llm.factory import build_llm +from .logging_config import configure_logging +from .provider_factory import build_provider +from .runner import ( + Case, + RunResult, + build_agent, + compare_results, + format_status_line, + load_cases, + load_results, + run_one, + summarize, +) +from .settings import load_settings + + +def write_results(out_path: Path, results: Iterable[RunResult]) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open("w", encoding="utf-8") as f: + for res in results: + f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") + + +def write_summary(out_path: Path, summary: dict) -> Path: + summary_path = out_path.with_name("summary.json") + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + return summary_path + + +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + bad = {"error", "failed", "mismatch"} + unchecked = {"unchecked", "plan_only"} + if require_assert: + bad |= unchecked + if fail_on == "error": + bad = {"error"} + elif fail_on == "mismatch": + bad = {"mismatch"} + elif fail_on == 
"unchecked": + bad |= unchecked + elif fail_on == "bad": + bad = {"error", "failed", "mismatch"} + if require_assert: + bad |= unchecked + elif fail_on == "any": + bad |= unchecked + elif fail_on == "skipped": + bad |= {"skipped"} + return status in bad + + +def _hash_file(path: Path) -> str: + data = path.read_bytes() + return hashlib.sha256(data).hexdigest() + + +def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: + llm_settings = settings.llm + return { + "base_url": llm_settings.base_url or "https://api.openai.com/v1", + "plan_model": llm_settings.plan_model, + "synth_model": llm_settings.synth_model, + "cases_hash": _hash_file(cases_path), + } + + +def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: + latest_file = artifacts_dir / "runs" / "latest.txt" + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + +def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: + cases_dir = run_path / "cases" + if not cases_dir.exists(): + return None + matches = sorted(cases_dir.glob(f"{case_id}_*")) + if matches: + return matches[-1] + return None + + +def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: + if path is not None: + return path + return _load_latest_run(artifacts_dir) + + +def handle_chat(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + + log_dir = args.log_dir or args.data / ".runs" / "logs" + log_file = configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + llm_settings = settings.llm + llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" + diagnostics = [ + f"LLM endpoint: {llm_endpoint}", + f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", + f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", + f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " + f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", + ] + if args.enable_semantic: + diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") + else: + diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") + + llm = build_llm(settings) + + from .chat_repl import start_repl + + start_repl( + args.data, + args.schema, + llm, + enable_semantic=args.enable_semantic, + log_file=log_file, + diagnostics=diagnostics, + ) + return 0 + + +def _select_cases_for_rerun( + cases: list[Case], + baseline_for_filter: Optional[Mapping[str, RunResult]], + *, + require_assert: bool, + fail_on: str, +) -> list[Case]: + if not baseline_for_filter: + return cases + bad_statuses = {"mismatch", "failed", "error"} + if require_assert or fail_on in {"unchecked", "any"}: + bad_statuses |= {"unchecked", "plan_only"} + target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} + return [case for case in cases if case.id in target_ids] + + +def handle_batch(args) -> int: + started_at = datetime.datetime.utcnow() + run_id = uuid.uuid4().hex[:8] + + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + 
try: + cases = load_cases(args.cases) + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + + baseline_for_filter: Optional[Mapping[str, RunResult]] = None + baseline_for_compare: Optional[Mapping[str, RunResult]] = None + + artifacts_dir = args.artifacts_dir + if artifacts_dir is None: + artifacts_dir = args.data / ".runs" + + baseline_filter_path = args.only_failed_from + if args.only_failed and not baseline_filter_path: + latest = _load_latest_run(artifacts_dir) + if latest: + baseline_filter_path = latest / "results.jsonl" + if baseline_filter_path: + try: + baseline_for_filter = load_results(baseline_filter_path) + except Exception as exc: + print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) + return 2 + + compare_path = args.compare_to + if compare_path is None and args.only_failed and baseline_filter_path: + compare_path = baseline_filter_path + if compare_path: + try: + if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): + baseline_for_compare = baseline_for_filter + else: + baseline_for_compare = load_results(compare_path) + except Exception as exc: + print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) + return 2 + + cases = _select_cases_for_rerun( + cases, baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on + ) + + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + results_path = args.out or (run_folder / "results.jsonl") + artifacts_root = run_folder / "cases" + results_path.parent.mkdir(parents=True, exist_ok=True) + summary_path = results_path.with_name("summary.json") + artifacts_dir.mkdir(parents=True, exist_ok=True) + + log_dir = args.log_dir or args.data / ".runs" / "logs" + configure_logging( + level=args.log_level, + log_dir=log_dir, + to_stderr=args.log_stderr, + jsonl=args.log_jsonl, + run_id=None, + ) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + results: list[RunResult] = [] + failures = 0 + for case in cases: + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) + results.append(result) + if not args.quiet: + print(format_status_line(result)) + if is_failure(result.status, args.fail_on, args.require_assert): + failures += 1 + if args.fail_fast or (args.max_fails and failures >= args.max_fails): + break + + write_results(results_path, results) + counts = summarize(results) + + results_by_id = {r.id: r for r in results} + diff_block: dict | None = None + baseline_path: Path | None = None + if baseline_for_compare: + baseline_path = args.compare_to or baseline_filter_path + diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) + if baseline_path: + diff["baseline_path"] = str(baseline_path) + diff_block = diff + + failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) + exit_code = 1 if failure_count else 0 + + ended_at = datetime.datetime.utcnow() + duration_ms = int((ended_at - started_at).total_seconds() * 1000) + summary = { + "run_id": run_id, + "started_at": started_at.isoformat() + "Z", + "ended_at": ended_at.isoformat() + "Z", + "duration_ms": duration_ms, + "counts": counts, + "exit_code": exit_code, + "config_fingerprint": build_config_fingerprint(settings, args.cases), + "results_path": 
str(results_path), + "require_assert": args.require_assert, + "fail_on": args.fail_on, + } + if diff_block: + summary["diff"] = diff_block + + summary_path = write_summary(results_path, summary) + + latest_path = run_folder.parent / "latest.txt" + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + + bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) + unchecked = counts.get("unchecked", 0) + plan_only = counts.get("plan_only", 0) + if args.require_assert or args.fail_on in {"unchecked", "any"}: + bad_count += unchecked + plan_only + summary_line = ( + f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " + f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked(no-assert): {unchecked} | " + f"Plan-only: {plan_only} | BAD: {bad_count} | Skipped: {counts.get('skipped', 0)}" + ) + + if args.quiet: + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + return exit_code + + print(summary_line) + if diff_block: + print( + f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " + f"-{len(diff_block.get('regressed', []))} regressions, " + f"{len(diff_block.get('still_bad', []))} still failing, " + f"{len(diff_block.get('new_unchecked', []))} new unchecked" + ) + + failures_list: dict[str, RunResult] = {} + for res in results: + if is_failure(res.status, args.fail_on, args.require_assert): + failures_list[res.id] = res + if failures_list: + print(f"Failures (top {args.show_failures}):") + for res in list(failures_list.values())[: args.show_failures]: + reason = res.reason or res.error or "" + repro = ( + f"python -m examples.demo_qa.cli case run {res.id} --cases {args.cases} --data {args.data} " + f"--schema {args.schema}" + (" --plan-only" if args.plan_only else "") + ) + print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") + if args.show_artifacts: + print(f" artifacts: {res.artifacts_dir}") + print(f" repro: {repro}") + + print(f"Results written to: {results_path}") + print(f"Summary written to: {summary_path}") + + return exit_code + + +def handle_case_run(args) -> int: + try: + settings = load_settings(config_path=args.config, data_dir=args.data) + except Exception as exc: + print(f"Configuration error: {exc}", file=sys.stderr) + return 2 + try: + cases = {c.id: c for c in load_cases(args.cases)} + except Exception as exc: + print(f"Cases error: {exc}", file=sys.stderr) + return 2 + if args.case_id not in cases: + print(f"Case {args.case_id} not found in {args.cases}", file=sys.stderr) + return 2 + + artifacts_dir = args.artifacts_dir or (args.data / ".runs") + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + artifacts_root = run_folder / "cases" + results_path = run_folder / "results.jsonl" + + log_dir = artifacts_dir / "logs" + configure_logging(level="INFO", log_dir=log_dir, to_stderr=True, jsonl=False, run_id=None) + + provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + llm = build_llm(settings) + runner = build_agent(llm, provider) + + result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) + write_results(results_path, [result]) + 
save_path = run_folder.parent / "latest.txt" + save_path.parent.mkdir(parents=True, exist_ok=True) + save_path.write_text(str(run_folder), encoding="utf-8") + + print(format_status_line(result)) + print(f"Artifacts: {result.artifacts_dir}") + return 0 + + +def handle_case_open(args) -> int: + artifacts_dir = args.artifacts_dir or (args.data / ".runs") + run_path = _resolve_run_path(args.run, artifacts_dir) + if not run_path: + print("No run found. Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) + return 2 + case_dir = _find_case_artifact(run_path, args.case_id) + if not case_dir: + print(f"Case {args.case_id} not found under {run_path}", file=sys.stderr) + return 2 + print(f"Case {args.case_id} artifacts: {case_dir}") + plan = case_dir / "plan.json" + answer = case_dir / "answer.txt" + status = case_dir / "status.json" + for path in [plan, answer, status]: + if path.exists(): + print(f"- {path}") + return 0 + + +__all__ = [ + "handle_batch", + "handle_case_open", + "handle_case_run", + "handle_chat", + "is_failure", + "write_results", + "write_summary", + "_load_latest_run", + "_find_case_artifact", + "build_config_fingerprint", +] diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index f2ae789..dec73ff 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -1,388 +1,26 @@ from __future__ import annotations import argparse -import datetime -import hashlib -import json import sys -import uuid from pathlib import Path -from typing import Iterable, Mapping, Optional ROOT = Path(__file__).resolve().parents[2] SRC = ROOT / "src" -if str(SRC) not in sys.path: - sys.path.insert(0, str(SRC)) -from .chat_repl import start_repl -from .data_gen import generate_and_save -from .llm.factory import build_llm -from .logging_config import configure_logging -from .provider_factory import build_provider -from .runner import ( - RunResult, - build_agent, - compare_results, - format_status_line, - load_cases, - load_results, - run_one, - summarize, -) -from .settings import load_settings +def ensure_repo_imports() -> None: + """Ensure local src/ is on sys.path for demo entrypoints.""" + if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) -def write_results(out_path: Path, results: Iterable[RunResult]) -> None: - out_path.parent.mkdir(parents=True, exist_ok=True) - with out_path.open("w", encoding="utf-8") as f: - for res in results: - f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") +ensure_repo_imports() -def write_summary(out_path: Path, summary: dict) -> Path: - summary_path = out_path.with_name("summary.json") - summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") - return summary_path +from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat # noqa: E402 +from .data_gen import generate_and_save # noqa: E402 -def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: - failure_statuses = {"error", "mismatch", "failed"} - if fail_on == "error": - failure_statuses = {"error"} - elif fail_on == "mismatch": - failure_statuses = {"error", "mismatch", "failed"} - elif fail_on == "unchecked": - failure_statuses = {"error", "mismatch", "failed", "unchecked"} - else: - failure_statuses = {"error", "mismatch", "failed", "unchecked", "skipped"} - if require_assert and status == "unchecked": - return True - return status in failure_statuses - - -def _hash_file(path: Path) -> str: - data = path.read_bytes() - return hashlib.sha256(data).hexdigest() - - -def 
build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: - llm_settings = settings.llm - return { - "base_url": llm_settings.base_url or "https://api.openai.com/v1", - "plan_model": llm_settings.plan_model, - "synth_model": llm_settings.synth_model, - "cases_hash": _hash_file(cases_path), - } - - -def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: - latest_file = artifacts_dir / "runs" / "latest.txt" - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - return None - - -def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: - cases_dir = run_path / "cases" - if not cases_dir.exists(): - return None - matches = sorted(cases_dir.glob(f"{case_id}_*")) - if matches: - return matches[-1] - return None - - -def handle_chat(args) -> int: - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - print(f"Configuration error: {exc}", file=sys.stderr) - return 2 - - log_dir = args.log_dir or args.data / ".runs" / "logs" - log_file = configure_logging( - level=args.log_level, - log_dir=log_dir, - to_stderr=args.log_stderr, - jsonl=args.log_jsonl, - run_id=None, - ) - - llm_settings = settings.llm - llm_endpoint = llm_settings.base_url or "https://api.openai.com/v1" - diagnostics = [ - f"LLM endpoint: {llm_endpoint}", - f"Plan model: {llm_settings.plan_model} (temp={llm_settings.plan_temperature})", - f"Synth model: {llm_settings.synth_model} (temp={llm_settings.synth_temperature})", - f"Timeout: {llm_settings.timeout_s if llm_settings.timeout_s is not None else 'default'}, " - f"Retries: {llm_settings.retries if llm_settings.retries is not None else 'default'}", - ] - if args.enable_semantic: - diagnostics.append(f"Embeddings: CSV semantic backend in {args.data} (*.embeddings.json)") - else: - diagnostics.append("Embeddings: disabled (use --enable-semantic to build/search embeddings).") - - llm = build_llm(settings) - - start_repl( - args.data, - args.schema, - llm, - enable_semantic=args.enable_semantic, - log_file=log_file, - diagnostics=diagnostics, - ) - return 0 - - -def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: - if path is not None: - return path - return _load_latest_run(artifacts_dir) - - -def handle_case_run(args) -> int: - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - print(f"Configuration error: {exc}", file=sys.stderr) - return 2 - try: - cases = {c.id: c for c in load_cases(args.cases)} - except Exception as exc: - print(f"Cases error: {exc}", file=sys.stderr) - return 2 - if args.case_id not in cases: - print(f"Case {args.case_id} not found in {args.cases}", file=sys.stderr) - return 2 - - artifacts_dir = args.artifacts_dir or (args.data / ".runs") - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" - artifacts_root = run_folder / "cases" - results_path = run_folder / "results.jsonl" - - log_dir = artifacts_dir / "logs" - configure_logging(level="INFO", log_dir=log_dir, to_stderr=True, jsonl=False, run_id=None) - - provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) - llm = build_llm(settings) - runner = build_agent(llm, provider) - - result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) - write_results(results_path, [result]) - save_path = run_folder.parent / 
"latest.txt" - save_path.parent.mkdir(parents=True, exist_ok=True) - save_path.write_text(str(run_folder), encoding="utf-8") - - print(format_status_line(result)) - print(f"Artifacts: {result.artifacts_dir}") - return 0 - - -def handle_case_open(args) -> int: - artifacts_dir = args.artifacts_dir or Path(".") / ".runs" - run_path = _resolve_run_path(args.run, artifacts_dir) - if not run_path: - print("No run found. Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) - return 2 - case_dir = _find_case_artifact(run_path, args.case_id) - if not case_dir: - print(f"Case {args.case_id} not found under {run_path}", file=sys.stderr) - return 2 - print(f"Case {args.case_id} artifacts: {case_dir}") - plan = case_dir / "plan.json" - answer = case_dir / "answer.txt" - status = case_dir / "status.json" - for path in [plan, answer, status]: - if path.exists(): - print(f"- {path}") - return 0 - - -def handle_batch(args) -> int: - started_at = datetime.datetime.utcnow() - run_id = uuid.uuid4().hex[:8] - - try: - settings = load_settings(config_path=args.config, data_dir=args.data) - except Exception as exc: - print(f"Configuration error: {exc}", file=sys.stderr) - return 2 - try: - cases = load_cases(args.cases) - except Exception as exc: - print(f"Cases error: {exc}", file=sys.stderr) - return 2 - - baseline_for_filter: Optional[Mapping[str, RunResult]] = None - baseline_for_compare: Optional[Mapping[str, RunResult]] = None - - baseline_filter_path = args.only_failed_from - if args.only_failed and not baseline_filter_path: - latest = _load_latest_run(args.artifacts_dir or args.data / ".runs") - if latest: - baseline_filter_path = latest / "results.jsonl" - if baseline_filter_path: - try: - baseline_for_filter = load_results(baseline_filter_path) - except Exception as exc: - print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) - return 2 - - compare_path = args.compare_to - if compare_path is None and args.only_failed and baseline_filter_path: - compare_path = baseline_filter_path - if compare_path: - try: - if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): - baseline_for_compare = baseline_for_filter - else: - baseline_for_compare = load_results(compare_path) - except Exception as exc: - print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) - return 2 - - if baseline_for_filter: - bad_statuses = {"mismatch", "failed", "error"} - if args.require_assert or args.fail_on == "unchecked": - bad_statuses.add("unchecked") - target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} - cases = [case for case in cases if case.id in target_ids] - - artifacts_dir = args.artifacts_dir - if artifacts_dir is None: - artifacts_dir = args.data / ".runs" - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" - results_path = args.out or (run_folder / "results.jsonl") - artifacts_root = run_folder / "cases" - results_path.parent.mkdir(parents=True, exist_ok=True) - summary_path = results_path.with_name("summary.json") - artifacts_dir.mkdir(parents=True, exist_ok=True) - - log_dir = args.log_dir or args.data / ".runs" / "logs" - configure_logging( - level=args.log_level, - log_dir=log_dir, - to_stderr=args.log_stderr, - jsonl=args.log_jsonl, - run_id=None, - ) - - provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) - llm = build_llm(settings) - runner = 
build_agent(llm, provider) - - results: list[RunResult] = [] - failures = 0 - for case in cases: - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) - results.append(result) - if not args.quiet: - print(format_status_line(result)) - if is_failure(result.status, args.fail_on, args.require_assert): - failures += 1 - if args.fail_fast or (args.max_fails and failures >= args.max_fails): - break - - write_results(results_path, results) - counts = summarize(results) - - results_by_id = {r.id: r for r in results} - diff_block: dict | None = None - baseline_path: Path | None = None - if baseline_for_compare: - baseline_path = args.compare_to or args.only_failed_from - diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) - if baseline_path: - diff["baseline_path"] = str(baseline_path) - diff_block = diff - - failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) - exit_code = 1 if failure_count else 0 - - ended_at = datetime.datetime.utcnow() - duration_ms = int((ended_at - started_at).total_seconds() * 1000) - summary = { - "run_id": run_id, - "started_at": started_at.isoformat() + "Z", - "ended_at": ended_at.isoformat() + "Z", - "duration_ms": duration_ms, - "counts": counts, - "exit_code": exit_code, - "config_fingerprint": build_config_fingerprint(settings, args.cases), - "results_path": str(results_path), - "require_assert": args.require_assert, - "fail_on": args.fail_on, - } - if diff_block: - summary["diff"] = diff_block - - summary_path = write_summary(results_path, summary) - - latest_path = run_folder.parent / "latest.txt" - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - - bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) - unchecked = counts.get("unchecked", 0) - if args.require_assert: - bad_count += unchecked - summary_line = ( - f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " - f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked OK: {counts.get('unchecked_ok', 0)} | " - f"BAD: {bad_count} | Unchecked: {unchecked} | Skipped: {counts.get('skipped', 0)}" - ) - - if args.quiet: - print(summary_line) - if diff_block: - print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" - ) - return exit_code - - print(summary_line) - if diff_block: - print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" - ) - - failures_list: dict[str, RunResult] = {} - for res in results: - if is_failure(res.status, args.fail_on, args.require_assert) or ( - args.require_assert and res.status == "unchecked" - ): - failures_list[res.id] = res - if failures_list: - print(f"Failures (top {args.show_failures}):") - for res in list(failures_list.values())[: args.show_failures]: - reason = res.reason or res.error or "" - repro = ( - f"demo_qa case run {res.id} --cases {args.cases} --data {args.data} " - f"--schema {args.schema}" + (" --plan-only" if args.plan_only else "") - ) - print(f"- {res.id}: {res.status} ({reason}) [{res.artifacts_dir}]") - if args.show_artifacts: - 
print(f" artifacts: {res.artifacts_dir}") - print(f" repro: {repro}") - - print(f"Results written to: {results_path}") - print(f"Summary written to: {summary_path}") - - return exit_code - - -def main() -> None: +def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Demo QA harness for fetchgraph") sub = parser.add_subparsers(dest="command", required=True) @@ -418,8 +56,8 @@ def main() -> None: batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") batch_p.add_argument( "--fail-on", - choices=["error", "mismatch", "unchecked", "any"], - default="mismatch", + choices=["error", "mismatch", "bad", "unchecked", "any", "skipped"], + default="bad", help="Which statuses should cause a failing exit code", ) batch_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures") @@ -436,8 +74,10 @@ def main() -> None: batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") - case_p = sub.add_parser("case", help="Single-case utilities") - case_sub = case_p.add_subparsers(dest="case_command", required=True) + case_p = sub.add_subparsers(dest="case_command") + case_root = sub.add_parser("case", help="Single-case utilities") + case_sub = case_root.add_subparsers(dest="case_command", required=True) + case_run = case_sub.add_parser("run", help="Run a single case by id") case_run.add_argument("case_id") case_run.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") @@ -450,9 +90,17 @@ def main() -> None: case_open = case_sub.add_parser("open", help="Show artifacts for a case in a run folder") case_open.add_argument("case_id") + case_open.add_argument("--data", type=Path, required=True) case_open.add_argument("--run", type=Path, default=None, help="Run folder (defaults to latest)") - case_open.add_argument("--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup") + case_open.add_argument( + "--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup (default data/.runs)" + ) + return parser + + +def main() -> None: + parser = build_parser() args = parser.parse_args() if args.command == "gen": diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index d9ad5f5..14321a2 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -219,7 +219,7 @@ def _build_result( reason = expected_check.detail details = {"expected_check": expected_check.__dict__} else: - status = "unchecked" + status = "plan_only" if artifacts.plan_only else "unchecked" reason = "plan-only" if artifacts.plan_only else "no expectations provided" details = {"note": reason} @@ -276,11 +276,12 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: def summarize(results: Iterable[RunResult]) -> Dict[str, object]: - totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0} + totals = {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0, "plan_only": 0} total_times: List[float] = [] checked_total = 0 checked_ok = 0 - unchecked_ok = 0 + unchecked_no_assert = 0 + plan_only = 0 for res in results: totals[res.status] = totals.get(res.status, 0) + 1 if res.duration_ms is not None: @@ -290,13 +291,16 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: if res.status == "ok" and res.checked: 
checked_ok += 1 if res.status == "unchecked": - unchecked_ok += 1 + unchecked_no_assert += 1 + if res.status == "plan_only": + plan_only += 1 summary: Dict[str, object] = { "total": sum(totals.values()), "checked_total": checked_total, "checked_ok": checked_ok, - "unchecked_ok": unchecked_ok, + "unchecked_no_assert": unchecked_no_assert, + "plan_only": plan_only, **totals, } if total_times: @@ -312,6 +316,7 @@ def load_cases(path: Path) -> List[Case]: if not path.exists(): raise FileNotFoundError(f"Cases file not found: {path}") cases: List[Case] = [] + seen_ids: set[str] = set() with path.open("r", encoding="utf-8") as f: for lineno, line in enumerate(f, start=1): line = line.strip() @@ -323,12 +328,26 @@ def load_cases(path: Path) -> List[Case]: raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc if "id" not in payload or "question" not in payload: raise ValueError(f"Case on line {lineno} missing required fields 'id' and 'question'") + case_id = str(payload["id"]) + if case_id in seen_ids: + raise ValueError(f"Duplicate case id {case_id!r} on line {lineno}") + seen_ids.add(case_id) + expected = payload.get("expected") + expected_regex = payload.get("expected_regex") + expected_contains = payload.get("expected_contains") + for field_name, val in [ + ("expected", expected), + ("expected_regex", expected_regex), + ("expected_contains", expected_contains), + ]: + if val is not None and str(val).strip() == "": + raise ValueError(f"{field_name} must not be empty on line {lineno}") case = Case( - id=str(payload["id"]), + id=case_id, question=str(payload["question"]), - expected=payload.get("expected"), - expected_regex=payload.get("expected_regex"), - expected_contains=payload.get("expected_contains"), + expected=expected, + expected_regex=expected_regex, + expected_contains=expected_contains, tags=list(payload.get("tags", []) or []), skip=bool(payload.get("skip", False)), ) @@ -426,7 +445,7 @@ def _bucket(status: str, checked: bool, require_assert: bool) -> str: return "OK" if checked else "UNCHECKED" if status in {"mismatch", "failed", "error"}: return "BAD" - if status == "unchecked": + if status in {"unchecked", "plan_only"}: return "BAD" if require_assert else "UNCHECKED" return "NEUTRAL" @@ -489,8 +508,8 @@ def format_status_line(result: RunResult) -> str: return f"OK {result.id} {timing}" if result.status == "skipped": return f"SKIP {result.id}" - if result.status == "unchecked": - return f"UNCHECKED {result.id} {timing}" + if result.status in {"unchecked", "plan_only"}: + return f"{result.status.upper()} {result.id} {timing}" reason = result.reason or "" return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index d83c562..337c6d2 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -147,5 +147,5 @@ def test_summarize_counts_checked_and_unchecked() -> None: summary = summarize(results) assert summary["checked_ok"] == 1 - assert summary["unchecked_ok"] == 1 # counts unchecked separately + assert summary["unchecked_no_assert"] == 1 # counts unchecked separately assert summary["checked_total"] == 2 From 84451eaca3ceb45258844b52b51bf7d610562a1a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:19:41 +0300 Subject: [PATCH 05/92] Fix case command parser construction --- examples/demo_qa/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/demo_qa/cli.py 
b/examples/demo_qa/cli.py index dec73ff..b1e8119 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -74,7 +74,6 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") - case_p = sub.add_subparsers(dest="case_command") case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) From b921779d511bfa24020c18400ea6740d63bed5ad Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:38:10 +0300 Subject: [PATCH 06/92] Honor custom result paths when rerunning failed cases --- examples/demo_qa/batch.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 0f002a6..094fcda 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -84,6 +84,15 @@ def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: return None +def _load_latest_results(artifacts_dir: Path) -> Optional[Path]: + latest_file = artifacts_dir / "runs" / "latest_results.txt" + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: cases_dir = run_path / "cases" if not cases_dir.exists(): @@ -185,9 +194,15 @@ def handle_batch(args) -> int: baseline_filter_path = args.only_failed_from if args.only_failed and not baseline_filter_path: - latest = _load_latest_run(artifacts_dir) - if latest: - baseline_filter_path = latest / "results.jsonl" + latest_results = _load_latest_results(artifacts_dir) + if latest_results: + baseline_filter_path = latest_results + else: + latest_run = _load_latest_run(artifacts_dir) + if latest_run: + candidate = latest_run / "results.jsonl" + if candidate.exists(): + baseline_filter_path = candidate if baseline_filter_path: try: baseline_for_filter = load_results(baseline_filter_path) @@ -281,8 +296,10 @@ def handle_batch(args) -> int: summary_path = write_summary(results_path, summary) latest_path = run_folder.parent / "latest.txt" + latest_results_path = run_folder.parent / "latest_results.txt" latest_path.parent.mkdir(parents=True, exist_ok=True) latest_path.write_text(str(run_folder), encoding="utf-8") + latest_results_path.write_text(str(results_path), encoding="utf-8") bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) From 1adb196d312bbfd3fcb0b04cf0c097d2de7209cf Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 16:50:34 +0300 Subject: [PATCH 07/92] Keep errors failing when using --fail-on mismatch --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 094fcda..e8f7b61 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -46,7 +46,7 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: if fail_on == "error": bad = {"error"} elif fail_on == "mismatch": - bad = {"mismatch"} + bad = {"error", "failed", "mismatch"} elif fail_on == "unchecked": bad |= unchecked elif fail_on == 
"bad": From 9be87f2c67be493aefee7f86499f4f7267dcaa1d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 17:09:17 +0300 Subject: [PATCH 08/92] Respect --require-assert when fail_on is mismatch/error --- examples/demo_qa/batch.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index e8f7b61..5667cce 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -39,10 +39,8 @@ def write_summary(out_path: Path, summary: dict) -> Path: def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: - bad = {"error", "failed", "mismatch"} unchecked = {"unchecked", "plan_only"} - if require_assert: - bad |= unchecked + bad = {"error", "failed", "mismatch"} if fail_on == "error": bad = {"error"} elif fail_on == "mismatch": @@ -51,12 +49,14 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: bad |= unchecked elif fail_on == "bad": bad = {"error", "failed", "mismatch"} - if require_assert: - bad |= unchecked elif fail_on == "any": bad |= unchecked elif fail_on == "skipped": bad |= {"skipped"} + + if require_assert: + bad |= unchecked + return status in bad From b96ec141d9909ba9d48b7d6bb5678246d1efb163 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 17:16:32 +0300 Subject: [PATCH 09/92] Keep plan-only runs from tripping expected checks --- examples/demo_qa/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 14321a2..0a980b9 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -269,7 +269,7 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: artifacts = runner.run_question(case.question, run_id, run_dir, plan_only=plan_only) save_artifacts(artifacts) - expected_check = _match_expected(case, artifacts.answer) + expected_check = None if plan_only else _match_expected(case, artifacts.answer) result = _build_result(case, artifacts, run_dir, expected_check) save_status(result) return result From acf0dc78ab7dc8705efa0a629202ef32cc1be33d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 17:28:33 +0300 Subject: [PATCH 10/92] Align rerun filtering with fail_on modes --- examples/demo_qa/batch.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 5667cce..656a0cf 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -164,7 +164,20 @@ def _select_cases_for_rerun( if not baseline_for_filter: return cases bad_statuses = {"mismatch", "failed", "error"} - if require_assert or fail_on in {"unchecked", "any"}: + if fail_on == "error": + bad_statuses = {"error"} + elif fail_on == "mismatch": + bad_statuses = {"mismatch", "failed", "error"} + elif fail_on == "unchecked": + bad_statuses |= {"unchecked", "plan_only"} + elif fail_on == "bad": + bad_statuses = {"mismatch", "failed", "error"} + elif fail_on == "any": + bad_statuses |= {"unchecked", "plan_only"} + elif fail_on == "skipped": + bad_statuses |= {"skipped"} + + if require_assert: bad_statuses |= {"unchecked", "plan_only"} target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} return [case for case in cases if 
case.id in target_ids] From 6f18d39840a27005f1459f512d55ba4be049bf0c Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:16:46 +0300 Subject: [PATCH 11/92] Fallback to summary results path for latest reruns --- examples/demo_qa/batch.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 656a0cf..dc5fbe0 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -90,6 +90,17 @@ def _load_latest_results(artifacts_dir: Path) -> Optional[Path]: content = latest_file.read_text(encoding="utf-8").strip() if content: return Path(content) + latest_run = _load_latest_run(artifacts_dir) + if latest_run: + summary_path = latest_run / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + results_path = summary.get("results_path") + if results_path: + return Path(results_path) + except Exception: + pass return None From b204997f7e8f1a0237d9636bc34d6ae248dd9b40 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:25:41 +0300 Subject: [PATCH 12/92] Add run registry metadata and stats command --- examples/demo_qa/batch.py | 157 ++++++++++++++++++++++++++++++++++++++ examples/demo_qa/cli.py | 11 ++- 2 files changed, 167 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index dc5fbe0..b6e0ac5 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -3,6 +3,8 @@ import datetime import hashlib import json +import platform +import subprocess import sys import uuid from pathlib import Path @@ -38,6 +40,15 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path +def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: + total = int(counts.get("total", 0) or 0) + skipped = int(counts.get("skipped", 0) or 0) + denom = total - skipped + if denom <= 0: + return None + return (counts.get("ok", 0) or 0) / denom + + def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: unchecked = {"unchecked", "plan_only"} bad = {"error", "failed", "mismatch"} @@ -75,6 +86,30 @@ def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object] } +def _fingerprint_dir(data_dir: Path) -> Mapping[str, object]: + files: list[dict] = [] + for path in sorted(data_dir.rglob("*")): + if path.is_file(): + stat = path.stat() + files.append( + { + "path": str(path.relative_to(data_dir)), + "size": stat.st_size, + "mtime": stat.st_mtime, + } + ) + digest = hashlib.sha256(json.dumps(files, sort_keys=True).encode("utf-8")).hexdigest() + return {"hash": digest, "files": files} + + +def _git_sha() -> Optional[str]: + try: + result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) + except Exception: + return None + return result.stdout.strip() or None + + def _load_latest_run(artifacts_dir: Path) -> Optional[Path]: latest_file = artifacts_dir / "runs" / "latest.txt" if latest_file.exists(): @@ -258,6 +293,7 @@ def handle_batch(args) -> int: results_path.parent.mkdir(parents=True, exist_ok=True) summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) + history_path = args.history or (args.data / ".runs" / "history.jsonl") log_dir = args.log_dir or args.data / ".runs" / "logs" configure_logging( @@ -325,6 +361,61 @@ def 
handle_batch(args) -> int: latest_path.write_text(str(run_folder), encoding="utf-8") latest_results_path.write_text(str(results_path), encoding="utf-8") + config_hash = _hash_file(args.config) if args.config else None + schema_hash = _hash_file(args.schema) + cases_hash = _hash_file(args.cases) + data_fingerprint = _fingerprint_dir(args.data) + llm_settings = settings.llm + run_meta = { + "run_id": run_id, + "timestamp": started_at.isoformat() + "Z", + "cases_path": str(args.cases), + "cases_hash": cases_hash, + "config_path": str(args.config) if args.config else None, + "config_hash": config_hash, + "schema_path": str(args.schema), + "schema_hash": schema_hash, + "data_dir": str(args.data), + "data_fingerprint": data_fingerprint, + "llm": { + "plan_model": llm_settings.plan_model, + "synth_model": llm_settings.synth_model, + "plan_temperature": llm_settings.plan_temperature, + "synth_temperature": llm_settings.synth_temperature, + "base_url": llm_settings.base_url or "https://api.openai.com/v1", + }, + "enable_semantic": args.enable_semantic, + "embedding_model": None, + "git_sha": _git_sha(), + "python_version": sys.version, + "platform": platform.platform(), + "results_path": str(results_path), + "summary_path": str(summary_path), + "run_dir": str(run_folder), + } + (run_folder / "run_meta.json").write_text(json.dumps(run_meta, ensure_ascii=False, indent=2), encoding="utf-8") + + prate = _pass_rate(counts) + history_entry = { + "run_id": run_id, + "timestamp": started_at.isoformat() + "Z", + "config_hash": config_hash, + "schema_hash": schema_hash, + "cases_hash": cases_hash, + "ok": counts.get("ok", 0), + "mismatch": counts.get("mismatch", 0), + "error": counts.get("error", 0), + "skipped": counts.get("skipped", 0), + "pass_rate": prate, + "avg_total_s": counts.get("avg_total_s"), + "median_total_s": counts.get("median_total_s"), + "run_dir": str(run_folder), + "results_path": str(results_path), + } + history_path.parent.mkdir(parents=True, exist_ok=True) + with history_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(history_entry, ensure_ascii=False) + "\n") + bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) plan_only = counts.get("plan_only", 0) @@ -438,6 +529,72 @@ def handle_case_open(args) -> int: return 0 +def _load_history(history_path: Path) -> list[dict]: + if not history_path.exists(): + return [] + entries: list[dict] = [] + with history_path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + entries.append(json.loads(line)) + except json.JSONDecodeError: + continue + return entries + + +def _print_stats(entries: list[dict]) -> None: + if not entries: + print("No history entries found.") + return + header = f"{'run_id':<10} {'ok':>4} {'mis':>4} {'err':>4} {'skip':>5} {'pass%':>7} {'median_s':>10} {'Δpass':>8} {'Δmedian':>9}" + print(header) + prev = None + for entry in entries: + pass_rate = entry.get("pass_rate") + median = entry.get("median_total_s") + delta_pass = None + delta_median = None + if prev: + if pass_rate is not None and prev.get("pass_rate") is not None: + delta_pass = pass_rate - prev.get("pass_rate") + if median is not None and prev.get("median_total_s") is not None: + delta_median = median - prev.get("median_total_s") + pr_display = f"{pass_rate*100:.1f}%" if pass_rate is not None else "n/a" + median_display = f"{median:.2f}" if median is not None else "n/a" + dp = f"{delta_pass:+.1f}%" if delta_pass is not None 
else "n/a" + dm = f"{delta_median:+.2f}" if delta_median is not None else "n/a" + print( + f"{entry.get('run_id',''):<10} " + f"{entry.get('ok',0):>4} {entry.get('mismatch',0):>4} {entry.get('error',0):>4} {entry.get('skipped',0):>5} " + f"{pr_display:>7} {median_display:>10} {dp:>8} {dm:>9}" + ) + prev = entry + + +def handle_stats(args) -> int: + history_path: Optional[Path] = args.history + if history_path is None: + if not args.data: + print("Provide --data or --history to locate history.jsonl", file=sys.stderr) + return 2 + history_path = args.data / ".runs" / "history.jsonl" + entries = _load_history(history_path) + if args.group_by == "config_hash": + grouped: dict[str, list[dict]] = {} + for e in entries: + key = e.get("config_hash") or "unknown" + grouped.setdefault(key, []).append(e) + for key, vals in grouped.items(): + print(f"\nconfig_hash={key}") + _print_stats(vals[-args.last :]) + else: + _print_stats(entries[-args.last :]) + return 0 + + __all__ = [ "handle_batch", "handle_case_open", diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index b1e8119..fc6eaa8 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -16,7 +16,7 @@ def ensure_repo_imports() -> None: ensure_repo_imports() -from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat # noqa: E402 +from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat, handle_stats # noqa: E402 from .data_gen import generate_and_save # noqa: E402 @@ -73,6 +73,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") + batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) @@ -95,6 +96,12 @@ def build_parser() -> argparse.ArgumentParser: "--artifacts-dir", type=Path, default=None, help="Base artifacts dir for latest lookup (default data/.runs)" ) + stats_p = sub.add_parser("stats", help="Show batch history stats") + stats_p.add_argument("--data", type=Path, default=None, help="Data dir to resolve default history path") + stats_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") + stats_p.add_argument("--last", type=int, default=10, help="How many recent runs to show") + stats_p.add_argument("--group-by", choices=["config_hash"], default=None, help="Group stats by config hash") + return parser @@ -118,6 +125,8 @@ def main() -> None: code = handle_case_open(args) else: code = 1 + elif args.command == "stats": + code = handle_stats(args) else: code = 0 raise SystemExit(code) From 557be567379271b00a885b4d54f735ff9cc48120 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:29:28 +0300 Subject: [PATCH 13/92] Add compare command with markdown and junit outputs --- examples/demo_qa/batch.py | 184 ++++++++++++++++++++++++++++++++++++++ examples/demo_qa/cli.py | 17 +++- 2 files changed, 200 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index b6e0ac5..8a812c1 100644 --- a/examples/demo_qa/batch.py 
+++ b/examples/demo_qa/batch.py @@ -40,6 +40,17 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path +def _median_duration(results: Mapping[str, RunResult]) -> Optional[float]: + durations = [res.duration_ms for res in results.values() if res.duration_ms is not None] + if not durations: + return None + durations.sort() + mid = len(durations) // 2 + if len(durations) % 2 == 1: + return durations[mid] / 1000 + return (durations[mid - 1] + durations[mid]) / 2000 + + def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: total = int(counts.get("total", 0) or 0) skipped = int(counts.get("skipped", 0) or 0) @@ -200,6 +211,166 @@ def handle_chat(args) -> int: return 0 +def _bad_statuses() -> set[str]: + return {"mismatch", "error", "failed"} + + +def _reason(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + if res.expected_check and res.expected_check.detail: + return res.expected_check.detail + return "" + + +def _artifact_links(res: RunResult) -> dict[str, str]: + links = {} + base = Path(res.artifacts_dir) + for name in ["plan.json", "answer.txt", "raw_synth.txt", "status.json"]: + path = base / name + if path.exists(): + links[name] = str(path) + return links + + +def compare_runs(base_path: Path, new_path: Path) -> dict[str, object]: + base = load_results(base_path) + new = load_results(new_path) + bad = _bad_statuses() + + new_fail: list[dict] = [] + fixed: list[dict] = [] + still_fail: list[dict] = [] + + for case_id, new_res in new.items(): + old_res = base.get(case_id) + if old_res is None: + continue + old_bad = old_res.status in bad + new_bad = new_res.status in bad + if not old_bad and new_bad: + new_fail.append( + { + "id": case_id, + "from": old_res.status, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + ) + elif old_bad and not new_bad: + fixed.append( + { + "id": case_id, + "from": old_res.status, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + ) + elif old_bad and new_bad: + still_fail.append( + { + "id": case_id, + "from": old_res.status, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + ) + + base_counts = summarize(base.values()) + new_counts = summarize(new.values()) + base_med = _median_duration(base) + new_med = _median_duration(new) + base_avg = base_counts.get("avg_total_s") + new_avg = new_counts.get("avg_total_s") + return { + "new_fail": new_fail, + "fixed": fixed, + "still_fail": still_fail, + "base_counts": base_counts, + "new_counts": new_counts, + "base_median": base_med, + "new_median": new_med, + "base_avg": base_avg, + "new_avg": new_avg, + } + + +def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str: + lines: list[str] = [] + base_counts = compare["base_counts"] # type: ignore[index] + new_counts = compare["new_counts"] # type: ignore[index] + lines.append("# Batch comparison report") + lines.append("") + lines.append("## Summary") + lines.append( + f"- Base OK: {base_counts.get('ok',0)}, Bad: {base_counts.get('mismatch',0)+base_counts.get('error',0)+base_counts.get('failed',0)}" + ) + lines.append( + f"- New OK: {new_counts.get('ok',0)}, Bad: {new_counts.get('mismatch',0)+new_counts.get('error',0)+new_counts.get('failed',0)}" + ) + base_med = compare.get("base_median") + new_med = compare.get("new_median") + if base_med is not None and new_med is not None: + lines.append(f"- Median total time: 
base {base_med:.2f}s → new {new_med:.2f}s (Δ {new_med - base_med:+.2f}s)") + lines.append("") + + def table(title: str, rows: list[dict]) -> None: + lines.append(f"## {title}") + if not rows: + lines.append("None") + lines.append("") + return + lines.append("| id | status | reason | artifacts |") + lines.append("|---|---|---|---|") + for row in rows: + artifacts = row.get("artifacts", {}) + links = ", ".join(f"[{k}]({v})" for k, v in artifacts.items()) + lines.append( + f"| {row['id']} | {row['from']} → {row['to']} | {row.get('reason','')} | {links or ''} |" + ) + lines.append("") + + table("New regressions", compare["new_fail"]) # type: ignore[arg-type] + table("Fixed", compare["fixed"]) # type: ignore[arg-type] + table("Still failing", compare["still_fail"]) # type: ignore[arg-type] + + content = "\n".join(lines) + if out_path: + out_path.write_text(content, encoding="utf-8") + return content + + +def write_junit(compare: dict[str, object], out_path: Path) -> None: + import xml.etree.ElementTree as ET + + suite = ET.Element("testsuite", name="demo_qa_compare") + bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] + fixed = compare["fixed"] # type: ignore[assignment] + cases = compare["new_counts"].get("total", 0) if isinstance(compare.get("new_counts"), dict) else 0 + suite.set("tests", str(cases)) + suite.set("failures", str(len(bad))) + suite.set("errors", "0") + + for row in bad: + tc = ET.SubElement(suite, "testcase", name=row["id"]) + msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" + failure = ET.SubElement(tc, "failure", message=msg) + artifacts = row.get("artifacts", {}) + if artifacts: + failure.text = "\n".join(f"{k}: {v}" for k, v in artifacts.items()) + + for row in fixed: + ET.SubElement(suite, "testcase", name=row["id"]) + + tree = ET.ElementTree(suite) + out_path.write_text(ET.tostring(suite, encoding="unicode"), encoding="utf-8") + + def _select_cases_for_rerun( cases: list[Case], baseline_for_filter: Optional[Mapping[str, RunResult]], @@ -595,6 +766,19 @@ def handle_stats(args) -> int: return 0 +def handle_compare(args) -> int: + if not args.base.exists() or not args.new.exists(): + print("Base or new results file not found.", file=sys.stderr) + return 2 + comparison = compare_runs(args.base, args.new) + report = render_markdown(comparison, args.out) + print(report) + if args.junit: + write_junit(comparison, args.junit) + print(f"JUnit written to {args.junit}") + return 0 + + __all__ = [ "handle_batch", "handle_case_open", diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index fc6eaa8..0c98f21 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -16,7 +16,14 @@ def ensure_repo_imports() -> None: ensure_repo_imports() -from .batch import handle_batch, handle_case_open, handle_case_run, handle_chat, handle_stats # noqa: E402 +from .batch import ( + handle_batch, + handle_case_open, + handle_case_run, + handle_chat, + handle_compare, + handle_stats, +) # noqa: E402 from .data_gen import generate_and_save # noqa: E402 @@ -102,6 +109,12 @@ def build_parser() -> argparse.ArgumentParser: stats_p.add_argument("--last", type=int, default=10, help="How many recent runs to show") stats_p.add_argument("--group-by", choices=["config_hash"], default=None, help="Group stats by config hash") + compare_p = sub.add_parser("compare", help="Compare two batch result files") + compare_p.add_argument("--base", type=Path, required=True, help="Path to baseline results.jsonl") + compare_p.add_argument("--new", type=Path, 
required=True, help="Path to new results.jsonl") + compare_p.add_argument("--out", type=Path, default=None, help="Path to markdown report to write") + compare_p.add_argument("--junit", type=Path, default=None, help="Path to junit xml output") + return parser @@ -127,6 +140,8 @@ def main() -> None: code = 1 elif args.command == "stats": code = handle_stats(args) + elif args.command == "compare": + code = handle_compare(args) else: code = 0 raise SystemExit(code) From db65975ae286d7a1a4d0056add467dc451ce9b55 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:33:07 +0300 Subject: [PATCH 14/92] Add tags to results and summary by tag --- examples/demo_qa/runner.py | 32 ++++++++++++++++++++++++++++++++ tests/test_demo_qa_runner.py | 10 ++++++++++ 2 files changed, 42 insertions(+) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 0a980b9..77f6815 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -54,6 +54,7 @@ class RunResult: details: Dict[str, object] | None artifacts_dir: str duration_ms: int + tags: list[str] answer: str | None = None error: str | None = None plan_path: str | None = None @@ -70,6 +71,7 @@ def to_json(self) -> Dict[str, object]: "details": self.details, "artifacts_dir": self.artifacts_dir, "duration_ms": self.duration_ms, + "tags": self.tags, "answer": self.answer, "error": self.error, "plan_path": self.plan_path, @@ -234,6 +236,7 @@ def _build_result( details=details, artifacts_dir=str(run_dir), duration_ms=duration_ms, + tags=list(case.tags), answer=artifacts.answer, error=artifacts.error, plan_path=plan_path, @@ -282,6 +285,7 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: checked_ok = 0 unchecked_no_assert = 0 plan_only = 0 + per_tag: Dict[str, Dict[str, object]] = {} for res in results: totals[res.status] = totals.get(res.status, 0) + 1 if res.duration_ms is not None: @@ -294,6 +298,12 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: unchecked_no_assert += 1 if res.status == "plan_only": plan_only += 1 + for tag in res.tags: + bucket = per_tag.setdefault( + tag, {"ok": 0, "mismatch": 0, "failed": 0, "error": 0, "skipped": 0, "unchecked": 0, "plan_only": 0} + ) + bucket[res.status] = bucket.get(res.status, 0) + 1 + bucket["total"] = bucket.get("total", 0) + 1 summary: Dict[str, object] = { "total": sum(totals.values()), @@ -301,6 +311,7 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: "checked_ok": checked_ok, "unchecked_no_assert": unchecked_no_assert, "plan_only": plan_only, + "summary_by_tag": per_tag, **totals, } if total_times: @@ -309,6 +320,26 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: else: summary["avg_total_s"] = None summary["median_total_s"] = None + + for tag, bucket in per_tag.items(): + times: List[float] = [] + # no per-tag timing collected; reuse overall average for simplicity + if times: + bucket["avg_total_s"] = statistics.fmean(times) + bucket["median_total_s"] = statistics.median(times) + else: + bucket["avg_total_s"] = None + bucket["median_total_s"] = None + total = bucket.get("total", 0) + checked_total_tag = (bucket.get("ok", 0) or 0) + (bucket.get("mismatch", 0) or 0) + ( + bucket.get("failed", 0) or 0 + ) + bucket["checked_total"] = checked_total_tag + non_skipped = total - (bucket.get("skipped", 0) or 0) + if non_skipped > 0: + bucket["pass_rate"] = (bucket.get("ok", 0) or 0) / non_skipped + else: + bucket["pass_rate"] = None 
return summary @@ -414,6 +445,7 @@ def _run_result_from_payload(payload: Mapping[str, object]) -> RunResult: details=details, artifacts_dir=artifacts_dir, duration_ms=duration_ms, + tags=list(payload.get("tags", []) or []), answer=payload.get("answer"), # type: ignore[arg-type] error=payload.get("error"), # type: ignore[arg-type] plan_path=payload.get("plan_path"), # type: ignore[arg-type] diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 337c6d2..ad14b00 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -37,6 +37,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok", duration_ms=10, + tags=[], ), "err_to_ok": RunResult( id="err_to_ok", @@ -47,6 +48,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/err", duration_ms=10, + tags=[], ), "checked_to_unchecked": RunResult( id="checked_to_unchecked", @@ -57,6 +59,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok2", duration_ms=10, + tags=[], ), } @@ -70,6 +73,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok", duration_ms=10, + tags=[], ), "err_to_ok": RunResult( id="err_to_ok", @@ -80,6 +84,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/err", duration_ms=10, + tags=[], ), "checked_to_unchecked": RunResult( id="checked_to_unchecked", @@ -90,6 +95,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/ok2", duration_ms=10, + tags=[], ), "new_ok": RunResult( id="new_ok", @@ -100,6 +106,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: details=None, artifacts_dir="/tmp/new", duration_ms=10, + tags=[], ), } @@ -122,6 +129,7 @@ def test_summarize_counts_checked_and_unchecked() -> None: details=None, artifacts_dir="/a", duration_ms=10, + tags=[], ), RunResult( id="c2", @@ -132,6 +140,7 @@ def test_summarize_counts_checked_and_unchecked() -> None: details=None, artifacts_dir="/b", duration_ms=5, + tags=[], ), RunResult( id="c3", @@ -142,6 +151,7 @@ def test_summarize_counts_checked_and_unchecked() -> None: details=None, artifacts_dir="/c", duration_ms=7, + tags=[], ), ] From 549a1e077729877c112924704b955b09c9d3be7f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:38:49 +0300 Subject: [PATCH 15/92] Add tag filters, events logging, and comparison outputs --- examples/demo_qa/batch.py | 60 ++++++++++++++++++++++-- examples/demo_qa/chat_repl.py | 38 +++++++++++++--- examples/demo_qa/cli.py | 6 +++ examples/demo_qa/runner.py | 86 ++++++++++++++++++++++++++++++++--- 4 files changed, 172 insertions(+), 18 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 8a812c1..cae1ebc 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -15,6 +15,7 @@ from .provider_factory import build_provider from .runner import ( Case, + EventLogger, RunResult, build_agent, compare_results, @@ -87,6 +88,24 @@ def _hash_file(path: Path) -> str: return hashlib.sha256(data).hexdigest() +def _split_csv(value: Optional[str]) -> set[str] | None: + if not value: + return None + return {item.strip() for item in value.split(",") if item.strip()} + + +def _load_ids(path: Optional[Path]) -> set[str] | None: + 
if path is None: + return None + ids = set() + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + ids.add(line) + return ids + + def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: llm_settings = settings.llm return { @@ -377,9 +396,25 @@ def _select_cases_for_rerun( *, require_assert: bool, fail_on: str, + include_tags: set[str] | None, + exclude_tags: set[str] | None, + include_ids: set[str] | None, + exclude_ids: set[str] | None, ) -> list[Case]: + filtered: list[Case] = [] + for case in cases: + tags = set(case.tags) + if include_tags and not tags.intersection(include_tags): + continue + if exclude_tags and tags.intersection(exclude_tags): + continue + if include_ids and case.id not in include_ids: + continue + if exclude_ids and case.id in exclude_ids: + continue + filtered.append(case) if not baseline_for_filter: - return cases + return filtered bad_statuses = {"mismatch", "failed", "error"} if fail_on == "error": bad_statuses = {"error"} @@ -397,7 +432,7 @@ def _select_cases_for_rerun( if require_assert: bad_statuses |= {"unchecked", "plan_only"} target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} - return [case for case in cases if case.id in target_ids] + return [case for case in filtered if case.id in target_ids] def handle_batch(args) -> int: @@ -454,7 +489,14 @@ def handle_batch(args) -> int: return 2 cases = _select_cases_for_rerun( - cases, baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on + cases, + baseline_for_filter, + require_assert=args.require_assert, + fail_on=args.fail_on, + include_tags=_split_csv(args.include_tags), + exclude_tags=_split_csv(args.exclude_tags), + include_ids=_load_ids(args.include_ids), + exclude_ids=_load_ids(args.exclude_ids), ) timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") @@ -478,11 +520,17 @@ def handle_batch(args) -> int: provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) llm = build_llm(settings) runner = build_agent(llm, provider) + events_path = None + if args.events == "on": + events_path = args.events_file or (run_folder / "events.jsonl") + event_logger = EventLogger(events_path, run_id) if events_path else None + if event_logger: + event_logger.emit({"type": "run_started", "cases": len(cases), "run_dir": str(run_folder)}) results: list[RunResult] = [] failures = 0 for case in cases: - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only) + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) results.append(result) if not args.quiet: print(format_status_line(result)) @@ -525,6 +573,10 @@ def handle_batch(args) -> int: summary["diff"] = diff_block summary_path = write_summary(results_path, summary) + summary_by_tag = summary.get("summary_by_tag") + if summary_by_tag: + summary_by_tag_path = summary_path.with_name("summary_by_tag.json") + summary_by_tag_path.write_text(json.dumps(summary_by_tag, ensure_ascii=False, indent=2), encoding="utf-8") latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 678a78b..27ca76a 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -10,7 +10,16 @@ import json from .provider_factory import build_provider -from .runner import RunArtifacts, build_agent, save_artifacts +from 
.runner import Case, EventLogger, RunArtifacts, build_agent, run_one, save_artifacts + + +def _load_json(path: Path) -> object | None: + if not path.exists(): + return None + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception: + return None def _maybe_add_history(entry: str) -> None: @@ -94,25 +103,40 @@ def start_repl( run_id = uuid.uuid4().hex[:8] timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_dir = runs_root / f"{timestamp}_{run_id}" + events_path = run_dir / "events.jsonl" + event_logger = EventLogger(events_path, run_id) + print(f"Events: {events_path}") artifacts: RunArtifacts | None = None try: - artifacts = runner.run_question(line, run_id, run_dir) + case = Case(id=run_id, question=line, tags=[]) + result = run_one(case, runner, run_dir, plan_only=False, event_logger=event_logger) + plan_obj = _load_json(Path(result.artifacts_dir) / "plan.json") + ctx_obj = _load_json(Path(result.artifacts_dir) / "context.json") or {} + artifacts = RunArtifacts( + run_id=run_id, + run_dir=Path(result.artifacts_dir), + question=line, + plan=plan_obj if isinstance(plan_obj, dict) else None, + context=ctx_obj if isinstance(ctx_obj, dict) else None, + answer=result.answer, + raw_synth=None, + error=result.error, + ) last_artifacts = artifacts - save_artifacts(artifacts) if plan_debug_mode in {"on", "once"} and artifacts.plan: print("--- PLAN ---") print(json.dumps(artifacts.plan, ensure_ascii=False, indent=2)) - print(artifacts.answer or "") + print(result.answer or "") except Exception as exc: # pragma: no cover - REPL resilience error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts save_artifacts(error_artifacts) - print(f"Error during run {run_id}: {exc}", file=sys.stderr) + print(f\"Error during run {run_id}: {exc}\", file=sys.stderr) finally: - if plan_debug_mode == "once": - plan_debug_mode = "off" + if plan_debug_mode == \"once\": + plan_debug_mode = \"off\" __all__ = ["start_repl"] diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 0c98f21..1626908 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -81,6 +81,12 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") + batch_p.add_argument("--include-tags", type=str, default=None, help="Comma-separated tags to include") + batch_p.add_argument("--exclude-tags", type=str, default=None, help="Comma-separated tags to exclude") + batch_p.add_argument("--include-ids", type=Path, default=None, help="Path to file with ids to include (one per line)") + batch_p.add_argument("--exclude-ids", type=Path, default=None, help="Path to file with ids to exclude (one per line)") + batch_p.add_argument("--events", choices=["on", "off"], default="on", help="Enable events.jsonl emission") + batch_p.add_argument("--events-file", type=Path, default=None, help="Override events file path") case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 77f6815..bb2e70c 100644 --- 
a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime import json import re import statistics @@ -120,29 +121,49 @@ def saver(feature_name: str, parsed: object) -> None: task_profile=task_profile, ) - def run_question(self, question: str, run_id: str, run_dir: Path, *, plan_only: bool = False) -> RunArtifacts: + def run_question( + self, + case: Case, + run_id: str, + run_dir: Path, + *, + plan_only: bool = False, + event_logger: EventLogger | None = None, + ) -> RunArtifacts: set_run_id(run_id) - artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=question, plan_only=plan_only) + artifacts = RunArtifacts(run_id=run_id, run_dir=run_dir, question=case.question, plan_only=plan_only) started = time.perf_counter() try: + if event_logger: + event_logger.emit({"type": "plan_started", "case_id": case.id}) plan_started = time.perf_counter() - plan = self.agent._plan(question) # type: ignore[attr-defined] + plan = self.agent._plan(case.question) # type: ignore[attr-defined] artifacts.timings.plan_s = time.perf_counter() - plan_started artifacts.plan = plan.model_dump() + if event_logger: + event_logger.emit({"type": "plan_built", "case_id": case.id, "plan_path": str(run_dir / "plan.json")}) if not plan_only: + if event_logger: + event_logger.emit({"type": "fetch_started", "case_id": case.id}) fetch_started = time.perf_counter() - ctx = self.agent._fetch(question, plan) # type: ignore[attr-defined] + ctx = self.agent._fetch(case.question, plan) # type: ignore[attr-defined] artifacts.timings.fetch_s = time.perf_counter() - fetch_started artifacts.context = {k: v.text for k, v in (ctx or {}).items()} if ctx else {} + if event_logger: + event_logger.emit({"type": "fetch_finished", "case_id": case.id}) + if event_logger: + event_logger.emit({"type": "synth_started", "case_id": case.id}) synth_started = time.perf_counter() - draft = self.agent._synthesize(question, ctx, plan) # type: ignore[attr-defined] + draft = self.agent._synthesize(case.question, ctx, plan) # type: ignore[attr-defined] artifacts.timings.synth_s = time.perf_counter() - synth_started artifacts.raw_synth = str(draft) parsed = self.agent.domain_parser(draft) artifacts.answer = str(parsed) + if event_logger: + event_logger.emit({"type": "synth_finished", "case_id": case.id}) except Exception as exc: # pragma: no cover - demo fallback artifacts.error = str(exc) finally: @@ -245,9 +266,19 @@ def _build_result( ) -def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: bool = False) -> RunResult: +def run_one( + case: Case, + runner: AgentRunner, + artifacts_root: Path, + *, + plan_only: bool = False, + event_logger: EventLogger | None = None, +) -> RunResult: run_id = uuid.uuid4().hex[:8] run_dir = artifacts_root / f"{case.id}_{run_id}" + case_logger = event_logger.for_case(case.id, run_dir / "events.jsonl") if event_logger else None + if case_logger: + case_logger.emit({"type": "case_started", "case_id": case.id, "run_dir": str(run_dir)}) if case.skip: run_dir.mkdir(parents=True, exist_ok=True) _save_text(run_dir / "skipped.txt", "Skipped by request") @@ -267,14 +298,36 @@ def run_one(case: Case, runner: AgentRunner, artifacts_root: Path, *, plan_only: expected_check=None, ) save_status(result) + if case_logger: + case_logger.emit({"type": "case_finished", "case_id": case.id, "status": "skipped"}) return result - artifacts = runner.run_question(case.question, run_id, run_dir, plan_only=plan_only) + artifacts = 
runner.run_question(case, run_id, run_dir, plan_only=plan_only, event_logger=case_logger) save_artifacts(artifacts) expected_check = None if plan_only else _match_expected(case, artifacts.answer) result = _build_result(case, artifacts, run_dir, expected_check) save_status(result) + if case_logger: + if result.status == "error": + case_logger.emit( + { + "type": "case_failed", + "case_id": case.id, + "status": result.status, + "reason": result.reason, + "artifacts_dir": result.artifacts_dir, + } + ) + case_logger.emit( + { + "type": "case_finished", + "case_id": case.id, + "status": result.status, + "duration_ms": result.duration_ms, + "artifacts_dir": result.artifacts_dir, + } + ) return result @@ -552,6 +605,7 @@ def format_status_line(result: RunResult) -> str: "ExpectedCheck", "RunArtifacts", "RunResult", + "EventLogger", "build_agent", "compare_results", "format_status_line", @@ -563,3 +617,21 @@ def format_status_line(result: RunResult) -> str: "summarize", "_match_expected", ] +class EventLogger: + def __init__(self, path: Path | None, run_id: str): + self.path = path + self.run_id = run_id + if path: + path.parent.mkdir(parents=True, exist_ok=True) + + def emit(self, event: Dict[str, object]) -> None: + if not self.path: + return + payload = {"timestamp": datetime.datetime.utcnow().isoformat() + "Z", "run_id": self.run_id, **event} + with self.path.open("a", encoding="utf-8") as f: + f.write(json.dumps(payload, ensure_ascii=False) + "\n") + + def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": + if path is None: + return self + return EventLogger(path, self.run_id) From 44c1328cab3afe4db4c3acb51aa0492f91f00a8c Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 20:53:08 +0300 Subject: [PATCH 16/92] Include tags on skipped results to avoid constructor errors --- examples/demo_qa/runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index bb2e70c..5bd7d17 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -291,6 +291,7 @@ def run_one( details=None, artifacts_dir=str(run_dir), duration_ms=0, + tags=list(case.tags), answer=None, error=None, plan_path=None, From 56af0eaae63d4016402d5d79a4fa190ee42effae Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:04:24 +0300 Subject: [PATCH 17/92] Persist summary_by_tag in batch outputs --- examples/demo_qa/batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index cae1ebc..7070ded 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -563,6 +563,7 @@ def handle_batch(args) -> int: "ended_at": ended_at.isoformat() + "Z", "duration_ms": duration_ms, "counts": counts, + "summary_by_tag": counts.get("summary_by_tag"), "exit_code": exit_code, "config_fingerprint": build_config_fingerprint(settings, args.cases), "results_path": str(results_path), From 806bd20f527d718d7df0b99b38fa75dbc8a4054f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:09:03 +0300 Subject: [PATCH 18/92] Exclude .runs/.cache from data fingerprint --- examples/demo_qa/batch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 7070ded..434fe89 100644 --- 
a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -120,10 +120,13 @@ def _fingerprint_dir(data_dir: Path) -> Mapping[str, object]: files: list[dict] = [] for path in sorted(data_dir.rglob("*")): if path.is_file(): + rel = path.relative_to(data_dir) + if rel.parts and rel.parts[0] in {".runs", ".cache"}: + continue stat = path.stat() files.append( { - "path": str(path.relative_to(data_dir)), + "path": str(rel), "size": stat.st_size, "mtime": stat.st_mtime, } From ec567d6ff0fa68811f4b145949d17b39b7384cc2 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:15:20 +0300 Subject: [PATCH 19/92] Validate regexes and fix logging/reporting gaps --- README_demo_qa.md | 2 +- examples/demo_qa/batch.py | 10 ++++++++-- examples/demo_qa/chat_repl.py | 6 +++--- examples/demo_qa/runner.py | 5 +++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index 4036a23..ebfd92b 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -64,7 +64,7 @@ python -m examples.demo_qa.cli batch \ --out results.jsonl ``` -* Артефакты по умолчанию пишутся в `/.runs/batch_/id_runid/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). +* Артефакты по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). * `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. * Флаги `--fail-on (error|mismatch/unchecked/any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). * Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. 
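As an aside to the flags above: this patch also makes `load_cases` compile `expected_regex` while reading the file, so a malformed pattern aborts the run before any model calls, raising a `ValueError` that names the offending line. Below is a minimal sketch of producing such a `cases.jsonl` from Python — the field names (`id`, `question`, `expected_regex`, `expected_contains`, `skip`) follow the README, while the concrete ids, questions, and expected values are invented for illustration:

```python
# Minimal sketch of generating cases.jsonl for the batch runner.
# Field names follow README_demo_qa.md; the ids, questions, and
# expectations below are made-up examples, not real demo cases.
import json
from pathlib import Path

cases = [
    {"id": "total_orders", "question": "How many orders are in the dataset?", "expected_regex": r"^\d+$"},
    {"id": "top_region", "question": "Which region has the most customers?", "expected_contains": "North"},
    {"id": "draft_case", "question": "Placeholder question", "skip": True},
]

lines = [json.dumps(case, ensure_ascii=False) for case in cases]
Path("cases.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")
```

A file written this way passes the new validation; a case with, say, `"expected_regex": "("` would be rejected at load time rather than surfacing as a runtime error mid-batch.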
diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 434fe89..cae75ec 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -313,6 +313,7 @@ def compare_runs(base_path: Path, new_path: Path) -> dict[str, object]: "new_fail": new_fail, "fixed": fixed, "still_fail": still_fail, + "all_ids": list(new.keys()), "base_counts": base_counts, "new_counts": new_counts, "base_median": base_med, @@ -373,8 +374,9 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: suite = ET.Element("testsuite", name="demo_qa_compare") bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] fixed = compare["fixed"] # type: ignore[assignment] - cases = compare["new_counts"].get("total", 0) if isinstance(compare.get("new_counts"), dict) else 0 - suite.set("tests", str(cases)) + all_ids = set(compare.get("all_ids", []) or []) # type: ignore[arg-type] + cases_total = len(all_ids) + suite.set("tests", str(cases_total)) suite.set("failures", str(len(bad))) suite.set("errors", "0") @@ -389,6 +391,10 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: for row in fixed: ET.SubElement(suite, "testcase", name=row["id"]) + ok_ids = all_ids - {row["id"] for row in bad} - {row["id"] for row in fixed} + for cid in ok_ids: + ET.SubElement(suite, "testcase", name=cid) + tree = ET.ElementTree(suite) out_path.write_text(ET.tostring(suite, encoding="unicode"), encoding="utf-8") diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 27ca76a..65bb0f7 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -133,10 +133,10 @@ def start_repl( error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts save_artifacts(error_artifacts) - print(f\"Error during run {run_id}: {exc}\", file=sys.stderr) + print(f"Error during run {run_id}: {exc}", file=sys.stderr) finally: - if plan_debug_mode == \"once\": - plan_debug_mode = \"off\" + if plan_debug_mode == "once": + plan_debug_mode = "off" __all__ = ["start_repl"] diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 5bd7d17..22bec9a 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -427,6 +427,11 @@ def load_cases(path: Path) -> List[Case]: ]: if val is not None and str(val).strip() == "": raise ValueError(f"{field_name} must not be empty on line {lineno}") + if expected_regex is not None: + try: + re.compile(expected_regex) + except re.error as exc: + raise ValueError(f"Invalid expected_regex on line {lineno}: {exc}") from exc case = Case( id=case_id, question=str(payload["question"]), From e47813eb1bc6e824d6f22c97fe1b375c55cc6a14 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:31:36 +0300 Subject: [PATCH 20/92] Write summary and latest_results for case runs --- examples/demo_qa/batch.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index cae75ec..0176358 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -733,12 +733,24 @@ def handle_case_run(args) -> int: result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) write_results(results_path, [result]) - save_path = run_folder.parent / "latest.txt" - save_path.parent.mkdir(parents=True, exist_ok=True) - save_path.write_text(str(run_folder), encoding="utf-8") + counts = summarize([result]) + 
summary = { + "run_id": run_folder.name, + "timestamp": timestamp + "Z", + "counts": counts, + "results_path": str(results_path), + "fail_on": "bad", + "require_assert": False, + } + summary_path = write_summary(results_path, summary) + save_dir = run_folder.parent + save_dir.mkdir(parents=True, exist_ok=True) + (save_dir / "latest.txt").write_text(str(run_folder), encoding="utf-8") + (save_dir / "latest_results.txt").write_text(str(results_path), encoding="utf-8") print(format_status_line(result)) print(f"Artifacts: {result.artifacts_dir}") + print(f"Summary: {summary_path}") return 0 From b982b869fa38038289e4536a79ed2d3ae022c283 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:41:43 +0300 Subject: [PATCH 21/92] Fix pass-rate delta display in stats --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 0176358..4ebe878 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -809,7 +809,7 @@ def _print_stats(entries: list[dict]) -> None: delta_median = median - prev.get("median_total_s") pr_display = f"{pass_rate*100:.1f}%" if pass_rate is not None else "n/a" median_display = f"{median:.2f}" if median is not None else "n/a" - dp = f"{delta_pass:+.1f}%" if delta_pass is not None else "n/a" + dp = f"{delta_pass*100:+.1f}pp" if delta_pass is not None else "n/a" dm = f"{delta_median:+.2f}" if delta_median is not None else "n/a" print( f"{entry.get('run_id',''):<10} " From 87edddf7ee2862f0f691dd2224e44e89dac6e66e Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 21:58:56 +0300 Subject: [PATCH 22/92] Provide bundled pydantic settings shim --- src/pydantic_settings/__init__.py | 98 +++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 src/pydantic_settings/__init__.py diff --git a/src/pydantic_settings/__init__.py b/src/pydantic_settings/__init__.py new file mode 100644 index 0000000..bf9fdae --- /dev/null +++ b/src/pydantic_settings/__init__.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import os +import tomllib +from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping + +from pydantic import BaseModel + + +def SettingsConfigDict(**kwargs: Any) -> Dict[str, Any]: + return dict(**kwargs) + + +def _deep_update(base: Dict[str, Any], updates: Mapping[str, Any]) -> Dict[str, Any]: + for key, value in updates.items(): + if isinstance(value, Mapping) and isinstance(base.get(key), dict): + base[key] = _deep_update(base[key], value) + else: + base[key] = value + return base + + +class TomlConfigSettingsSource: + def __init__(self, settings_cls: type[BaseModel], path: os.PathLike | str | None): + self._path = path + + def __call__(self) -> Dict[str, Any]: + if not self._path: + return {} + try: + with open(self._path, "rb") as toml_file: + return tomllib.load(toml_file) + except FileNotFoundError: + return {} + + +class BaseSettings(BaseModel): + model_config: ClassVar[SettingsConfigDict] = {} + + def __init__(self, **values: Any) -> None: + sources = self.settings_customise_sources( + self.__class__, + self._build_init_settings(values), + self._build_env_settings(), + self._build_dotenv_settings(), + self._build_file_secret_settings(), + ) + merged: Dict[str, Any] = {} + for source in reversed(tuple(sources)): + merged = _deep_update(merged, source() or {}) + 
super().__init__(**merged) + + @classmethod + def settings_customise_sources( + cls, + settings_cls: type[BaseModel], + init_settings: Callable[[], Mapping[str, Any]], + env_settings: Callable[[], Mapping[str, Any]], + dotenv_settings: Callable[[], Mapping[str, Any]], + file_secret_settings: Callable[[], Mapping[str, Any]], + ) -> Iterable[Callable[[], Mapping[str, Any]]]: + return (init_settings, env_settings, dotenv_settings, file_secret_settings) + + @staticmethod + def _build_init_settings(values: Mapping[str, Any]) -> Callable[[], Mapping[str, Any]]: + return lambda: dict(values) + + @classmethod + def _build_env_settings(cls) -> Callable[[], Mapping[str, Any]]: + prefix = cls.model_config.get("env_prefix", "") or "" + delimiter = cls.model_config.get("env_nested_delimiter", "__") or "__" + + def source() -> Dict[str, Any]: + settings: Dict[str, Any] = {} + for key, value in os.environ.items(): + if not key.startswith(prefix): + continue + raw_key = key[len(prefix) :] + parts = raw_key.split(delimiter) if delimiter else [raw_key] + cls._insert_nested(settings, [part.lower() for part in parts], value) + return settings + + return source + + @classmethod + def _build_dotenv_settings(cls) -> Callable[[], Mapping[str, Any]]: + return lambda: {} + + @classmethod + def _build_file_secret_settings(cls) -> Callable[[], Mapping[str, Any]]: + return lambda: {} + + @staticmethod + def _insert_nested(target: Dict[str, Any], parts: list[str], value: Any) -> None: + current = target + for part in parts[:-1]: + current = current.setdefault(part, {}) + current[parts[-1]] = value From 69039cd6e0b8983a74203e254703cc85203b8fdb Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 22:14:41 +0300 Subject: [PATCH 23/92] Centralize failure status handling --- examples/demo_qa/batch.py | 39 ++++++++++++------------------------- tests/test_demo_qa_batch.py | 19 ++++++++++++++++++ 2 files changed, 31 insertions(+), 27 deletions(-) create mode 100644 tests/test_demo_qa_batch.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 4ebe878..31a428f 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -61,18 +61,12 @@ def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: return (counts.get("ok", 0) or 0) / denom -def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: +def bad_statuses(fail_on: str, require_assert: bool) -> set[str]: unchecked = {"unchecked", "plan_only"} bad = {"error", "failed", "mismatch"} if fail_on == "error": bad = {"error"} - elif fail_on == "mismatch": - bad = {"error", "failed", "mismatch"} - elif fail_on == "unchecked": - bad |= unchecked - elif fail_on == "bad": - bad = {"error", "failed", "mismatch"} - elif fail_on == "any": + elif fail_on in {"unchecked", "any"}: bad |= unchecked elif fail_on == "skipped": bad |= {"skipped"} @@ -80,7 +74,11 @@ def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: if require_assert: bad |= unchecked - return status in bad + return bad + + +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + return status in bad_statuses(fail_on, require_assert) def _hash_file(path: Path) -> str: @@ -234,7 +232,7 @@ def handle_chat(args) -> int: def _bad_statuses() -> set[str]: - return {"mismatch", "error", "failed"} + return bad_statuses("bad", False) def _reason(res: RunResult) -> str: @@ -424,23 +422,9 @@ def _select_cases_for_rerun( filtered.append(case) if not baseline_for_filter: 
return filtered - bad_statuses = {"mismatch", "failed", "error"} - if fail_on == "error": - bad_statuses = {"error"} - elif fail_on == "mismatch": - bad_statuses = {"mismatch", "failed", "error"} - elif fail_on == "unchecked": - bad_statuses |= {"unchecked", "plan_only"} - elif fail_on == "bad": - bad_statuses = {"mismatch", "failed", "error"} - elif fail_on == "any": - bad_statuses |= {"unchecked", "plan_only"} - elif fail_on == "skipped": - bad_statuses |= {"skipped"} - - if require_assert: - bad_statuses |= {"unchecked", "plan_only"} - target_ids = {case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses} + target_ids = { + case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses(fail_on, require_assert) + } return [case for case in filtered if case.id in target_ids] @@ -858,6 +842,7 @@ def handle_compare(args) -> int: "handle_case_open", "handle_case_run", "handle_chat", + "bad_statuses", "is_failure", "write_results", "write_summary", diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py new file mode 100644 index 0000000..f52da8a --- /dev/null +++ b/tests/test_demo_qa_batch.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import itertools + +import pytest + +from examples.demo_qa.batch import bad_statuses, is_failure + + +@pytest.mark.parametrize( + "fail_on,require_assert", + itertools.product(["bad", "error", "mismatch", "unchecked", "any", "skipped"], [False, True]), +) +def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> None: + statuses = ["ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] + bad = bad_statuses(fail_on, require_assert) + assert bad # sanity check + for status in statuses: + assert is_failure(status, fail_on, require_assert) == (status in bad) From 3f214d70fcd95e0ec8adfcad1040d4ff743746ac Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 22:58:56 +0300 Subject: [PATCH 24/92] Allow pytest to import examples package --- pytest.ini | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytest.ini b/pytest.ini index 1abd662..1e1d0bf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,7 +2,9 @@ [pytest] minversion = 7.0 -pythonpath = src +pythonpath = + src + . testpaths = tests @@ -57,4 +59,4 @@ filterwarnings = error::DeprecationWarning:fetchgraph.* error::PendingDeprecationWarning:fetchgraph.* ignore::DeprecationWarning - ignore::PendingDeprecationWarning \ No newline at end of file + ignore::PendingDeprecationWarning From 62e749c5abfc3db62e3bbc2a6d5c611f83289eec Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:16:11 +0300 Subject: [PATCH 25/92] Trim fail_on choices to supported modes --- README_demo_qa.md | 2 +- examples/demo_qa/batch.py | 162 +++++------------------- examples/demo_qa/cli.py | 9 +- examples/demo_qa/runner.py | 231 +++++++++++++++++++++++++---------- tests/test_demo_qa_batch.py | 2 +- tests/test_demo_qa_runner.py | 46 +++---- 6 files changed, 225 insertions(+), 227 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index ebfd92b..86f9e67 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -66,7 +66,7 @@ python -m examples.demo_qa.cli batch \ * Артефакты по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). 
* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. -* Флаги `--fail-on (error|mismatch/unchecked/any)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). +* Флаги `--fail-on (error|bad|unchecked|any|skipped)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). * Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. * Быстрый фокус на упавших: `--only-failed` возьмёт `runs/latest/results.jsonl`, `--show-artifacts` печатает пути, репро-команды выводятся для каждого FAIL. * Команды уровня кейса: `demo_qa case run --cases ...` и `demo_qa case open --run runs/latest` для быстрого воспроизведения. diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 31a428f..4a1d392 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -17,9 +17,11 @@ Case, EventLogger, RunResult, + bad_statuses, build_agent, - compare_results, + diff_runs, format_status_line, + is_failure, load_cases, load_results, run_one, @@ -41,17 +43,6 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path -def _median_duration(results: Mapping[str, RunResult]) -> Optional[float]: - durations = [res.duration_ms for res in results.values() if res.duration_ms is not None] - if not durations: - return None - durations.sort() - mid = len(durations) // 2 - if len(durations) % 2 == 1: - return durations[mid] / 1000 - return (durations[mid - 1] + durations[mid]) / 2000 - - def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: total = int(counts.get("total", 0) or 0) skipped = int(counts.get("skipped", 0) or 0) @@ -61,26 +52,6 @@ def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: return (counts.get("ok", 0) or 0) / denom -def bad_statuses(fail_on: str, require_assert: bool) -> set[str]: - unchecked = {"unchecked", "plan_only"} - bad = {"error", "failed", "mismatch"} - if fail_on == "error": - bad = {"error"} - elif fail_on in {"unchecked", "any"}: - bad |= unchecked - elif fail_on == "skipped": - bad |= {"skipped"} - - if require_assert: - bad |= unchecked - - return bad - - -def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: - return status in bad_statuses(fail_on, require_assert) - - def _hash_file(path: Path) -> str: data = path.read_bytes() return hashlib.sha256(data).hexdigest() @@ -231,94 +202,10 @@ def handle_chat(args) -> int: return 0 -def _bad_statuses() -> set[str]: - return bad_statuses("bad", False) - - -def _reason(res: RunResult) -> str: - if res.reason: - return res.reason - if res.error: - return res.error - if res.expected_check and res.expected_check.detail: - return res.expected_check.detail - return "" - - -def _artifact_links(res: RunResult) -> dict[str, str]: - links = {} - base = Path(res.artifacts_dir) - for name in ["plan.json", "answer.txt", "raw_synth.txt", "status.json"]: - path = base / name - if path.exists(): - links[name] = str(path) - return links - - -def compare_runs(base_path: Path, new_path: Path) -> dict[str, object]: +def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_assert: bool) -> dict[str, object]: base = load_results(base_path) new = 
load_results(new_path) - bad = _bad_statuses() - - new_fail: list[dict] = [] - fixed: list[dict] = [] - still_fail: list[dict] = [] - - for case_id, new_res in new.items(): - old_res = base.get(case_id) - if old_res is None: - continue - old_bad = old_res.status in bad - new_bad = new_res.status in bad - if not old_bad and new_bad: - new_fail.append( - { - "id": case_id, - "from": old_res.status, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), - } - ) - elif old_bad and not new_bad: - fixed.append( - { - "id": case_id, - "from": old_res.status, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), - } - ) - elif old_bad and new_bad: - still_fail.append( - { - "id": case_id, - "from": old_res.status, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), - } - ) - - base_counts = summarize(base.values()) - new_counts = summarize(new.values()) - base_med = _median_duration(base) - new_med = _median_duration(new) - base_avg = base_counts.get("avg_total_s") - new_avg = new_counts.get("avg_total_s") - return { - "new_fail": new_fail, - "fixed": fixed, - "still_fail": still_fail, - "all_ids": list(new.keys()), - "base_counts": base_counts, - "new_counts": new_counts, - "base_median": base_med, - "new_median": new_med, - "base_avg": base_avg, - "new_avg": new_avg, - } + return diff_runs(base.values(), new.values(), fail_on=fail_on, require_assert=require_assert) def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str: @@ -348,7 +235,7 @@ def table(title: str, rows: list[dict]) -> None: return lines.append("| id | status | reason | artifacts |") lines.append("|---|---|---|---|") - for row in rows: + for row in sorted(rows, key=lambda r: r.get("id", "")): artifacts = row.get("artifacts", {}) links = ", ".join(f"[{k}]({v})" for k, v in artifacts.items()) lines.append( @@ -372,13 +259,14 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: suite = ET.Element("testsuite", name="demo_qa_compare") bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] fixed = compare["fixed"] # type: ignore[assignment] - all_ids = set(compare.get("all_ids", []) or []) # type: ignore[arg-type] + all_ids_list = list(compare.get("all_ids", []) or []) # type: ignore[arg-type] + all_ids = sorted(all_ids_list) cases_total = len(all_ids) suite.set("tests", str(cases_total)) suite.set("failures", str(len(bad))) suite.set("errors", "0") - for row in bad: + for row in sorted(bad, key=lambda r: r.get("id", "")): tc = ET.SubElement(suite, "testcase", name=row["id"]) msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) @@ -386,10 +274,12 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in artifacts.items()) - for row in fixed: + for row in sorted(fixed, key=lambda r: r.get("id", "")): ET.SubElement(suite, "testcase", name=row["id"]) - ok_ids = all_ids - {row["id"] for row in bad} - {row["id"] for row in fixed} + bad_ids = {row["id"] for row in bad} + fixed_ids = {row["id"] for row in fixed} + ok_ids = [cid for cid in all_ids if cid not in bad_ids and cid not in fixed_ids] for cid in ok_ids: ET.SubElement(suite, "testcase", name=cid) @@ -535,12 +425,16 @@ def handle_batch(args) -> int: write_results(results_path, results) counts = summarize(results) - results_by_id = {r.id: r for r in results} 
diff_block: dict | None = None baseline_path: Path | None = None if baseline_for_compare: baseline_path = args.compare_to or baseline_filter_path - diff = compare_results(baseline_for_compare, results_by_id, require_assert=args.require_assert) + diff = diff_runs( + baseline_for_compare.values(), + results, + fail_on=args.fail_on, + require_assert=args.require_assert, + ) if baseline_path: diff["baseline_path"] = str(baseline_path) diff_block = diff @@ -648,20 +542,20 @@ def handle_batch(args) -> int: print(summary_line) if diff_block: print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" + f"Δ vs baseline: +{len(diff_block.get('fixed', []))} fixed, " + f"-{len(diff_block.get('new_fail', []))} regressions, " + f"{len(diff_block.get('still_fail', []))} still failing, " + f"{len(diff_block.get('new_cases', []))} new cases" ) return exit_code print(summary_line) if diff_block: print( - f"Δ vs baseline: +{len(diff_block.get('new_ok', []))} green, " - f"-{len(diff_block.get('regressed', []))} regressions, " - f"{len(diff_block.get('still_bad', []))} still failing, " - f"{len(diff_block.get('new_unchecked', []))} new unchecked" + f"Δ vs baseline: +{len(diff_block.get('fixed', []))} fixed, " + f"-{len(diff_block.get('new_fail', []))} regressions, " + f"{len(diff_block.get('still_fail', []))} still failing, " + f"{len(diff_block.get('new_cases', []))} new cases" ) failures_list: dict[str, RunResult] = {} @@ -828,7 +722,7 @@ def handle_compare(args) -> int: if not args.base.exists() or not args.new.exists(): print("Base or new results file not found.", file=sys.stderr) return 2 - comparison = compare_runs(args.base, args.new) + comparison = compare_runs(args.base, args.new, fail_on=args.fail_on, require_assert=args.require_assert) report = render_markdown(comparison, args.out) print(report) if args.junit: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 1626908..2ca493b 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -63,7 +63,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--fail-fast", action="store_true", help="Stop on first failing case") batch_p.add_argument( "--fail-on", - choices=["error", "mismatch", "bad", "unchecked", "any", "skipped"], + choices=["error", "bad", "unchecked", "any", "skipped"], default="bad", help="Which statuses should cause a failing exit code", ) @@ -120,6 +120,13 @@ def build_parser() -> argparse.ArgumentParser: compare_p.add_argument("--new", type=Path, required=True, help="Path to new results.jsonl") compare_p.add_argument("--out", type=Path, default=None, help="Path to markdown report to write") compare_p.add_argument("--junit", type=Path, default=None, help="Path to junit xml output") + compare_p.add_argument( + "--fail-on", + choices=["error", "bad", "unchecked", "any", "skipped"], + default="bad", + help="Which statuses should be treated as failures when diffing", + ) + compare_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures when diffing") return parser diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 22bec9a..7d841b8 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -531,65 +531,158 @@ def load_results(path: Path) -> Dict[str, RunResult]: return results -def _bucket(status: str, checked: bool, 
require_assert: bool) -> str: - if status == "ok": - return "OK" if checked else "UNCHECKED" - if status in {"mismatch", "failed", "error"}: - return "BAD" - if status in {"unchecked", "plan_only"}: - return "BAD" if require_assert else "UNCHECKED" - return "NEUTRAL" - - -def compare_results( - baseline: Mapping[str, RunResult], - current: Mapping[str, RunResult], +def bad_statuses(fail_on: str, require_assert: bool) -> set[str]: + unchecked = {"unchecked", "plan_only"} + bad = {"error", "failed", "mismatch"} + if fail_on == "error": + bad = {"error"} + elif fail_on in {"unchecked", "any"}: + bad |= unchecked + elif fail_on == "skipped": + bad |= {"skipped"} + + if require_assert: + bad |= unchecked + + return bad + + +def is_failure(status: str, fail_on: str, require_assert: bool) -> bool: + return status in bad_statuses(fail_on, require_assert) + + +def _artifact_links(res: RunResult) -> dict[str, str]: + links: dict[str, str] = {} + base = Path(res.artifacts_dir) + for name in ["plan.json", "answer.txt", "raw_synth.txt", "status.json"]: + path = base / name + if path.exists(): + links[name] = str(path) + return links + + +def _reason(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + if res.expected_check and res.expected_check.detail: + return res.expected_check.detail + return "" + + +def _median_duration(results: Mapping[str, RunResult]) -> float | None: + durations = [res.duration_ms for res in results.values() if res.duration_ms is not None] + if not durations: + return None + durations.sort() + mid = len(durations) // 2 + if len(durations) % 2 == 1: + return durations[mid] / 1000 + return (durations[mid - 1] + durations[mid]) / 2000 + + +def _count_bad_from_summary(counts: Mapping[str, object], fail_on: str, require_assert: bool) -> int: + bad = bad_statuses(fail_on, require_assert) + total = 0 + for status in bad: + try: + total += int(counts.get(status, 0) or 0) + except Exception: + continue + return total + + +def diff_runs( + base_results: Iterable[RunResult], + new_results: Iterable[RunResult], *, + fail_on: str, require_assert: bool, ) -> Dict[str, object]: - new_ok: List[str] = [] - regressed: List[str] = [] - still_ok: List[str] = [] - still_bad: List[str] = [] - new_unchecked: List[str] = [] - status_changes: Dict[str, Dict[str, str]] = {} - new_cases: List[str] = [] - - for case_id, res in current.items(): - base_res = baseline.get(case_id) - new_bucket = _bucket(res.status, res.checked, require_assert) + base_by_id = {res.id: res for res in base_results} + new_by_id = {res.id: res for res in new_results} + all_ids = sorted(new_by_id.keys()) + + bad = bad_statuses(fail_on, require_assert) + + def _is_bad(res: RunResult | None) -> bool: + return bool(res and res.status in bad) + + def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict[str, object]: + return { + "id": case_id, + "from": base_res.status if base_res else None, + "to": new_res.status, + "reason": _reason(new_res), + "artifacts": _artifact_links(new_res), + } + + new_fail: list[dict[str, object]] = [] + fixed: list[dict[str, object]] = [] + still_fail: list[dict[str, object]] = [] + changed_status: list[dict[str, str | None]] = [] + new_cases: list[str] = [] + + for case_id in all_ids: + new_res = new_by_id[case_id] + base_res = base_by_id.get(case_id) + base_bad = _is_bad(base_res) + new_bad = _is_bad(new_res) + if base_res is None: new_cases.append(case_id) - if new_bucket == "OK": - new_ok.append(case_id) - elif new_bucket == "BAD": - 
still_bad.append(case_id) - status_changes[case_id] = {"from": "new", "to": res.status} + else: + if base_res.status != new_res.status: + changed_status.append({"id": case_id, "from": base_res.status, "to": new_res.status}) + + if base_res is None: continue - base_bucket = _bucket(base_res.status, base_res.checked, require_assert) - if base_res.checked and res.status == "unchecked": - new_unchecked.append(case_id) - if base_bucket == "OK" and new_bucket in {"BAD", "UNCHECKED"}: - regressed.append(case_id) - elif base_bucket in {"BAD", "UNCHECKED"} and new_bucket == "OK": - new_ok.append(case_id) - elif base_bucket == "OK" and new_bucket == "OK": - still_ok.append(case_id) - elif base_bucket in {"BAD", "UNCHECKED"} and new_bucket in {"BAD", "UNCHECKED"}: - still_bad.append(case_id) - - if base_res.status != res.status: - status_changes[case_id] = {"from": base_res.status, "to": res.status} + if not base_bad and new_bad: + new_fail.append(_entry(case_id, base_res, new_res)) + elif base_bad and not new_bad: + fixed.append(_entry(case_id, base_res, new_res)) + elif base_bad and new_bad: + still_fail.append(_entry(case_id, base_res, new_res)) + + base_counts = summarize(base_by_id.values()) + new_counts = summarize(new_by_id.values()) + base_med = _median_duration(base_by_id) + new_med = _median_duration(new_by_id) + base_avg = base_counts.get("avg_total_s") + new_avg = new_counts.get("avg_total_s") + + def _count_delta(key: str) -> int | float | None: + base_val = base_counts.get(key) + new_val = new_counts.get(key) + if isinstance(base_val, (int, float)) and isinstance(new_val, (int, float)): + return new_val - base_val + return None + + delta_keys = {"ok", "mismatch", "failed", "error", "skipped", "unchecked", "plan_only", "total"} + count_deltas = {k: _count_delta(k) for k in delta_keys} return { - "new_ok": new_ok, - "regressed": regressed, - "still_ok": still_ok, - "still_bad": still_bad, - "new_unchecked": new_unchecked, - "status_changes": status_changes, + "all_ids": all_ids, + "new_fail": new_fail, + "fixed": fixed, + "still_fail": still_fail, + "changed_status": changed_status, "new_cases": new_cases, + "base_counts": base_counts, + "new_counts": new_counts, + "counts_delta": count_deltas, + "base_median": base_med, + "new_median": new_med, + "base_avg": base_avg, + "new_avg": new_avg, + "median_delta": (new_med - base_med) if (new_med is not None and base_med is not None) else None, + "avg_delta": (new_avg - base_avg) if (isinstance(new_avg, (int, float)) and isinstance(base_avg, (int, float))) else None, + "base_bad_total": _count_bad_from_summary(base_counts, fail_on, require_assert), + "new_bad_total": _count_bad_from_summary(new_counts, fail_on, require_assert), + "fail_on": fail_on, + "require_assert": require_assert, } @@ -605,24 +698,6 @@ def format_status_line(result: RunResult) -> str: return f"FAIL {result.id} {result.status} ({reason or 'unknown'}) {timing}" -__all__ = [ - "AgentRunner", - "Case", - "ExpectedCheck", - "RunArtifacts", - "RunResult", - "EventLogger", - "build_agent", - "compare_results", - "format_status_line", - "load_results", - "load_cases", - "run_one", - "save_artifacts", - "save_status", - "summarize", - "_match_expected", -] class EventLogger: def __init__(self, path: Path | None, run_id: str): self.path = path @@ -641,3 +716,25 @@ def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": if path is None: return self return EventLogger(path, self.run_id) + + +__all__ = [ + "AgentRunner", + "Case", + "ExpectedCheck", + 
"RunArtifacts", + "RunResult", + "EventLogger", + "build_agent", + "bad_statuses", + "diff_runs", + "format_status_line", + "is_failure", + "load_results", + "load_cases", + "run_one", + "save_artifacts", + "save_status", + "summarize", + "_match_expected", +] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index f52da8a..004548d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize( "fail_on,require_assert", - itertools.product(["bad", "error", "mismatch", "unchecked", "any", "skipped"], [False, True]), + itertools.product(["bad", "error", "unchecked", "any", "skipped"], [False, True]), ) def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> None: statuses = ["ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index ad14b00..5d61ed5 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -1,6 +1,6 @@ from __future__ import annotations -from examples.demo_qa.runner import Case, RunResult, _match_expected, compare_results, summarize +from examples.demo_qa.runner import Case, RunResult, _match_expected, diff_runs, summarize def test_match_expected_unchecked_when_no_expectations() -> None: @@ -26,9 +26,9 @@ def test_match_expected_contains_pass_and_fail() -> None: assert missing_answer.detail == "no answer" -def test_compare_results_tracks_regressions_and_improvements() -> None: - baseline = { - "ok_to_bad": RunResult( +def test_diff_runs_tracks_regressions_and_improvements() -> None: + baseline = [ + RunResult( id="ok_to_bad", question="", status="ok", @@ -39,7 +39,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "err_to_ok": RunResult( + RunResult( id="err_to_ok", question="", status="error", @@ -50,10 +50,10 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "checked_to_unchecked": RunResult( - id="checked_to_unchecked", + RunResult( + id="still_bad", question="", - status="ok", + status="mismatch", checked=True, reason=None, details=None, @@ -61,10 +61,10 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - } + ] - current = { - "ok_to_bad": RunResult( + current = [ + RunResult( id="ok_to_bad", question="", status="mismatch", @@ -75,7 +75,7 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "err_to_ok": RunResult( + RunResult( id="err_to_ok", question="", status="ok", @@ -86,18 +86,18 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - "checked_to_unchecked": RunResult( - id="checked_to_unchecked", + RunResult( + id="still_bad", question="", - status="unchecked", - checked=False, + status="failed", + checked=True, reason=None, details=None, artifacts_dir="/tmp/ok2", duration_ms=10, tags=[], ), - "new_ok": RunResult( + RunResult( id="new_ok", question="", status="ok", @@ -108,14 +108,14 @@ def test_compare_results_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), - } + ] - diff = compare_results(baseline, current, require_assert=True) + diff = diff_runs(baseline, current, fail_on="bad", require_assert=True) - assert "ok_to_bad" in diff["regressed"] - assert "err_to_ok" in diff["new_ok"] - assert "checked_to_unchecked" in diff["new_unchecked"] - assert "new_ok" in 
diff["new_ok"] + assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad"} + assert {row["id"] for row in diff["fixed"]} == {"err_to_ok"} + assert {row["id"] for row in diff["still_fail"]} == {"still_bad"} + assert diff["new_cases"] == ["new_ok"] def test_summarize_counts_checked_and_unchecked() -> None: From e280ee458a322be91ea0aebc5f953c1b2c68d9d1 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:25:05 +0300 Subject: [PATCH 26/92] Slim run metadata and stabilize diffs --- examples/demo_qa/batch.py | 46 ++++++++++++++++++++++++-------------- examples/demo_qa/cli.py | 5 +++++ examples/demo_qa/runner.py | 4 ++-- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 4a1d392..1b5b40e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -39,7 +39,7 @@ def write_results(out_path: Path, results: Iterable[RunResult]) -> None: def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") - summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") return summary_path @@ -85,23 +85,32 @@ def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object] } -def _fingerprint_dir(data_dir: Path) -> Mapping[str, object]: - files: list[dict] = [] +def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: + entries: list[dict] = [] + total_bytes = 0 + files_count = 0 for path in sorted(data_dir.rglob("*")): if path.is_file(): rel = path.relative_to(data_dir) if rel.parts and rel.parts[0] in {".runs", ".cache"}: continue stat = path.stat() - files.append( - { - "path": str(rel), - "size": stat.st_size, - "mtime": stat.st_mtime, - } - ) - digest = hashlib.sha256(json.dumps(files, sort_keys=True).encode("utf-8")).hexdigest() - return {"hash": digest, "files": files} + files_count += 1 + total_bytes += stat.st_size + if verbose: + entries.append( + { + "path": str(rel), + "size": stat.st_size, + "mtime": stat.st_mtime, + } + ) + digest_payload = entries if verbose else [{"files_count": files_count, "bytes_total": total_bytes}] + digest = hashlib.sha256(json.dumps(digest_payload, sort_keys=True).encode("utf-8")).hexdigest() + fingerprint: dict[str, object] = {"hash": digest, "files_count": files_count, "bytes_total": total_bytes} + if verbose: + fingerprint["files"] = entries + return fingerprint def _git_sha() -> Optional[str]: @@ -283,7 +292,6 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: for cid in ok_ids: ET.SubElement(suite, "testcase", name=cid) - tree = ET.ElementTree(suite) out_path.write_text(ET.tostring(suite, encoding="unicode"), encoding="utf-8") @@ -464,7 +472,9 @@ def handle_batch(args) -> int: summary_by_tag = summary.get("summary_by_tag") if summary_by_tag: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") - summary_by_tag_path.write_text(json.dumps(summary_by_tag, ensure_ascii=False, indent=2), encoding="utf-8") + summary_by_tag_path.write_text( + json.dumps(summary_by_tag, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" + ) latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" @@ -475,7 +485,7 @@ def handle_batch(args) -> int: config_hash = _hash_file(args.config) if 
args.config else None schema_hash = _hash_file(args.schema) cases_hash = _hash_file(args.cases) - data_fingerprint = _fingerprint_dir(args.data) + data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) llm_settings = settings.llm run_meta = { "run_id": run_id, @@ -504,7 +514,9 @@ def handle_batch(args) -> int: "summary_path": str(summary_path), "run_dir": str(run_folder), } - (run_folder / "run_meta.json").write_text(json.dumps(run_meta, ensure_ascii=False, indent=2), encoding="utf-8") + (run_folder / "run_meta.json").write_text( + json.dumps(run_meta, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" + ) prate = _pass_rate(counts) history_entry = { @@ -525,7 +537,7 @@ def handle_batch(args) -> int: } history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: - f.write(json.dumps(history_entry, ensure_ascii=False) + "\n") + f.write(json.dumps(history_entry, ensure_ascii=False, sort_keys=True) + "\n") bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 2ca493b..be3089e 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -87,6 +87,11 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--exclude-ids", type=Path, default=None, help="Path to file with ids to exclude (one per line)") batch_p.add_argument("--events", choices=["on", "off"], default="on", help="Enable events.jsonl emission") batch_p.add_argument("--events-file", type=Path, default=None, help="Override events file path") + batch_p.add_argument( + "--fingerprint-verbose", + action="store_true", + help="Include per-file entries in data fingerprint (defaults to counts only)", + ) case_root = sub.add_parser("case", help="Single-case utilities") case_sub = case_root.add_subparsers(dest="case_command", required=True) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 7d841b8..c1edf42 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -365,7 +365,7 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: "checked_ok": checked_ok, "unchecked_no_assert": unchecked_no_assert, "plan_only": plan_only, - "summary_by_tag": per_tag, + "summary_by_tag": {tag: per_tag[tag] for tag in sorted(per_tag)}, **totals, } if total_times: @@ -660,7 +660,7 @@ def _count_delta(key: str) -> int | float | None: return new_val - base_val return None - delta_keys = {"ok", "mismatch", "failed", "error", "skipped", "unchecked", "plan_only", "total"} + delta_keys = ["total", "ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] count_deltas = {k: _count_delta(k) for k in delta_keys} return { From af2ce2096bcb450647612332cacf62682de2e0e5 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:33:47 +0300 Subject: [PATCH 27/92] Honor fail_on in compare summary --- examples/demo_qa/batch.py | 26 ++++++++++++++++++++------ tests/test_demo_qa_batch.py | 20 +++++++++++++++++++- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 1b5b40e..a25b558 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -221,15 +221,29 @@ def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str lines: list[str] = [] base_counts = compare["base_counts"] # type: 
ignore[index] new_counts = compare["new_counts"] # type: ignore[index] + fail_on = compare.get("fail_on", "bad") # type: ignore[assignment] + require_assert = bool(compare.get("require_assert", False)) + + def _bad_total(counts: dict) -> int: + bad_from_compare = compare.get("base_bad_total") if counts is base_counts else compare.get("new_bad_total") + if isinstance(bad_from_compare, int): + return bad_from_compare + bad_set = bad_statuses(str(fail_on), require_assert) + total = 0 + for status in bad_set: + try: + total += int(counts.get(status, 0) or 0) + except Exception: + continue + return total + + base_bad = _bad_total(base_counts) # type: ignore[arg-type] + new_bad = _bad_total(new_counts) # type: ignore[arg-type] lines.append("# Batch comparison report") lines.append("") lines.append("## Summary") - lines.append( - f"- Base OK: {base_counts.get('ok',0)}, Bad: {base_counts.get('mismatch',0)+base_counts.get('error',0)+base_counts.get('failed',0)}" - ) - lines.append( - f"- New OK: {new_counts.get('ok',0)}, Bad: {new_counts.get('mismatch',0)+new_counts.get('error',0)+new_counts.get('failed',0)}" - ) + lines.append(f"- Base OK: {base_counts.get('ok',0)}, Bad: {base_bad}") + lines.append(f"- New OK: {new_counts.get('ok',0)}, Bad: {new_bad}") base_med = compare.get("base_median") new_med = compare.get("new_median") if base_med is not None and new_med is not None: diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 004548d..cb72890 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -4,7 +4,7 @@ import pytest -from examples.demo_qa.batch import bad_statuses, is_failure +from examples.demo_qa.batch import bad_statuses, is_failure, render_markdown @pytest.mark.parametrize( @@ -17,3 +17,21 @@ def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> assert bad # sanity check for status in statuses: assert is_failure(status, fail_on, require_assert) == (status in bad) + + +def test_render_markdown_uses_fail_policy() -> None: + compare = { + "base_counts": {"ok": 0, "mismatch": 2, "error": 1, "failed": 0}, + "new_counts": {"ok": 1, "mismatch": 0, "error": 0, "failed": 0}, + "base_bad_total": 1, + "new_bad_total": 0, + "fail_on": "error", + "require_assert": False, + "new_fail": [], + "fixed": [], + "still_fail": [], + "all_ids": [], + } + report = render_markdown(compare, None) + assert "- Base OK: 0, Bad: 1" in report + assert "- New OK: 1, Bad: 0" in report From c653240edeb59fc159170db0ed4ec0fdf6a1e3d4 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 20 Dec 2025 23:48:48 +0300 Subject: [PATCH 28/92] Restore sensitive fingerprints with compact hash --- examples/demo_qa/batch.py | 23 +++++++++++++---------- tests/test_demo_qa_batch.py | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a25b558..a597863 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -89,25 +89,28 @@ def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, o entries: list[dict] = [] total_bytes = 0 files_count = 0 + digest = hashlib.sha256() for path in sorted(data_dir.rglob("*")): if path.is_file(): rel = path.relative_to(data_dir) if rel.parts and rel.parts[0] in {".runs", ".cache"}: continue stat = path.stat() + record = { + "path": str(rel), + "size": stat.st_size, + "mtime": stat.st_mtime, + } + digest.update(json.dumps(record, 
sort_keys=True).encode("utf-8")) files_count += 1 total_bytes += stat.st_size if verbose: - entries.append( - { - "path": str(rel), - "size": stat.st_size, - "mtime": stat.st_mtime, - } - ) - digest_payload = entries if verbose else [{"files_count": files_count, "bytes_total": total_bytes}] - digest = hashlib.sha256(json.dumps(digest_payload, sort_keys=True).encode("utf-8")).hexdigest() - fingerprint: dict[str, object] = {"hash": digest, "files_count": files_count, "bytes_total": total_bytes} + entries.append(record) + fingerprint: dict[str, object] = { + "hash": digest.hexdigest(), + "files_count": files_count, + "bytes_total": total_bytes, + } if verbose: fingerprint["files"] = entries return fingerprint diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index cb72890..44895e5 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -1,10 +1,13 @@ from __future__ import annotations import itertools +import os +import time +from pathlib import Path import pytest -from examples.demo_qa.batch import bad_statuses, is_failure, render_markdown +from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown @pytest.mark.parametrize( @@ -35,3 +38,19 @@ def test_render_markdown_uses_fail_policy() -> None: report = render_markdown(compare, None) assert "- Base OK: 0, Bad: 1" in report assert "- New OK: 1, Bad: 0" in report + + +def test_fingerprint_sensitive_to_file_changes(tmp_path: Path) -> None: + data = tmp_path / "data" + data.mkdir() + target = data / "file.txt" + target.write_text("aaa", encoding="utf-8") + first = _fingerprint_dir(data) + + target.write_text("bbb", encoding="utf-8") + now = time.time() + 1 + os.utime(target, (now, now)) + second = _fingerprint_dir(data) + + assert first["hash"] != second["hash"] + assert first["files_count"] == second["files_count"] == 1 From ac9e0f0f29cfc6f22a3b735690c1e11655c29f29 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 00:10:15 +0300 Subject: [PATCH 29/92] Include new bad cases in diffs --- examples/demo_qa/runner.py | 2 ++ tests/test_demo_qa_runner.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index c1edf42..06d8f7c 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -632,6 +632,8 @@ def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict if base_res is None: new_cases.append(case_id) + if new_bad: + new_fail.append(_entry(case_id, base_res, new_res)) else: if base_res.status != new_res.status: changed_status.append({"id": case_id, "from": base_res.status, "to": new_res.status}) diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 5d61ed5..745a477 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -108,14 +108,25 @@ def test_diff_runs_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), + RunResult( + id="new_bad", + question="", + status="failed", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/newbad", + duration_ms=10, + tags=[], + ), ] diff = diff_runs(baseline, current, fail_on="bad", require_assert=True) - assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad"} + assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad", "new_bad"} assert {row["id"] for row in diff["fixed"]} == {"err_to_ok"} assert {row["id"] for row in 
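
The reworked `_fingerprint_dir` streams one `(path, size, mtime)` record per file into a single SHA-256 digest, so a content change flips the hash without keeping a per-file listing in memory or in the metadata. A self-contained sketch of that pattern follows; the excluded folder names and the sample data are assumptions for illustration.

```python
import hashlib
import json
import os
import time
from pathlib import Path

EXCLUDED_TOP_DIRS = {".runs", ".cache"}  # assumption: the same folders the batch code skips


def fingerprint_dir(data_dir: Path) -> dict[str, object]:
    """Stream (path, size, mtime) records of every data file into one SHA-256 digest."""
    digest = hashlib.sha256()
    files_count = 0
    bytes_total = 0
    for path in sorted(data_dir.rglob("*")):
        if not path.is_file():
            continue
        rel = path.relative_to(data_dir)
        if rel.parts and rel.parts[0] in EXCLUDED_TOP_DIRS:
            continue
        stat = path.stat()
        record = {"path": str(rel), "size": stat.st_size, "mtime": stat.st_mtime}
        digest.update(json.dumps(record, sort_keys=True).encode("utf-8"))
        files_count += 1
        bytes_total += stat.st_size
    return {"hash": digest.hexdigest(), "files_count": files_count, "bytes_total": bytes_total}


if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        (root / "data.csv").write_text("a,b\n1,2\n", encoding="utf-8")
        before = fingerprint_dir(root)
        # rewrite the file and bump mtime so both content and metadata change
        (root / "data.csv").write_text("a,b\n3,4\n", encoding="utf-8")
        later = time.time() + 2
        os.utime(root / "data.csv", (later, later))
        after = fingerprint_dir(root)
        assert before["hash"] != after["hash"]
        assert before["files_count"] == after["files_count"] == 1
```
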
diff["still_fail"]} == {"still_bad"} - assert diff["new_cases"] == ["new_ok"] + assert diff["new_cases"] == ["new_bad", "new_ok"] def test_summarize_counts_checked_and_unchecked() -> None: From 26de17fd9a2e978e8c61fc248c870b7968a0c8e3 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 00:41:07 +0300 Subject: [PATCH 30/92] Switch demo_qa schema output to JSON (#72) --- README_demo_qa.md | 8 ++++---- examples/demo_qa/data_gen.py | 12 +++--------- tests/test_demo_qa_schema_io.py | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 tests/test_demo_qa_schema_io.py diff --git a/README_demo_qa.md b/README_demo_qa.md index 86f9e67..9698aa6 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -8,7 +8,7 @@ python -m examples.demo_qa.cli gen --out demo_data --rows 1000 --seed 42 ``` -Команда создаст четыре CSV, `schema.yaml`, `meta.json` и `stats.json`. +Команда создаст четыре CSV, `schema.json`, `meta.json` и `stats.json`. ## Конфигурация LLM (pydantic-settings) @@ -47,7 +47,7 @@ pip install -r examples/demo_qa/requirements.txt `base_url` (формат `http://host:port/v1`), модели и температуры. 2. Запустите чат с указанием конфига: ```bash -python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.yaml --config path/to/demo_qa.toml +python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.json --config path/to/demo_qa.toml ``` Флаг `--enable-semantic` строит семантический индекс, если передана модель эмбеддингов. @@ -59,7 +59,7 @@ python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.y ```bash python -m examples.demo_qa.cli batch \ --data demo_data \ - --schema demo_data/schema.yaml \ + --schema demo_data/schema.json \ --cases cases.jsonl \ --out results.jsonl ``` @@ -76,7 +76,7 @@ python -m examples.demo_qa.cli batch \ любым ключом доступа, если прокси не проверяет его. Запуск: ```bash -python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.yaml --config path/to/demo_qa.toml +python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.json --config path/to/demo_qa.toml ``` Большинство OpenAI-совместимых сервисов ожидают конечную точку `/v1` в `base_url`. 
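
Switching the generated schema to JSON lets `dataclasses.asdict` replace the hand-rolled `_to_dict` walker in `save_schema`. A minimal sketch of the round-trip is shown below, using simplified stand-in dataclasses rather than the project's real `SchemaConfig`.

```python
# Sketch of the dataclass -> JSON round-trip behind schema.json.
# Entity/Schema here are illustrative stand-ins, not the project's SchemaConfig.
import json
from dataclasses import asdict, dataclass, field
from pathlib import Path


@dataclass
class Entity:
    name: str
    columns: list[str] = field(default_factory=list)


@dataclass
class Schema:
    name: str
    entities: list[Entity] = field(default_factory=list)


def save_schema(schema: Schema, path: Path) -> None:
    # asdict() recurses into nested dataclasses, so no custom _to_dict helper is needed
    path.write_text(json.dumps(asdict(schema), ensure_ascii=False, indent=2), encoding="utf-8")


def load_schema(path: Path) -> Schema:
    payload = json.loads(path.read_text(encoding="utf-8"))
    return Schema(
        name=payload["name"],
        entities=[Entity(**entity) for entity in payload["entities"]],
    )


if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        target = Path(tmp) / "schema.json"
        original = Schema(name="demo_qa", entities=[Entity(name="orders", columns=["order_id"])])
        save_schema(original, target)
        assert load_schema(target) == original
```
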
diff --git a/examples/demo_qa/data_gen.py b/examples/demo_qa/data_gen.py index 8c8ab32..8fc21dd 100644 --- a/examples/demo_qa/data_gen.py +++ b/examples/demo_qa/data_gen.py @@ -256,15 +256,9 @@ def save_dataset(dataset: GeneratedDataset, out_dir: Path) -> None: def save_schema(schema: SchemaConfig, path: Path) -> None: - def _to_dict(obj): - if isinstance(obj, list): - return [_to_dict(o) for o in obj] - if hasattr(obj, "__dict__"): - return {k: _to_dict(v) for k, v in obj.__dict__.items()} - return obj - + schema_dict = asdict(schema) with path.open("w", encoding="utf-8") as f: - json.dump(_to_dict(schema), f, ensure_ascii=False, indent=2) + json.dump(schema_dict, f, ensure_ascii=False, indent=2) @dataclass @@ -299,7 +293,7 @@ def generate_and_save(out_dir: Path, *, rows: int = 1000, seed: int | None = Non validate_dataset(dataset, rows) save_dataset(dataset, out_dir) schema = default_schema(enable_semantic=enable_semantic) - save_schema(schema, out_dir / "schema.yaml") + save_schema(schema, out_dir / "schema.json") meta = MetaInfo(seed=seed, rows=rows, created_at=datetime.utcnow().isoformat()) write_meta(out_dir / "meta.json", meta) diff --git a/tests/test_demo_qa_schema_io.py b/tests/test_demo_qa_schema_io.py new file mode 100644 index 0000000..828229f --- /dev/null +++ b/tests/test_demo_qa_schema_io.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from pathlib import Path + +from examples.demo_qa.data_gen import generate_and_save +from examples.demo_qa.schema_io import load_schema + + +def test_generate_and_load_schema_json(tmp_path: Path) -> None: + out_dir = tmp_path / "demo_data" + generate_and_save(out_dir, rows=5, seed=123) + + schema_path = out_dir / "schema.json" + assert schema_path.exists() + + schema = load_schema(schema_path) + + assert schema.name == "demo_qa" + assert {e.name for e in schema.entities} >= {"customers", "products", "orders", "order_items"} + assert {r.name for r in schema.relations} >= {"orders_to_customers", "items_to_orders", "items_to_products"} From 804c65973a431509a1c429cc71055f24e7214170 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:09:03 +0300 Subject: [PATCH 31/92] Clean meta and stabilize results serialization --- examples/demo_qa/batch.py | 49 ++++++++++++++----------------------- examples/demo_qa/runner.py | 17 ++++++++++++- tests/test_demo_qa_batch.py | 40 +++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a597863..c03711f 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -3,7 +3,6 @@ import datetime import hashlib import json -import platform import subprocess import sys import uuid @@ -30,16 +29,21 @@ from .settings import load_settings +def _dump_json(path: Path, obj: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(obj, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") + + def write_results(out_path: Path, results: Iterable[RunResult]) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) with out_path.open("w", encoding="utf-8") as f: for res in results: - f.write(json.dumps(res.to_json(), ensure_ascii=False) + "\n") + f.write(json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n") def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") - 
summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") + _dump_json(summary_path, summary) return summary_path @@ -75,16 +79,6 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: return ids -def build_config_fingerprint(settings, cases_path: Path) -> Mapping[str, object]: - llm_settings = settings.llm - return { - "base_url": llm_settings.base_url or "https://api.openai.com/v1", - "plan_model": llm_settings.plan_model, - "synth_model": llm_settings.synth_model, - "cases_hash": _hash_file(cases_path), - } - - def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: entries: list[dict] = [] total_bytes = 0 @@ -477,7 +471,6 @@ def handle_batch(args) -> int: "counts": counts, "summary_by_tag": counts.get("summary_by_tag"), "exit_code": exit_code, - "config_fingerprint": build_config_fingerprint(settings, args.cases), "results_path": str(results_path), "require_assert": args.require_assert, "fail_on": args.fail_on, @@ -489,9 +482,7 @@ def handle_batch(args) -> int: summary_by_tag = summary.get("summary_by_tag") if summary_by_tag: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") - summary_by_tag_path.write_text( - json.dumps(summary_by_tag, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" - ) + _dump_json(summary_by_tag_path, summary_by_tag) latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" @@ -507,13 +498,15 @@ def handle_batch(args) -> int: run_meta = { "run_id": run_id, "timestamp": started_at.isoformat() + "Z", - "cases_path": str(args.cases), - "cases_hash": cases_hash, - "config_path": str(args.config) if args.config else None, - "config_hash": config_hash, - "schema_path": str(args.schema), - "schema_hash": schema_hash, - "data_dir": str(args.data), + "inputs": { + "cases_path": str(args.cases), + "cases_hash": cases_hash, + "config_path": str(args.config) if args.config else None, + "config_hash": config_hash, + "schema_path": str(args.schema), + "schema_hash": schema_hash, + "data_dir": str(args.data), + }, "data_fingerprint": data_fingerprint, "llm": { "plan_model": llm_settings.plan_model, @@ -523,17 +516,12 @@ def handle_batch(args) -> int: "base_url": llm_settings.base_url or "https://api.openai.com/v1", }, "enable_semantic": args.enable_semantic, - "embedding_model": None, "git_sha": _git_sha(), - "python_version": sys.version, - "platform": platform.platform(), "results_path": str(results_path), "summary_path": str(summary_path), "run_dir": str(run_folder), } - (run_folder / "run_meta.json").write_text( - json.dumps(run_meta, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8" - ) + _dump_json(run_folder / "run_meta.json", run_meta) prate = _pass_rate(counts) history_entry = { @@ -771,5 +759,4 @@ def handle_compare(args) -> int: "write_summary", "_load_latest_run", "_find_case_artifact", - "build_config_fingerprint", ] diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 06d8f7c..c49ee15 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -648,6 +648,12 @@ def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict elif base_bad and new_bad: still_fail.append(_entry(case_id, base_res, new_res)) + new_fail = sorted(new_fail, key=lambda r: r.get("id", "")) + fixed = sorted(fixed, key=lambda r: r.get("id", "")) + still_fail = sorted(still_fail, key=lambda r: r.get("id", "")) + changed_status = sorted(changed_status, 
key=lambda r: r.get("id", "")) + new_cases = sorted(new_cases) + base_counts = summarize(base_by_id.values()) new_counts = summarize(new_by_id.values()) base_med = _median_duration(base_by_id) @@ -662,7 +668,16 @@ def _count_delta(key: str) -> int | float | None: return new_val - base_val return None - delta_keys = ["total", "ok", "mismatch", "failed", "error", "unchecked", "plan_only", "skipped"] + delta_keys = ( + "total", + "ok", + "mismatch", + "failed", + "error", + "skipped", + "unchecked", + "plan_only", + ) count_deltas = {k: _count_delta(k) for k in delta_keys} return { diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 44895e5..40ee32d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -1,13 +1,15 @@ from __future__ import annotations import itertools +import json import os import time from pathlib import Path import pytest -from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown +from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +from examples.demo_qa.runner import RunResult, diff_runs @pytest.mark.parametrize( @@ -54,3 +56,39 @@ def test_fingerprint_sensitive_to_file_changes(tmp_path: Path) -> None: assert first["hash"] != second["hash"] assert first["files_count"] == second["files_count"] == 1 + assert "files" not in first + + +def _mk_result(case_id: str, status: str) -> RunResult: + return RunResult( + id=case_id, + question="q", + status=status, + checked=True, + reason=None, + details=None, + artifacts_dir=f"/tmp/{case_id}", + duration_ms=1000, + tags=[], + ) + + +def test_compare_is_deterministic() -> None: + base_results = [_mk_result("b", "ok"), _mk_result("a", "ok")] + new_results = [_mk_result("a", "failed"), _mk_result("b", "ok")] + + first = diff_runs(base_results, new_results, fail_on="bad", require_assert=False) + second = diff_runs(list(reversed(base_results)), list(reversed(new_results)), fail_on="bad", require_assert=False) + + assert json.dumps(first, sort_keys=True) == json.dumps(second, sort_keys=True) + + +def test_write_results_is_deterministic(tmp_path: Path) -> None: + out = tmp_path / "results.jsonl" + res = _mk_result("a", "ok") + + write_results(out, [res]) + + line = out.read_text(encoding="utf-8").strip() + expected = json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + assert line == expected From 31cef7cf98ab96152106ff8fa29b5201c2ce92fd Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:14:39 +0300 Subject: [PATCH 32/92] Refine demo QA JSON handling and lint hygiene --- examples/demo_qa/batch.py | 16 ++++++---------- examples/demo_qa/cli.py | 2 +- examples/demo_qa/runner.py | 2 +- examples/demo_qa/settings.py | 2 +- examples/demo_qa/utils.py | 12 ++++++++++++ 5 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 examples/demo_qa/utils.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index c03711f..5a65eec 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -27,11 +27,7 @@ summarize, ) from .settings import load_settings - - -def _dump_json(path: Path, obj: object) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(obj, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") +from .utils import dump_json def write_results(out_path: Path, results: Iterable[RunResult]) -> None: @@ 
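
The determinism changes above boil down to byte-stable JSONL lines (`sort_keys` plus fixed separators) and a duplicate-id guard when results are loaded back. A compact sketch of both follows, with a simplified record shape standing in for `RunResult.to_json()`.

```python
# Sketch of deterministic JSONL round-tripping with a duplicate-id guard.
import json
from pathlib import Path
from typing import Iterable


def write_results(out_path: Path, results: Iterable[dict]) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for record in results:
            # sort_keys + fixed separators make each line byte-stable across runs
            f.write(json.dumps(record, ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n")


def load_results(path: Path) -> dict[str, dict]:
    results: dict[str, dict] = {}
    for lineno, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
        if not line.strip():
            continue
        record = json.loads(line)
        case_id = record["id"]
        if case_id in results:
            raise ValueError(f"Duplicate result id {case_id!r} on line {lineno}")
        results[case_id] = record
    return results


if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        out = Path(tmp) / "results.jsonl"
        write_results(out, [{"id": "a", "status": "ok"}, {"id": "b", "status": "failed"}])
        assert set(load_results(out)) == {"a", "b"}
```
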
-43,7 +39,7 @@ def write_results(out_path: Path, results: Iterable[RunResult]) -> None: def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") - _dump_json(summary_path, summary) + dump_json(summary_path, summary) return summary_path @@ -257,7 +253,7 @@ def table(title: str, rows: list[dict]) -> None: lines.append("|---|---|---|---|") for row in sorted(rows, key=lambda r: r.get("id", "")): artifacts = row.get("artifacts", {}) - links = ", ".join(f"[{k}]({v})" for k, v in artifacts.items()) + links = ", ".join(f"[{k}]({v})" for k, v in sorted(artifacts.items())) lines.append( f"| {row['id']} | {row['from']} → {row['to']} | {row.get('reason','')} | {links or ''} |" ) @@ -292,7 +288,7 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: failure = ET.SubElement(tc, "failure", message=msg) artifacts = row.get("artifacts", {}) if artifacts: - failure.text = "\n".join(f"{k}: {v}" for k, v in artifacts.items()) + failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) for row in sorted(fixed, key=lambda r: r.get("id", "")): ET.SubElement(suite, "testcase", name=row["id"]) @@ -482,7 +478,7 @@ def handle_batch(args) -> int: summary_by_tag = summary.get("summary_by_tag") if summary_by_tag: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") - _dump_json(summary_by_tag_path, summary_by_tag) + dump_json(summary_by_tag_path, summary_by_tag) latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" @@ -521,7 +517,7 @@ def handle_batch(args) -> int: "summary_path": str(summary_path), "run_dir": str(run_folder), } - _dump_json(run_folder / "run_meta.json", run_meta) + dump_json(run_folder / "run_meta.json", run_meta) prate = _pass_rate(counts) history_entry = { diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index be3089e..dbb279d 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -16,7 +16,7 @@ def ensure_repo_imports() -> None: ensure_repo_imports() -from .batch import ( +from .batch import ( # noqa: E402 handle_batch, handle_case_open, handle_case_run, diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index c49ee15..b0e199c 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -8,7 +8,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List, Mapping, Optional +from typing import Dict, Iterable, List, Mapping from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 396d368..a7da694 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -122,7 +122,7 @@ def load_settings( DemoQASettings._toml_path = resolved try: settings = DemoQASettings(**(overrides or {})) - except ValidationError as exc: + except ValidationError: DemoQASettings._toml_path = None raise DemoQASettings._toml_path = None diff --git a/examples/demo_qa/utils.py b/examples/demo_qa/utils.py new file mode 100644 index 0000000..55da4ab --- /dev/null +++ b/examples/demo_qa/utils.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +import json +from pathlib import Path + + +def dump_json(path: Path, obj: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(obj, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") + + +__all__ = ["dump_json"] From 
12afc2a6b95c4a9b12b12e9dae5933977044556d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:39:26 +0300 Subject: [PATCH 33/92] Harden results loading and emit batch completion event --- README_demo_qa.md | 1 + examples/demo_qa/batch.py | 12 ++++++++++ examples/demo_qa/chat_repl.py | 17 +++++--------- examples/demo_qa/requirements.txt | 3 ++- examples/demo_qa/runner.py | 38 ++++++++++++++++++++++--------- src/pydantic_settings/__init__.py | 14 ++++++++---- 6 files changed, 57 insertions(+), 28 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index 9698aa6..2acb56f 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -32,6 +32,7 @@ export DEMO_QA_LLM__BASE_URL=http://localhost:8000/v1 ``` ### Зависимости демо +* Требуется Python 3.11+ (используется стандартный `tomllib`). ``` pip install -e .[demo] # или diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 5a65eec..a87eabb 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -480,6 +480,18 @@ def handle_batch(args) -> int: summary_by_tag_path = summary_path.with_name("summary_by_tag.json") dump_json(summary_by_tag_path, summary_by_tag) + if event_logger: + event_logger.emit( + { + "type": "run_finished", + "counts": counts, + "exit_code": exit_code, + "duration_ms": duration_ms, + "run_dir": str(run_folder), + "results_path": str(results_path), + } + ) + latest_path = run_folder.parent / "latest.txt" latest_results_path = run_folder.parent / "latest_results.txt" latest_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 65bb0f7..0990ee3 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -1,14 +1,12 @@ from __future__ import annotations -import datetime +import json +import readline import sys import uuid from pathlib import Path from typing import Optional, Sequence -import readline -import json - from .provider_factory import build_provider from .runner import Case, EventLogger, RunArtifacts, build_agent, run_one, save_artifacts @@ -101,16 +99,12 @@ def start_repl( continue run_id = uuid.uuid4().hex[:8] - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_dir = runs_root / f"{timestamp}_{run_id}" - events_path = run_dir / "events.jsonl" - event_logger = EventLogger(events_path, run_id) - print(f"Events: {events_path}") + event_logger = EventLogger(path=None, run_id=run_id) artifacts: RunArtifacts | None = None try: case = Case(id=run_id, question=line, tags=[]) - result = run_one(case, runner, run_dir, plan_only=False, event_logger=event_logger) + result = run_one(case, runner, runs_root, plan_only=False, event_logger=event_logger) plan_obj = _load_json(Path(result.artifacts_dir) / "plan.json") ctx_obj = _load_json(Path(result.artifacts_dir) / "context.json") or {} artifacts = RunArtifacts( @@ -128,8 +122,9 @@ def start_repl( print("--- PLAN ---") print(json.dumps(artifacts.plan, ensure_ascii=False, indent=2)) print(result.answer or "") + print(f"Events: {Path(result.artifacts_dir) / 'events.jsonl'}") except Exception as exc: # pragma: no cover - REPL resilience - error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) + error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=runs_root, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts save_artifacts(error_artifacts) diff --git 
a/examples/demo_qa/requirements.txt b/examples/demo_qa/requirements.txt index 2c5bf86..098e3c1 100644 --- a/examples/demo_qa/requirements.txt +++ b/examples/demo_qa/requirements.txt @@ -1,3 +1,4 @@ +# Requires Python >=3.11 (relies on stdlib tomllib) pydantic-settings>=2.2 python-dotenv>=1.0 -openai \ No newline at end of file +openai diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index b0e199c..90ca626 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -8,7 +8,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List, Mapping +from typing import Dict, Iterable, List, Mapping, TypedDict from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile @@ -376,18 +376,10 @@ def summarize(results: Iterable[RunResult]) -> Dict[str, object]: summary["median_total_s"] = None for tag, bucket in per_tag.items(): - times: List[float] = [] - # no per-tag timing collected; reuse overall average for simplicity - if times: - bucket["avg_total_s"] = statistics.fmean(times) - bucket["median_total_s"] = statistics.median(times) - else: - bucket["avg_total_s"] = None - bucket["median_total_s"] = None total = bucket.get("total", 0) checked_total_tag = (bucket.get("ok", 0) or 0) + (bucket.get("mismatch", 0) or 0) + ( bucket.get("failed", 0) or 0 - ) + ) + (bucket.get("error", 0) or 0) bucket["checked_total"] = checked_total_tag non_skipped = total - (bucket.get("skipped", 0) or 0) if non_skipped > 0: @@ -527,6 +519,8 @@ def load_results(path: Path) -> Dict[str, RunResult]: except json.JSONDecodeError as exc: raise ValueError(f"Invalid result JSON on line {lineno}: {exc}") from exc result = _run_result_from_payload(payload) + if result.id in results: + raise ValueError(f"Duplicate result id {result.id!r} on line {lineno}") results[result.id] = result return results @@ -599,7 +593,7 @@ def diff_runs( *, fail_on: str, require_assert: bool, -) -> Dict[str, object]: +) -> DiffReport: base_by_id = {res.id: res for res in base_results} new_by_id = {res.id: res for res in new_results} all_ids = sorted(new_by_id.keys()) @@ -735,6 +729,28 @@ def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": return EventLogger(path, self.run_id) +class DiffReport(TypedDict): + all_ids: list[str] + new_fail: list[dict[str, object]] + fixed: list[dict[str, object]] + still_fail: list[dict[str, object]] + changed_status: list[dict[str, str | None]] + new_cases: list[str] + base_counts: Dict[str, object] + new_counts: Dict[str, object] + counts_delta: Dict[str, int | float | None] + base_median: float | None + new_median: float | None + base_avg: float | None + new_avg: float | None + median_delta: float | None + avg_delta: float | None + base_bad_total: int + new_bad_total: int + fail_on: str + require_assert: bool + + __all__ = [ "AgentRunner", "Case", diff --git a/src/pydantic_settings/__init__.py b/src/pydantic_settings/__init__.py index bf9fdae..3f759c5 100644 --- a/src/pydantic_settings/__init__.py +++ b/src/pydantic_settings/__init__.py @@ -1,14 +1,18 @@ from __future__ import annotations import os +import sys +from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping, TypeAlias + +if sys.version_info < (3, 11): # pragma: no cover - demo dependency guard + raise ImportError("pydantic_settings requires Python 3.11+ (standard tomllib).") + import tomllib -from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping -from pydantic import BaseModel 
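
The `DiffReport` TypedDict introduced above gives the dict returned by `diff_runs` a checkable shape without changing its runtime type. A minimal sketch of the pattern, trimmed to a few illustrative keys:

```python
# Minimal TypedDict sketch; the fields are a trimmed, illustrative subset of DiffReport.
from typing import TypedDict


class MiniDiffReport(TypedDict):
    new_fail: list[dict[str, object]]
    fixed: list[dict[str, object]]
    still_fail: list[dict[str, object]]
    fail_on: str
    require_assert: bool


def empty_report(fail_on: str, require_assert: bool) -> MiniDiffReport:
    # Type checkers verify the keys and value types of this literal; at runtime it is a plain dict
    return {
        "new_fail": [],
        "fixed": [],
        "still_fail": [],
        "fail_on": fail_on,
        "require_assert": require_assert,
    }


if __name__ == "__main__":
    report = empty_report("bad", require_assert=False)
    assert report["fail_on"] == "bad"
```
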
+from pydantic import BaseModel, ConfigDict -def SettingsConfigDict(**kwargs: Any) -> Dict[str, Any]: - return dict(**kwargs) +SettingsConfigDict: TypeAlias = ConfigDict def _deep_update(base: Dict[str, Any], updates: Mapping[str, Any]) -> Dict[str, Any]: @@ -35,7 +39,7 @@ def __call__(self) -> Dict[str, Any]: class BaseSettings(BaseModel): - model_config: ClassVar[SettingsConfigDict] = {} + model_config: ClassVar[SettingsConfigDict] = ConfigDict() def __init__(self, **values: Any) -> None: sources = self.settings_customise_sources( From 3c838a0c211a9d0c7dec380051ec988b57954155 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 01:45:03 +0300 Subject: [PATCH 34/92] =?UTF-8?q?Version=20increase:=20=D0=B4=D0=BE=D0=B1?= =?UTF-8?q?=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=20=D1=84=D1=83=D0=BD=D0=BA=D1=86?= =?UTF-8?q?=D0=B8=D0=BE=D0=BD=D0=B0=D0=BB=20=D0=BF=D0=BE=D1=82=D0=BE=D0=BA?= =?UTF-8?q?=D0=BE=D0=B2=D0=BE=D0=B3=D0=BE=20=D0=B7=D0=B0=D0=BF=D1=83=D1=81?= =?UTF-8?q?=D0=BA=D0=B0=20=D1=82=D0=B5=D1=81=D1=82=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70ebb36..a401b48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,9 @@ [project] name = "fetchgraph" -version = "0.1.1" +version = "0.1.2" description = "Graph-like planning → context fetching → synthesis agent (library-style)." readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.11" license = "MIT" authors = [{name = "Alexander Onischenko"}] keywords = ["agents", "context", "langgraph", "planning", "llm"] From 0c0a51001c6915db3dfe644ee3978719cae52cef Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 01:52:32 +0300 Subject: [PATCH 35/92] Remove pydantic-settings shim --- examples/demo_qa/settings.py | 18 +++++- src/pydantic_settings/__init__.py | 102 ------------------------------ 2 files changed, 17 insertions(+), 103 deletions(-) delete mode 100644 src/pydantic_settings/__init__.py diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index a7da694..08478e0 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -5,10 +5,11 @@ from typing import Any, ClassVar, Dict from urllib.parse import urlparse +import tomllib from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator try: - from pydantic_settings import BaseSettings, SettingsConfigDict, TomlConfigSettingsSource + from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. 
" @@ -16,6 +17,21 @@ ) from exc +class TomlConfigSettingsSource(PydanticBaseSettingsSource): + def __init__(self, settings_cls: type[BaseSettings], path: Path | None): + super().__init__(settings_cls) + self._path = path + + def __call__(self) -> Dict[str, Any]: + if not self._path: + return {} + try: + with self._path.open("rb") as toml_file: + return tomllib.load(toml_file) + except FileNotFoundError: + return {} + + class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) diff --git a/src/pydantic_settings/__init__.py b/src/pydantic_settings/__init__.py deleted file mode 100644 index 3f759c5..0000000 --- a/src/pydantic_settings/__init__.py +++ /dev/null @@ -1,102 +0,0 @@ -from __future__ import annotations - -import os -import sys -from typing import Any, Callable, ClassVar, Dict, Iterable, Mapping, TypeAlias - -if sys.version_info < (3, 11): # pragma: no cover - demo dependency guard - raise ImportError("pydantic_settings requires Python 3.11+ (standard tomllib).") - -import tomllib - -from pydantic import BaseModel, ConfigDict - - -SettingsConfigDict: TypeAlias = ConfigDict - - -def _deep_update(base: Dict[str, Any], updates: Mapping[str, Any]) -> Dict[str, Any]: - for key, value in updates.items(): - if isinstance(value, Mapping) and isinstance(base.get(key), dict): - base[key] = _deep_update(base[key], value) - else: - base[key] = value - return base - - -class TomlConfigSettingsSource: - def __init__(self, settings_cls: type[BaseModel], path: os.PathLike | str | None): - self._path = path - - def __call__(self) -> Dict[str, Any]: - if not self._path: - return {} - try: - with open(self._path, "rb") as toml_file: - return tomllib.load(toml_file) - except FileNotFoundError: - return {} - - -class BaseSettings(BaseModel): - model_config: ClassVar[SettingsConfigDict] = ConfigDict() - - def __init__(self, **values: Any) -> None: - sources = self.settings_customise_sources( - self.__class__, - self._build_init_settings(values), - self._build_env_settings(), - self._build_dotenv_settings(), - self._build_file_secret_settings(), - ) - merged: Dict[str, Any] = {} - for source in reversed(tuple(sources)): - merged = _deep_update(merged, source() or {}) - super().__init__(**merged) - - @classmethod - def settings_customise_sources( - cls, - settings_cls: type[BaseModel], - init_settings: Callable[[], Mapping[str, Any]], - env_settings: Callable[[], Mapping[str, Any]], - dotenv_settings: Callable[[], Mapping[str, Any]], - file_secret_settings: Callable[[], Mapping[str, Any]], - ) -> Iterable[Callable[[], Mapping[str, Any]]]: - return (init_settings, env_settings, dotenv_settings, file_secret_settings) - - @staticmethod - def _build_init_settings(values: Mapping[str, Any]) -> Callable[[], Mapping[str, Any]]: - return lambda: dict(values) - - @classmethod - def _build_env_settings(cls) -> Callable[[], Mapping[str, Any]]: - prefix = cls.model_config.get("env_prefix", "") or "" - delimiter = cls.model_config.get("env_nested_delimiter", "__") or "__" - - def source() -> Dict[str, Any]: - settings: Dict[str, Any] = {} - for key, value in os.environ.items(): - if not key.startswith(prefix): - continue - raw_key = key[len(prefix) :] - parts = raw_key.split(delimiter) if delimiter else [raw_key] - cls._insert_nested(settings, [part.lower() for part in parts], value) - return settings - - return source - - @classmethod - def _build_dotenv_settings(cls) -> Callable[[], Mapping[str, Any]]: - return lambda: {} - - @classmethod - def 
_build_file_secret_settings(cls) -> Callable[[], Mapping[str, Any]]: - return lambda: {} - - @staticmethod - def _insert_nested(target: Dict[str, Any], parts: list[str], value: Any) -> None: - current = target - for part in parts[:-1]: - current = current.setdefault(part, {}) - current[parts[-1]] = value From 5aceb938a1010f6b0a8a54fc47fbb7c9b09722e0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 02:26:18 +0300 Subject: [PATCH 36/92] Align batch reporting with policy failures --- examples/demo_qa/batch.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a87eabb..7abdeb4 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -454,8 +454,9 @@ def handle_batch(args) -> int: diff["baseline_path"] = str(baseline_path) diff_block = diff - failure_count = sum(1 for res in results if is_failure(res.status, args.fail_on, args.require_assert)) - exit_code = 1 if failure_count else 0 + policy_bad = bad_statuses(args.fail_on, args.require_assert) + bad_count = sum(int(counts.get(status, 0) or 0) for status in policy_bad) + exit_code = 1 if bad_count else 0 ended_at = datetime.datetime.utcnow() duration_ms = int((ended_at - started_at).total_seconds() * 1000) @@ -547,20 +548,23 @@ def handle_batch(args) -> int: "median_total_s": counts.get("median_total_s"), "run_dir": str(run_folder), "results_path": str(results_path), + "failed": counts.get("failed", 0), + "unchecked": counts.get("unchecked", 0), + "plan_only": counts.get("plan_only", 0), + "fail_on": args.fail_on, + "require_assert": args.require_assert, + "fail_count": bad_count, } history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: f.write(json.dumps(history_entry, ensure_ascii=False, sort_keys=True) + "\n") - bad_count = counts.get("mismatch", 0) + counts.get("failed", 0) + counts.get("error", 0) unchecked = counts.get("unchecked", 0) plan_only = counts.get("plan_only", 0) - if args.require_assert or args.fail_on in {"unchecked", "any"}: - bad_count += unchecked + plan_only summary_line = ( f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked(no-assert): {unchecked} | " - f"Plan-only: {plan_only} | BAD: {bad_count} | Skipped: {counts.get('skipped', 0)}" + f"Plan-only: {plan_only} | FAIL(policy): {bad_count} | Skipped: {counts.get('skipped', 0)}" ) if args.quiet: @@ -697,7 +701,10 @@ def _print_stats(entries: list[dict]) -> None: if not entries: print("No history entries found.") return - header = f"{'run_id':<10} {'ok':>4} {'mis':>4} {'err':>4} {'skip':>5} {'pass%':>7} {'median_s':>10} {'Δpass':>8} {'Δmedian':>9}" + header = ( + f"{'run_id':<10} {'ok':>4} {'mis':>4} {'fail':>4} {'err':>4} {'skip':>5} " + f"{'pass%':>7} {'median_s':>10} {'Δpass':>8} {'Δmedian':>9} {'policy':>8} {'reqA':>5}" + ) print(header) prev = None for entry in entries: @@ -716,8 +723,10 @@ def _print_stats(entries: list[dict]) -> None: dm = f"{delta_median:+.2f}" if delta_median is not None else "n/a" print( f"{entry.get('run_id',''):<10} " - f"{entry.get('ok',0):>4} {entry.get('mismatch',0):>4} {entry.get('error',0):>4} {entry.get('skipped',0):>5} " - f"{pr_display:>7} {median_display:>10} {dp:>8} {dm:>9}" + f"{entry.get('ok',0):>4} {entry.get('mismatch',0):>4} {entry.get('failed',0):>4} " + f"{entry.get('error',0):>4} 
{entry.get('skipped',0):>5} " + f"{pr_display:>7} {median_display:>10} {dp:>8} {dm:>9} " + f"{entry.get('fail_on',''):>8} {str(entry.get('require_assert', False)):>5}" ) prev = entry From 92a297c0770836c4dd1410ea9fdc32c59f9033a3 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 02:48:43 +0300 Subject: [PATCH 37/92] fixing tests --- examples/demo_qa/requirements.txt | 2 +- examples/demo_qa/settings.py | 22 +++------------------- tests/test_demo_qa_settings.py | 4 ---- tests/test_demo_qa_settings_sources.py | 2 +- 4 files changed, 5 insertions(+), 25 deletions(-) diff --git a/examples/demo_qa/requirements.txt b/examples/demo_qa/requirements.txt index 098e3c1..2e908b2 100644 --- a/examples/demo_qa/requirements.txt +++ b/examples/demo_qa/requirements.txt @@ -1,4 +1,4 @@ # Requires Python >=3.11 (relies on stdlib tomllib) -pydantic-settings>=2.2 +pydantic-settings>=2.12 python-dotenv>=1.0 openai diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 08478e0..a7ea9fd 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -5,33 +5,17 @@ from typing import Any, ClassVar, Dict from urllib.parse import urlparse -import tomllib from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator +from pydantic_settings.sources.providers.toml import TomlConfigSettingsSource try: - from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict + from pydantic_settings import BaseSettings, SettingsConfigDict except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. " "Install demo extras via `pip install -e .[demo]` or `pip install -r examples/demo_qa/requirements.txt`." 
) from exc - -class TomlConfigSettingsSource(PydanticBaseSettingsSource): - def __init__(self, settings_cls: type[BaseSettings], path: Path | None): - super().__init__(settings_cls) - self._path = path - - def __call__(self) -> Dict[str, Any]: - if not self._path: - return {} - try: - with self._path.open("rb") as toml_file: - return tomllib.load(toml_file) - except FileNotFoundError: - return {} - - class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) @@ -98,7 +82,7 @@ def settings_customise_sources( ): sources = [init_settings, env_settings, dotenv_settings] if cls._toml_path: - sources.append(TomlConfigSettingsSource(settings_cls, cls._toml_path)) + sources.append(TomlConfigSettingsSource(settings_cls, toml_file=cls._toml_path)) sources.append(file_secret_settings) return tuple(sources) diff --git a/tests/test_demo_qa_settings.py b/tests/test_demo_qa_settings.py index 6b25e4f..16a1f30 100644 --- a/tests/test_demo_qa_settings.py +++ b/tests/test_demo_qa_settings.py @@ -6,10 +6,6 @@ import pytest -ROOT = Path(__file__).resolve().parents[1] -if str(ROOT) not in sys.path: - sys.path.insert(0, str(ROOT)) - from examples.demo_qa.llm.factory import build_llm from examples.demo_qa.llm.openai_adapter import OpenAILLM from examples.demo_qa.settings import load_settings diff --git a/tests/test_demo_qa_settings_sources.py b/tests/test_demo_qa_settings_sources.py index 97fef3a..f828329 100644 --- a/tests/test_demo_qa_settings_sources.py +++ b/tests/test_demo_qa_settings_sources.py @@ -4,7 +4,7 @@ import pytest -from examples.demo_qa.settings import DemoQASettings, load_settings, resolve_config_path +from examples.demo_qa.settings import load_settings, resolve_config_path def write_toml(path: Path, content: str) -> None: From 66c6dabeecfe8874dfcbca8790d25ebcc6989b56 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 10:26:54 +0300 Subject: [PATCH 38/92] =?UTF-8?q?=D0=BE=D0=B1=D0=BD=D0=BE=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D1=8B=20=D1=82=D1=80=D0=B5=D0=B1=D0=BE=D0=B2=D0=B0?= =?UTF-8?q?=D0=BD=D0=B8=D0=BD=D1=8F=20=D0=BA=20pydantic=20settings=20?= =?UTF-8?q?=D0=B8=D0=B7-=D0=B7=D0=B0=20toml=20=D0=B2=20demo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a401b48..92cf687 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,9 @@ dev = [ "python-dotenv>=1.0", ] demo = [ - "pydantic-settings>=2.2", + "pydantic-settings>=2.12", "python-dotenv>=1.0", + "openai" ] [project.urls] From 34d2c56335148a04e9c949f6abfa28d5cd0020e5 Mon Sep 17 00:00:00 2001 From: Alex Date: Sun, 21 Dec 2025 10:38:23 +0300 Subject: [PATCH 39/92] =?UTF-8?q?=D1=84=D0=B8=D0=BA=D1=81=20=D0=BF=D1=83?= =?UTF-8?q?=D1=82=D0=B5=D0=B9=20=D0=B8=D0=BC=D0=BF=D0=BE=D1=80=D1=82=D0=B0?= =?UTF-8?q?=20=D0=BD=D0=B0=20=D0=BF=D1=83=D0=B1=D0=BB=D0=B8=D1=87=D0=BD?= =?UTF-8?q?=D1=8B=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/settings.py | 2 +- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index a7ea9fd..064e179 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -6,10 +6,10 @@ from urllib.parse import urlparse from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator -from 
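
With the shim removed, the settings module plugs the library's `TomlConfigSettingsSource` into `settings_customise_sources`. The sketch below shows that wiring under stated assumptions: the env prefix, field names, and `demo_qa.toml` path are illustrative, and it presumes a pydantic-settings release that ships `TomlConfigSettingsSource` with a `toml_file` argument (the demo pins >= 2.12).

```python
from pathlib import Path

from pydantic import BaseModel, Field
from pydantic_settings import (
    BaseSettings,
    PydanticBaseSettingsSource,
    SettingsConfigDict,
    TomlConfigSettingsSource,
)


class LLMSection(BaseModel):
    base_url: str | None = None
    plan_model: str | None = None


class DemoSettings(BaseSettings):
    # env prefix and nested delimiter are illustrative assumptions
    model_config = SettingsConfigDict(env_prefix="DEMO_QA_", env_nested_delimiter="__")

    llm: LLMSection = Field(default_factory=LLMSection)

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        # Earlier sources win, so environment variables still override TOML values.
        return (
            init_settings,
            env_settings,
            dotenv_settings,
            TomlConfigSettingsSource(settings_cls, toml_file=Path("demo_qa.toml")),
            file_secret_settings,
        )
```
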
pydantic_settings.sources.providers.toml import TomlConfigSettingsSource try: from pydantic_settings import BaseSettings, SettingsConfigDict + from pydantic_settings.sources import TomlConfigSettingsSource except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. " diff --git a/pyproject.toml b/pyproject.toml index 92cf687..710ebde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,8 @@ dev = [ "pre-commit", "pydantic-settings>=2.2", "python-dotenv>=1.0", + "ruff", + "pyright" ] demo = [ "pydantic-settings>=2.12", From d9184f13f314179cdff6def3c12f093f1397fa14 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 10:54:21 +0300 Subject: [PATCH 40/92] Handle missing cases in demo QA compare --- examples/demo_qa/runner.py | 23 ++++++++++++++++------- tests/test_demo_qa_runner.py | 27 +++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 90ca626..5f32f47 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -596,20 +596,20 @@ def diff_runs( ) -> DiffReport: base_by_id = {res.id: res for res in base_results} new_by_id = {res.id: res for res in new_results} - all_ids = sorted(new_by_id.keys()) + all_ids = sorted(set(base_by_id.keys()) | set(new_by_id.keys())) bad = bad_statuses(fail_on, require_assert) def _is_bad(res: RunResult | None) -> bool: return bool(res and res.status in bad) - def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict[str, object]: + def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult | None) -> dict[str, object]: return { "id": case_id, "from": base_res.status if base_res else None, - "to": new_res.status, - "reason": _reason(new_res), - "artifacts": _artifact_links(new_res), + "to": new_res.status if new_res else "missing", + "reason": _reason(new_res) if new_res else "missing in new results", + "artifacts": _artifact_links(new_res) if new_res else {}, } new_fail: list[dict[str, object]] = [] @@ -619,21 +619,30 @@ def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult) -> dict new_cases: list[str] = [] for case_id in all_ids: - new_res = new_by_id[case_id] + new_res = new_by_id.get(case_id) base_res = base_by_id.get(case_id) base_bad = _is_bad(base_res) - new_bad = _is_bad(new_res) + new_bad = True if new_res is None else _is_bad(new_res) if base_res is None: new_cases.append(case_id) if new_bad: new_fail.append(_entry(case_id, base_res, new_res)) + elif new_res is None: + changed_status.append({"id": case_id, "from": base_res.status, "to": "missing"}) else: if base_res.status != new_res.status: changed_status.append({"id": case_id, "from": base_res.status, "to": new_res.status}) if base_res is None: continue + if new_res is None: + entry = _entry(case_id, base_res, new_res) + if base_bad: + still_fail.append(entry) + else: + new_fail.append(entry) + continue if not base_bad and new_bad: new_fail.append(_entry(case_id, base_res, new_res)) diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 745a477..31773cd 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -61,6 +61,28 @@ def test_diff_runs_tracks_regressions_and_improvements() -> None: duration_ms=10, tags=[], ), + RunResult( + id="missing_ok", + question="", + status="ok", + checked=True, + 
reason=None, + details=None, + artifacts_dir="/tmp/miss-ok", + duration_ms=10, + tags=[], + ), + RunResult( + id="missing_bad", + question="", + status="failed", + checked=True, + reason=None, + details=None, + artifacts_dir="/tmp/miss-bad", + duration_ms=10, + tags=[], + ), ] current = [ @@ -123,9 +145,10 @@ def test_diff_runs_tracks_regressions_and_improvements() -> None: diff = diff_runs(baseline, current, fail_on="bad", require_assert=True) - assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad", "new_bad"} + assert {row["id"] for row in diff["new_fail"]} == {"ok_to_bad", "new_bad", "missing_ok"} assert {row["id"] for row in diff["fixed"]} == {"err_to_ok"} - assert {row["id"] for row in diff["still_fail"]} == {"still_bad"} + assert {row["id"] for row in diff["still_fail"]} == {"still_bad", "missing_bad"} + assert {"missing_ok", "missing_bad"} <= {row["id"] for row in diff["changed_status"]} assert diff["new_cases"] == ["new_bad", "new_ok"] From d76422a760775b8719b03e627d398e88fb2ac0eb Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 20 Dec 2025 16:40:07 +0300 Subject: [PATCH 41/92] =?UTF-8?q?=D1=82=D0=B5=D1=81=D1=82=D0=BE=D0=B2?= =?UTF-8?q?=D1=8B=D0=B5=20=D0=B4=D0=B0=D0=BD=D0=BD=D1=8B=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/cases/retail_cases.json | 200 +++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 examples/demo_qa/cases/retail_cases.json diff --git a/examples/demo_qa/cases/retail_cases.json b/examples/demo_qa/cases/retail_cases.json new file mode 100644 index 0000000..fb157a3 --- /dev/null +++ b/examples/demo_qa/cases/retail_cases.json @@ -0,0 +1,200 @@ +{"id": "agg_001", "question": "Сколько всего заказов (orders) в датасете? Ответь только числом.", "expected_regex": "(? 3)? Ответь только числом.", "expected_regex": "(? sum order_total) за весь период.", "tags": ["qa", "narrative"]} +{"id": "qa_010", "question": "Построй помесячную динамику количества заказов за весь период; отдельно для статусов delivered и cancelled.", "tags": ["qa", "narrative"]} +{"id": "qa_011", "question": "Сколько мы 'отгрузили' (status='shipped') в каждом городе за 2023 год? Сортируй по убыванию количества.", "tags": ["qa", "narrative"]} +{"id": "qa_012", "question": "Сколько уникальных клиентов (distinct customer_id) сделали заказы в каждом городе?", "tags": ["qa", "narrative"]} +{"id": "qa_013", "question": "Найди клиентов без единого заказа. Сколько их и приведи первые 20 customer_id + city + segment.", "tags": ["qa", "narrative"]} +{"id": "qa_014", "question": "Найди товары с нулевым остатком (in_stock=0). Сколько их и приведи первые 20 (product_id, name, category, price).", "tags": ["qa", "narrative"]} +{"id": "qa_015", "question": "Есть ли заказы, где сумма order_total не равна сумме line_total по order_items? 
Если есть — перечисли их (order_id, order_total, sum_line_total).", "tags": ["qa", "narrative"]} +{"id": "qa_016", "question": "Для клиента 882: покажи разбивку его выручки по категориям (category -> revenue) и укажи топ-3 категории.", "tags": ["qa", "narrative"]} +{"id": "qa_017", "question": "Для клиента 914: найди его самый дорогой заказ и перечисли товары в этом заказе (product_id, name, category, quantity, unit_price, line_total).", "tags": ["qa", "narrative"]} +{"id": "qa_018", "question": "Для заказа 204: перечисли товары (product_id, name, category) и суммы по ним; проверь, что сумма равна order_total.", "tags": ["qa", "narrative"]} +{"id": "qa_019", "question": "Сколько заказов было в 2022 году в городе 'San Diego', и как они распределены по каналам?", "tags": ["qa", "narrative"]} +{"id": "qa_020", "question": "Какая средняя сумма заказа (avg order_total) по каждому сегменту клиента (consumer/corporate/home_office)?", "tags": ["qa", "narrative"]} +{"id": "qa_021", "question": "Сравни средний чек по городам: выведи город, число заказов, avg order_total, median order_total.", "tags": ["qa", "narrative"]} +{"id": "qa_022", "question": "Найди 20 клиентов с самым высоким числом заказов; выведи customer_id, name, city, segment, orders_count, total_spend.", "tags": ["qa", "narrative"]} +{"id": "qa_023", "question": "Какие категории чаще всего покупают в канале 'phone'? Дай top-5 категорий по выручке в этом канале.", "tags": ["qa", "narrative"]} +{"id": "qa_024", "question": "Для категории 'books': выведи 10 самых продаваемых товаров по количеству (product_id, name, units_sold).", "tags": ["qa", "narrative"]} +{"id": "qa_025", "question": "Сформируй короткий executive summary по бизнесу: общий оборот, топ-3 категории, топ-3 города по выручке, доля отменённых заказов, и 3 инсайта по данным.", "tags": ["qa", "narrative"]} From a3d9d121c7984154f69ea30a8eb447067719e78d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 11:16:32 +0300 Subject: [PATCH 42/92] Fix REPL error artifacts to use run-specific directories --- examples/demo_qa/chat_repl.py | 5 +++-- examples/demo_qa/runner.py | 9 +++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 0990ee3..2ebd14c 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -99,12 +99,13 @@ def start_repl( continue run_id = uuid.uuid4().hex[:8] + run_dir = runs_root / f"{run_id}_{uuid.uuid4().hex[:8]}" event_logger = EventLogger(path=None, run_id=run_id) artifacts: RunArtifacts | None = None try: case = Case(id=run_id, question=line, tags=[]) - result = run_one(case, runner, runs_root, plan_only=False, event_logger=event_logger) + result = run_one(case, runner, runs_root, plan_only=False, event_logger=event_logger, run_dir=run_dir) plan_obj = _load_json(Path(result.artifacts_dir) / "plan.json") ctx_obj = _load_json(Path(result.artifacts_dir) / "context.json") or {} artifacts = RunArtifacts( @@ -124,7 +125,7 @@ def start_repl( print(result.answer or "") print(f"Events: {Path(result.artifacts_dir) / 'events.jsonl'}") except Exception as exc: # pragma: no cover - REPL resilience - error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=runs_root, question=line) + error_artifacts = artifacts or RunArtifacts(run_id=run_id, run_dir=run_dir, question=line) error_artifacts.error = error_artifacts.error or str(exc) last_artifacts = error_artifacts 
save_artifacts(error_artifacts)
diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py
index 5f32f47..f0575dd 100644
--- a/examples/demo_qa/runner.py
+++ b/examples/demo_qa/runner.py
@@ -273,9 +273,14 @@ def run_one(
     *,
     plan_only: bool = False,
     event_logger: EventLogger | None = None,
+    run_dir: Path | None = None,
 ) -> RunResult:
-    run_id = uuid.uuid4().hex[:8]
-    run_dir = artifacts_root / f"{case.id}_{run_id}"
+    if run_dir is None:
+        run_id = uuid.uuid4().hex[:8]
+        run_dir = artifacts_root / f"{case.id}_{run_id}"
+    else:
+        run_id = run_dir.name.split("_")[-1]
+
     case_logger = event_logger.for_case(case.id, run_dir / "events.jsonl") if event_logger else None
     if case_logger:
         case_logger.emit({"type": "case_started", "case_id": case.id, "run_dir": str(run_dir)})

From 34675c13aba8128857f927c3546a6e1e83814f46 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 21 Dec 2025 22:25:17 +0300
Subject: [PATCH 43/92] Exclude logs from repository tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a4f7611..2ca3c0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ build/
 examples/demo_qa/demo_qa.toml
 **/demo_qa.toml
 .env.demo_qa
+_demo_data/*/.runs/*

From 9deaed739551cbf0146b4ff753bcf428cba4d84a Mon Sep 17 00:00:00 2001
From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com>
Date: Sun, 21 Dec 2025 23:06:45 +0300
Subject: [PATCH 44/92] Handle partial batch runs and missed case selection

---
 examples/demo_qa/batch.py   | 191 ++++++++++++++++++++++++++++++------
 examples/demo_qa/cli.py     |   7 ++
 tests/test_demo_qa_batch.py |  60 ++++++++++-
 3 files changed, 228 insertions(+), 30 deletions(-)

diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py
index 7abdeb4..7ae8337 100644
--- a/examples/demo_qa/batch.py
+++ b/examples/demo_qa/batch.py
@@ -114,8 +114,21 @@ def _git_sha() -> Optional[str]:
     return result.stdout.strip() or None


-def _load_latest_run(artifacts_dir: Path) -> Optional[Path]:
-    latest_file = artifacts_dir / "runs" / "latest.txt"
+def _sanitize_tag(tag: str) -> str:
+    cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "_" for ch in tag)
+    return cleaned or "tag"
+
+
+def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]:
+    runs_dir = artifacts_dir / "runs"
+    if tag:
+        slug = _sanitize_tag(tag)
+        return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt"
+    return runs_dir / "latest.txt", runs_dir / "latest_results.txt"
+
+
+def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]:
+    latest_file, _ = _latest_markers(artifacts_dir, tag)
     if latest_file.exists():
         content =
latest_file.read_text(encoding="utf-8").strip() if content: return Path(content) - latest_run = _load_latest_run(artifacts_dir) + latest_run = _load_latest_run(artifacts_dir, tag) if latest_run: summary_path = latest_run / "summary.json" if summary_path.exists(): @@ -143,6 +156,39 @@ def _load_latest_results(artifacts_dir: Path) -> Optional[Path]: return None +def _load_run_meta(run_path: Path | None) -> Optional[dict]: + if run_path is None: + return None + meta_path = run_path / "run_meta.json" + if not meta_path.exists(): + return None + try: + return json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return None + + +def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: + planned_set = set(planned_case_ids) + if not executed_results: + return planned_set + try: + executed_ids = set(executed_results.keys()) + except Exception: + executed_ids = set() + return planned_set - executed_ids + + +def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: + marker_pairs = {_latest_markers(artifacts_dir, None)} + if tag: + marker_pairs.add(_latest_markers(artifacts_dir, tag)) + for latest_path, latest_results_path in marker_pairs: + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + latest_results_path.write_text(str(results_path), encoding="utf-8") + + def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: cases_dir = run_path / "cases" if not cases_dir.exists(): @@ -336,6 +382,8 @@ def _select_cases_for_rerun( def handle_batch(args) -> int: started_at = datetime.datetime.utcnow() run_id = uuid.uuid4().hex[:8] + interrupted = False + interrupted_at_case_id: str | None = None try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -355,13 +403,18 @@ def handle_batch(args) -> int: if artifacts_dir is None: artifacts_dir = args.data / ".runs" + include_tags = _split_csv(args.include_tags) + exclude_tags = _split_csv(args.exclude_tags) + include_ids = _load_ids(args.include_ids) + exclude_ids = _load_ids(args.exclude_ids) + baseline_filter_path = args.only_failed_from if args.only_failed and not baseline_filter_path: - latest_results = _load_latest_results(artifacts_dir) + latest_results = _load_latest_results(artifacts_dir, args.tag) if latest_results: baseline_filter_path = latest_results else: - latest_run = _load_latest_run(artifacts_dir) + latest_run = _load_latest_run(artifacts_dir, args.tag) if latest_run: candidate = latest_run / "results.jsonl" if candidate.exists(): @@ -391,12 +444,47 @@ def handle_batch(args) -> int: baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on, - include_tags=_split_csv(args.include_tags), - exclude_tags=_split_csv(args.exclude_tags), - include_ids=_load_ids(args.include_ids), - exclude_ids=_load_ids(args.exclude_ids), + include_tags=include_tags, + exclude_tags=exclude_tags, + include_ids=include_ids, + exclude_ids=exclude_ids, ) + baseline_planned_ids: set[str] | None = None + missed_baseline_results: Optional[Mapping[str, RunResult]] = None + missed_baseline_path: Path | None = None + missed_baseline_run: Path | None = None + if args.only_missed: + missed_baseline_path = _load_latest_results(artifacts_dir, args.tag) + missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) + if missed_baseline_path: + try: + missed_baseline_results = load_results(missed_baseline_path) + except Exception as 
exc: + print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) + return 2 + else: + print("No baseline results found for --only-missed; running all filtered cases.", file=sys.stderr) + baseline_meta = _load_run_meta(missed_baseline_run) + if isinstance(baseline_meta, dict): + planned_from_meta = baseline_meta.get("planned_case_ids") + if isinstance(planned_from_meta, list): + baseline_planned_ids = {str(cid) for cid in planned_from_meta} + else: + try: + planned_total_meta = int(baseline_meta.get("planned_total", 0)) + except Exception: + planned_total_meta = 0 + if planned_total_meta: + baseline_planned_ids = {case.id for case in cases} + + planned_case_ids = [case.id for case in cases] + if args.only_missed: + planned_pool = baseline_planned_ids or set(planned_case_ids) + missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) + cases = [case for case in cases if case.id in missed_ids] + planned_case_ids = [case.id for case in cases] + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" results_path = args.out or (run_folder / "results.jsonl") @@ -427,15 +515,22 @@ def handle_batch(args) -> int: results: list[RunResult] = [] failures = 0 - for case in cases: - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) - results.append(result) - if not args.quiet: - print(format_status_line(result)) - if is_failure(result.status, args.fail_on, args.require_assert): - failures += 1 - if args.fail_fast or (args.max_fails and failures >= args.max_fails): - break + current_case_id: str | None = None + try: + for case in cases: + current_case_id = case.id + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) + results.append(result) + if not args.quiet: + print(format_status_line(result)) + if is_failure(result.status, args.fail_on, args.require_assert): + failures += 1 + if args.fail_fast or (args.max_fails and failures >= args.max_fails): + break + except KeyboardInterrupt: + interrupted = True + interrupted_at_case_id = current_case_id + print("Interrupted; finalizing partial results...", file=sys.stderr) write_results(results_path, results) counts = summarize(results) @@ -456,10 +551,14 @@ def handle_batch(args) -> int: policy_bad = bad_statuses(args.fail_on, args.require_assert) bad_count = sum(int(counts.get(status, 0) or 0) for status in policy_bad) - exit_code = 1 if bad_count else 0 + exit_code = 130 if interrupted else (1 if bad_count else 0) ended_at = datetime.datetime.utcnow() duration_ms = int((ended_at - started_at).total_seconds() * 1000) + executed_results = {res.id: res for res in results} + planned_total = len(planned_case_ids) + executed_total = len(results) + missed_total = len(_missed_case_ids(planned_case_ids, executed_results)) summary = { "run_id": run_id, "started_at": started_at.isoformat() + "Z", @@ -471,6 +570,13 @@ def handle_batch(args) -> int: "results_path": str(results_path), "require_assert": args.require_assert, "fail_on": args.fail_on, + "planned_total": planned_total, + "executed_total": executed_total, + "missed_total": missed_total, + "interrupted": interrupted, + "interrupted_at_case_id": interrupted_at_case_id, + "tag": args.tag, + "note": args.note, } if diff_block: summary["diff"] = diff_block @@ -490,14 +596,14 @@ def handle_batch(args) -> int: "duration_ms": duration_ms, "run_dir": str(run_folder), "results_path": str(results_path), + "interrupted": 
interrupted, + "planned_total": planned_total, + "executed_total": executed_total, + "missed_total": missed_total, } ) - latest_path = run_folder.parent / "latest.txt" - latest_results_path = run_folder.parent / "latest_results.txt" - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - latest_results_path.write_text(str(results_path), encoding="utf-8") + _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag) config_hash = _hash_file(args.config) if args.config else None schema_hash = _hash_file(args.schema) @@ -507,6 +613,8 @@ def handle_batch(args) -> int: run_meta = { "run_id": run_id, "timestamp": started_at.isoformat() + "Z", + "tag": args.tag, + "note": args.note, "inputs": { "cases_path": str(args.cases), "cases_hash": cases_hash, @@ -516,6 +624,23 @@ def handle_batch(args) -> int: "schema_hash": schema_hash, "data_dir": str(args.data), }, + "planned_case_ids": planned_case_ids, + "planned_total": planned_total, + "selected_filters": { + "include_tags": sorted(include_tags) if include_tags else None, + "exclude_tags": sorted(exclude_tags) if exclude_tags else None, + "include_ids_path": str(args.include_ids) if args.include_ids else None, + "exclude_ids_path": str(args.exclude_ids) if args.exclude_ids else None, + "only_failed": bool(args.only_failed or args.only_failed_from), + "only_failed_from": str(baseline_filter_path) if baseline_filter_path else None, + "only_missed": args.only_missed, + "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, + "plan_only": args.plan_only, + "fail_fast": args.fail_fast, + "max_fails": args.max_fails, + }, + "interrupted": interrupted, + "interrupted_at_case_id": interrupted_at_case_id, "data_fingerprint": data_fingerprint, "llm": { "plan_model": llm_settings.plan_model, @@ -539,6 +664,8 @@ def handle_batch(args) -> int: "config_hash": config_hash, "schema_hash": schema_hash, "cases_hash": cases_hash, + "tag": args.tag, + "note": args.note, "ok": counts.get("ok", 0), "mismatch": counts.get("mismatch", 0), "error": counts.get("error", 0), @@ -554,6 +681,11 @@ def handle_batch(args) -> int: "fail_on": args.fail_on, "require_assert": args.require_assert, "fail_count": bad_count, + "planned_total": planned_total, + "executed_total": executed_total, + "missed_total": missed_total, + "interrupted": interrupted, + "interrupted_at_case_id": interrupted_at_case_id, } history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: @@ -562,9 +694,10 @@ def handle_batch(args) -> int: unchecked = counts.get("unchecked", 0) plan_only = counts.get("plan_only", 0) summary_line = ( - f"Batch: {counts.get('total', 0)} cases | Checked: {counts.get('checked_total', 0)} | " - f"Checked OK: {counts.get('checked_ok', 0)} | Unchecked(no-assert): {unchecked} | " - f"Plan-only: {plan_only} | FAIL(policy): {bad_count} | Skipped: {counts.get('skipped', 0)}" + f"Batch: planned {planned_total}, executed {executed_total}, missed {missed_total} | " + f"Checked: {counts.get('checked_total', 0)} | Checked OK: {counts.get('checked_ok', 0)} | " + f"Unchecked(no-assert): {unchecked} | Plan-only: {plan_only} | " + f"FAIL(policy): {bad_count} | Skipped: {counts.get('skipped', 0)}" ) if args.quiet: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index dbb279d..16127b6 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -52,6 +52,13 @@ def build_parser() -> argparse.ArgumentParser: 
batch_p.add_argument("--schema", type=Path, required=True) batch_p.add_argument("--config", type=Path, default=None, help="Path to demo_qa.toml") batch_p.add_argument("--cases", type=Path, required=True, help="Path to cases jsonl") + batch_p.add_argument("--tag", type=str, default=None, help="Label this run and use tag-specific latest pointers") + batch_p.add_argument("--note", type=str, default=None, help="Free-form note to attach to the run metadata") + batch_p.add_argument( + "--only-missed", + action="store_true", + help="Run only cases missing in the latest (or tag-latest) effective results", + ) batch_p.add_argument("--out", type=Path, required=False, default=None, help="Path to results jsonl") batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") batch_p.add_argument("--enable-semantic", action="store_true") diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 40ee32d..16d7cb5 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -3,12 +3,46 @@ import itertools import json import os +import sys import time +import types from pathlib import Path import pytest +from pydantic import BaseModel -from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +if "pydantic_settings" not in sys.modules: + stub = types.ModuleType("pydantic_settings") + + class BaseSettings(BaseModel): + model_config = {} + + def SettingsConfigDict(**kwargs): + return kwargs + + stub.BaseSettings = BaseSettings + stub.SettingsConfigDict = SettingsConfigDict + + sources_mod = types.ModuleType("pydantic_settings.sources") + + def TomlConfigSettingsSource(settings_cls, toml_file): + return {} + + sources_mod.TomlConfigSettingsSource = TomlConfigSettingsSource + stub.sources = sources_mod + sys.modules["pydantic_settings"] = stub + sys.modules["pydantic_settings.sources"] = sources_mod + +from examples.demo_qa.batch import ( + _fingerprint_dir, + _latest_markers, + _missed_case_ids, + _update_latest_markers, + bad_statuses, + is_failure, + render_markdown, + write_results, +) from examples.demo_qa.runner import RunResult, diff_runs @@ -92,3 +126,27 @@ def test_write_results_is_deterministic(tmp_path: Path) -> None: line = out.read_text(encoding="utf-8").strip() expected = json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) assert line == expected + + +def test_missed_case_ids_diff_planned_and_executed() -> None: + planned = ["a", "b", "c", "a"] + executed = {_mk_result("b", "ok").id: _mk_result("b", "ok")} + assert _missed_case_ids(planned, executed) == {"a", "c"} + + +def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: + artifacts_dir = tmp_path / "data" / ".runs" + run_dir = artifacts_dir / "runs" / "20240101_cases" + results_path = run_dir / "results.jsonl" + run_dir.mkdir(parents=True) + results_path.write_text("{}", encoding="utf-8") + + _update_latest_markers(run_dir, results_path, artifacts_dir, "feature/beta") + + latest_default, latest_results_default = _latest_markers(artifacts_dir, None) + assert latest_default.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_results_default.read_text(encoding="utf-8").strip() == str(results_path) + + latest_tag, latest_results_tag = _latest_markers(artifacts_dir, "feature/beta") + assert latest_tag.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_results_tag.read_text(encoding="utf-8").strip() == str(results_path) From 
9eb71782ac1308b26aec126d6f0032ffd171ebb7 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:14:28 +0300 Subject: [PATCH 45/92] Improve only-missed baseline resolution and interrupt handling --- examples/demo_qa/batch.py | 61 ++++++++++++++++++++++++++++++++----- tests/test_demo_qa_batch.py | 25 --------------- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 7ae8337..d903f72 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -16,6 +16,7 @@ Case, EventLogger, RunResult, + RunTimings, bad_statuses, build_agent, diff_runs, @@ -24,6 +25,7 @@ load_cases, load_results, run_one, + save_status, summarize, ) from .settings import load_settings @@ -168,6 +170,22 @@ def _load_run_meta(run_path: Path | None) -> Optional[dict]: return None +def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: + if results_path is None: + return None + run_dir = results_path.parent + summary_path = run_dir / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + run_dir_from_summary = summary.get("run_dir") + if run_dir_from_summary: + return Path(run_dir_from_summary) + except Exception: + pass + return run_dir + + def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: planned_set = set(planned_case_ids) if not executed_results: @@ -456,7 +474,9 @@ def handle_batch(args) -> int: missed_baseline_run: Path | None = None if args.only_missed: missed_baseline_path = _load_latest_results(artifacts_dir, args.tag) - missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) + missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) + if missed_baseline_run is None: + missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) if missed_baseline_path: try: missed_baseline_results = load_results(missed_baseline_path) @@ -471,12 +491,11 @@ def handle_batch(args) -> int: if isinstance(planned_from_meta, list): baseline_planned_ids = {str(cid) for cid in planned_from_meta} else: - try: - planned_total_meta = int(baseline_meta.get("planned_total", 0)) - except Exception: - planned_total_meta = 0 - if planned_total_meta: - baseline_planned_ids = {case.id for case in cases} + print( + "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", + file=sys.stderr, + ) + baseline_planned_ids = {case.id for case in cases} planned_case_ids = [case.id for case in cases] if args.only_missed: @@ -519,7 +538,33 @@ def handle_batch(args) -> int: try: for case in cases: current_case_id = case.id - result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) + try: + result = run_one(case, runner, artifacts_root, plan_only=args.plan_only, event_logger=event_logger) + except KeyboardInterrupt: + interrupted = True + interrupted_at_case_id = current_case_id + run_dir = artifacts_root / f"{case.id}_{uuid.uuid4().hex[:8]}" + run_dir.mkdir(parents=True, exist_ok=True) + stub = RunResult( + id=case.id, + question=case.question, + status="error", + checked=case.has_asserts, + reason="KeyboardInterrupt", + details={"error": "KeyboardInterrupt"}, + artifacts_dir=str(run_dir), + duration_ms=0, + tags=list(case.tags), + answer=None, + error="KeyboardInterrupt", + plan_path=None, + timings=RunTimings(), + expected_check=None, + ) + 
save_status(stub) + results.append(stub) + print("Interrupted during case execution; saved partial status.", file=sys.stderr) + break results.append(result) if not args.quiet: print(format_status_line(result)) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 16d7cb5..e9d92fa 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -3,35 +3,10 @@ import itertools import json import os -import sys import time -import types from pathlib import Path import pytest -from pydantic import BaseModel - -if "pydantic_settings" not in sys.modules: - stub = types.ModuleType("pydantic_settings") - - class BaseSettings(BaseModel): - model_config = {} - - def SettingsConfigDict(**kwargs): - return kwargs - - stub.BaseSettings = BaseSettings - stub.SettingsConfigDict = SettingsConfigDict - - sources_mod = types.ModuleType("pydantic_settings.sources") - - def TomlConfigSettingsSource(settings_cls, toml_file): - return {} - - sources_mod.TomlConfigSettingsSource = TomlConfigSettingsSource - stub.sources = sources_mod - sys.modules["pydantic_settings"] = stub - sys.modules["pydantic_settings.sources"] = sources_mod from examples.demo_qa.batch import ( _fingerprint_dir, From bfff2f6c9f84866a875b855c49976a6896097e97 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:17:31 +0300 Subject: [PATCH 46/92] Add only-missed from path and baseline metadata --- examples/demo_qa/batch.py | 17 ++++++++++++++++- examples/demo_qa/cli.py | 6 ++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index d903f72..75a2f40 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -427,16 +427,21 @@ def handle_batch(args) -> int: exclude_ids = _load_ids(args.exclude_ids) baseline_filter_path = args.only_failed_from + only_failed_baseline_kind: str | None = None if args.only_failed and not baseline_filter_path: latest_results = _load_latest_results(artifacts_dir, args.tag) if latest_results: baseline_filter_path = latest_results + only_failed_baseline_kind = "latest" else: latest_run = _load_latest_run(artifacts_dir, args.tag) if latest_run: candidate = latest_run / "results.jsonl" if candidate.exists(): baseline_filter_path = candidate + only_failed_baseline_kind = "latest" + if args.only_failed_from: + only_failed_baseline_kind = "path" if baseline_filter_path: try: baseline_for_filter = load_results(baseline_filter_path) @@ -472,8 +477,13 @@ def handle_batch(args) -> int: missed_baseline_results: Optional[Mapping[str, RunResult]] = None missed_baseline_path: Path | None = None missed_baseline_run: Path | None = None + only_missed_baseline_kind: str | None = None if args.only_missed: - missed_baseline_path = _load_latest_results(artifacts_dir, args.tag) + missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) + if args.only_missed_from: + only_missed_baseline_kind = "path" + elif missed_baseline_path: + only_missed_baseline_kind = "latest" missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) if missed_baseline_run is None: missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) @@ -503,6 +513,8 @@ def handle_batch(args) -> int: missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) cases = [case for case in cases if case.id in missed_ids] planned_case_ids = [case.id for case in cases] + if not cases: + print("0 missed cases selected.", 
file=sys.stderr) timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" @@ -678,8 +690,11 @@ def handle_batch(args) -> int: "exclude_ids_path": str(args.exclude_ids) if args.exclude_ids else None, "only_failed": bool(args.only_failed or args.only_failed_from), "only_failed_from": str(baseline_filter_path) if baseline_filter_path else None, + "only_failed_baseline_kind": only_failed_baseline_kind, "only_missed": args.only_missed, "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, + "only_missed_baseline_kind": only_missed_baseline_kind, + "baseline_tag": args.tag, "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 16127b6..e4048bc 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -59,6 +59,12 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Run only cases missing in the latest (or tag-latest) effective results", ) + batch_p.add_argument( + "--only-missed-from", + type=Path, + default=None, + help="Run only cases missing in the provided results.jsonl (or latest if omitted)", + ) batch_p.add_argument("--out", type=Path, required=False, default=None, help="Path to results jsonl") batch_p.add_argument("--artifacts-dir", type=Path, default=None, help="Where to store per-case artifacts") batch_p.add_argument("--enable-semantic", action="store_true") From 5a5f1ee3643599dd93f574a7ea803d416d47311d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:26:23 +0300 Subject: [PATCH 47/92] Add effective tag baselines and baseline metadata --- examples/demo_qa/batch.py | 195 +++++++++++++++++++++++++++++++++----- examples/demo_qa/cli.py | 2 +- 2 files changed, 174 insertions(+), 23 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 75a2f40..7bc0302 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -121,6 +121,11 @@ def _sanitize_tag(tag: str) -> str: return cleaned or "tag" +def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: + base = artifacts_dir / "runs" / "tags" / _sanitize_tag(tag) + return base / "effective_results.jsonl", base / "effective_meta.json" + + def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: runs_dir = artifacts_dir / "runs" if tag: @@ -207,6 +212,79 @@ def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: latest_results_path.write_text(str(results_path), encoding="utf-8") +def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: + results_path, meta_path = _effective_paths(artifacts_dir, tag) + meta: Optional[dict] = None + results: dict[str, RunResult] = {} + if results_path.exists(): + results = load_results(results_path) + if meta_path.exists(): + try: + meta = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + meta = None + return results, meta, results_path + + +def _write_effective_results(results_path: Path, results: Mapping[str, RunResult]) -> None: + results_path.parent.mkdir(parents=True, exist_ok=True) + ordered = [results[cid] for cid in sorted(results)] + write_results(results_path, ordered) + + +def _update_effective_snapshot( + *, + artifacts_dir: Path, + tag: str, + cases_hash: str, + cases_path: Path, + planned_case_ids: 
list[str], + executed_results: list[RunResult], + run_folder: Path, + planned_case_ids_source: list[str] | None, +) -> tuple[Path, Path]: + effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) + if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." + ) + + planned_pool: set[str] + if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): + planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} + elif planned_case_ids_source: + planned_pool = set(planned_case_ids_source) + else: + planned_pool = set(planned_case_ids) + + for res in executed_results: + effective_results[res.id] = res + _write_effective_results(effective_results_path, effective_results) + + summary_counts = summarize(effective_results.values()) + executed_total = len(effective_results) + missed_total = len(_missed_case_ids(planned_pool, effective_results)) + meta_path = effective_results_path.with_name("effective_meta.json") + built_from = set(effective_meta.get("built_from_runs", [])) if effective_meta else set() + built_from.add(str(run_folder)) + effective_meta_payload = { + "tag": tag, + "cases_hash": cases_hash, + "cases_path": str(cases_path), + "planned_case_ids": sorted(planned_pool), + "planned_total": len(planned_pool), + "executed_total": executed_total, + "missed_total": missed_total, + "counts": summary_counts, + "updated_at": datetime.datetime.utcnow().isoformat() + "Z", + "built_from_runs": sorted(built_from), + "effective_results_path": str(effective_results_path), + } + meta_path.parent.mkdir(parents=True, exist_ok=True) + dump_json(meta_path, effective_meta_payload) + return effective_results_path, meta_path + + def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: cases_dir = run_path / "cases" if not cases_dir.exists(): @@ -402,6 +480,7 @@ def handle_batch(args) -> int: run_id = uuid.uuid4().hex[:8] interrupted = False interrupted_at_case_id: str | None = None + cases_hash = _hash_file(args.cases) try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -428,7 +507,25 @@ def handle_batch(args) -> int: baseline_filter_path = args.only_failed_from only_failed_baseline_kind: str | None = None - if args.only_failed and not baseline_filter_path: + effective_results_path: Path | None = None + if args.only_failed_from: + only_failed_baseline_kind = "path" + elif args.tag and args.only_failed: + effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) + if not effective_results: + print(f"No effective results found for tag {args.tag!r}; run a tagged batch first.", file=sys.stderr) + return 2 + if effective_meta and effective_meta.get("cases_hash") not in (None, cases_hash): + print( + f"Effective results cases_hash {effective_meta.get('cases_hash')} does not match current cases file.", + file=sys.stderr, + ) + return 2 + baseline_for_filter = effective_results + baseline_filter_path = eff_path + effective_results_path = eff_path + only_failed_baseline_kind = "effective" + elif args.only_failed: latest_results = _load_latest_results(artifacts_dir, args.tag) if latest_results: baseline_filter_path = latest_results @@ -440,14 +537,15 @@ def handle_batch(args) -> int: if candidate.exists(): baseline_filter_path = candidate only_failed_baseline_kind = "latest" - if args.only_failed_from: - 
only_failed_baseline_kind = "path" - if baseline_filter_path: + if baseline_filter_path and baseline_for_filter is None: try: baseline_for_filter = load_results(baseline_filter_path) except Exception as exc: print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) return 2 + if args.only_failed and baseline_for_filter is None: + print("No baseline found for --only-failed.", file=sys.stderr) + return 2 compare_path = args.compare_to if compare_path is None and args.only_failed and baseline_filter_path: @@ -479,33 +577,70 @@ def handle_batch(args) -> int: missed_baseline_run: Path | None = None only_missed_baseline_kind: str | None = None if args.only_missed: - missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) if args.only_missed_from: + missed_baseline_path = args.only_missed_from only_missed_baseline_kind = "path" - elif missed_baseline_path: - only_missed_baseline_kind = "latest" - missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) - if missed_baseline_run is None: - missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) - if missed_baseline_path: try: missed_baseline_results = load_results(missed_baseline_path) except Exception as exc: - print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) + print(f"Failed to read baseline for --only-missed-from: {exc}", file=sys.stderr) return 2 - else: - print("No baseline results found for --only-missed; running all filtered cases.", file=sys.stderr) - baseline_meta = _load_run_meta(missed_baseline_run) - if isinstance(baseline_meta, dict): - planned_from_meta = baseline_meta.get("planned_case_ids") - if isinstance(planned_from_meta, list): - baseline_planned_ids = {str(cid) for cid in planned_from_meta} - else: + elif args.tag: + effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) + if not effective_results: + print(f"No effective results found for tag {args.tag!r}; run a tagged batch first.", file=sys.stderr) + return 2 + if effective_meta and effective_meta.get("cases_hash") not in (None, cases_hash): + print( + f"Effective results cases_hash {effective_meta.get('cases_hash')} does not match current cases file.", + file=sys.stderr, + ) + return 2 + missed_baseline_path = eff_path + missed_baseline_results = effective_results + only_missed_baseline_kind = "effective" + baseline_planned_ids = ( + {str(cid) for cid in effective_meta.get("planned_case_ids", [])} + if isinstance(effective_meta, dict) + else None + ) + if not baseline_planned_ids: print( - "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", + "Effective results missing planned_case_ids; computing missed relative to current filtered cases.", file=sys.stderr, ) baseline_planned_ids = {case.id for case in cases} + else: + missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) + if args.only_missed_from: + only_missed_baseline_kind = "path" + elif missed_baseline_path: + only_missed_baseline_kind = "latest" + missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) + if missed_baseline_run is None: + missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) + if missed_baseline_path: + try: + missed_baseline_results = load_results(missed_baseline_path) + except Exception as exc: + print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) + return 2 + else: + print("No baseline results found for --only-missed; 
running all filtered cases.", file=sys.stderr) + baseline_meta = _load_run_meta(missed_baseline_run) + if isinstance(baseline_meta, dict): + planned_from_meta = baseline_meta.get("planned_case_ids") + if isinstance(planned_from_meta, list): + baseline_planned_ids = {str(cid) for cid in planned_from_meta} + else: + print( + "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", + file=sys.stderr, + ) + baseline_planned_ids = {case.id for case in cases} + if args.only_missed and missed_baseline_results is None: + print("No baseline found for --only-missed.", file=sys.stderr) + return 2 planned_case_ids = [case.id for case in cases] if args.only_missed: @@ -661,10 +796,25 @@ def handle_batch(args) -> int: ) _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag) + effective_path = None + effective_meta_path = None + if args.tag: + try: + effective_path, effective_meta_path = _update_effective_snapshot( + artifacts_dir=artifacts_dir, + tag=args.tag, + cases_hash=cases_hash, + cases_path=args.cases, + planned_case_ids=planned_case_ids, + executed_results=results, + run_folder=run_folder, + planned_case_ids_source=planned_case_ids, + ) + except Exception as exc: + print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) config_hash = _hash_file(args.config) if args.config else None schema_hash = _hash_file(args.schema) - cases_hash = _hash_file(args.cases) data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) llm_settings = settings.llm run_meta = { @@ -695,6 +845,7 @@ def handle_batch(args) -> int: "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, "only_missed_baseline_kind": only_missed_baseline_kind, "baseline_tag": args.tag, + "effective_path": str(effective_path) if effective_path else None, "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index e4048bc..3a431d5 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -57,7 +57,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument( "--only-missed", action="store_true", - help="Run only cases missing in the latest (or tag-latest) effective results", + help="Run only cases missing in effective results for --tag (or latest results when no tag is set)", ) batch_p.add_argument( "--only-missed-from", From 8582ae8222ee754526db02b76d44bc889d7fa888 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sun, 21 Dec 2025 23:47:24 +0300 Subject: [PATCH 48/92] Stabilize effective scope and suite planning --- examples/demo_qa/batch.py | 76 +++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 7bc0302..70407fa 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -126,6 +126,28 @@ def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: return base / "effective_results.jsonl", base / "effective_meta.json" +def _scope_payload( + *, + cases_hash: str, + include_tags: set[str] | None, + exclude_tags: set[str] | None, + include_ids: set[str] | None, + exclude_ids: set[str] | None, +) -> dict[str, object]: + return { + "cases_hash": cases_hash, + "include_tags": sorted(include_tags) if include_tags else None, + "exclude_tags": sorted(exclude_tags) if exclude_tags else None, + 
"include_ids": sorted(include_ids) if include_ids else None, + "exclude_ids": sorted(exclude_ids) if exclude_ids else None, + } + + +def _scope_hash(scope: Mapping[str, object]) -> str: + payload = json.dumps(scope, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: runs_dir = artifacts_dir / "runs" if tag: @@ -238,24 +260,27 @@ def _update_effective_snapshot( tag: str, cases_hash: str, cases_path: Path, - planned_case_ids: list[str], + suite_case_ids: list[str], executed_results: list[RunResult], run_folder: Path, - planned_case_ids_source: list[str] | None, + scope: Mapping[str, object], + scope_hash: str, ) -> tuple[Path, Path]: effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: raise ValueError( f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." ) + if effective_meta and effective_meta.get("scope_hash") and effective_meta["scope_hash"] != scope_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} have a different scope; refusing to merge." + ) planned_pool: set[str] if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} - elif planned_case_ids_source: - planned_pool = set(planned_case_ids_source) else: - planned_pool = set(planned_case_ids) + planned_pool = set(suite_case_ids) for res in executed_results: effective_results[res.id] = res @@ -279,6 +304,8 @@ def _update_effective_snapshot( "updated_at": datetime.datetime.utcnow().isoformat() + "Z", "built_from_runs": sorted(built_from), "effective_results_path": str(effective_results_path), + "scope": scope, + "scope_hash": scope_hash, } meta_path.parent.mkdir(parents=True, exist_ok=True) dump_json(meta_path, effective_meta_payload) @@ -504,6 +531,14 @@ def handle_batch(args) -> int: exclude_tags = _split_csv(args.exclude_tags) include_ids = _load_ids(args.include_ids) exclude_ids = _load_ids(args.exclude_ids) + scope = _scope_payload( + cases_hash=cases_hash, + include_tags=include_tags, + exclude_tags=exclude_tags, + include_ids=include_ids, + exclude_ids=exclude_ids, + ) + scope_id = _scope_hash(scope) baseline_filter_path = args.only_failed_from only_failed_baseline_kind: str | None = None @@ -521,6 +556,9 @@ def handle_batch(args) -> int: file=sys.stderr, ) return 2 + if effective_meta and effective_meta.get("scope_hash") not in (None, scope_id): + print("Effective results scope does not match current selection; refusing to merge.", file=sys.stderr) + return 2 baseline_for_filter = effective_results baseline_filter_path = eff_path effective_results_path = eff_path @@ -560,8 +598,19 @@ def handle_batch(args) -> int: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 - cases = _select_cases_for_rerun( + filtered_cases = _select_cases_for_rerun( cases, + None, + require_assert=args.require_assert, + fail_on=args.fail_on, + include_tags=include_tags, + exclude_tags=exclude_tags, + include_ids=include_ids, + exclude_ids=exclude_ids, + ) + suite_case_ids = [case.id for case in filtered_cases] + cases = _select_cases_for_rerun( + filtered_cases, baseline_for_filter, require_assert=args.require_assert, fail_on=args.fail_on, @@ -596,6 +645,9 @@ def handle_batch(args) -> 
int: file=sys.stderr, ) return 2 + if effective_meta and effective_meta.get("scope_hash") not in (None, scope_id): + print("Effective results scope does not match current selection; refusing to merge.", file=sys.stderr) + return 2 missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" @@ -609,7 +661,7 @@ def handle_batch(args) -> int: "Effective results missing planned_case_ids; computing missed relative to current filtered cases.", file=sys.stderr, ) - baseline_planned_ids = {case.id for case in cases} + baseline_planned_ids = set(suite_case_ids) else: missed_baseline_path = args.only_missed_from or _load_latest_results(artifacts_dir, args.tag) if args.only_missed_from: @@ -637,7 +689,7 @@ def handle_batch(args) -> int: "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", file=sys.stderr, ) - baseline_planned_ids = {case.id for case in cases} + baseline_planned_ids = set(suite_case_ids) if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 @@ -805,10 +857,11 @@ def handle_batch(args) -> int: tag=args.tag, cases_hash=cases_hash, cases_path=args.cases, - planned_case_ids=planned_case_ids, + suite_case_ids=suite_case_ids, executed_results=results, run_folder=run_folder, - planned_case_ids_source=planned_case_ids, + scope=scope, + scope_hash=scope_id, ) except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) @@ -831,6 +884,7 @@ def handle_batch(args) -> int: "schema_hash": schema_hash, "data_dir": str(args.data), }, + "suite_case_ids": suite_case_ids, "planned_case_ids": planned_case_ids, "planned_total": planned_total, "selected_filters": { @@ -846,6 +900,8 @@ def handle_batch(args) -> int: "only_missed_baseline_kind": only_missed_baseline_kind, "baseline_tag": args.tag, "effective_path": str(effective_path) if effective_path else None, + "scope_hash": scope_id, + "scope": scope, "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, From 4279c57fe353bba78a088165d1cb131ef3fe6d54 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 00:10:53 +0300 Subject: [PATCH 49/92] Differentiate suite and selected coverage --- examples/demo_qa/batch.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 70407fa..972cac9 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -694,12 +694,12 @@ def handle_batch(args) -> int: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - planned_case_ids = [case.id for case in cases] + selected_case_ids = [case.id for case in cases] if args.only_missed: - planned_pool = baseline_planned_ids or set(planned_case_ids) + planned_pool = baseline_planned_ids or set(selected_case_ids) missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) cases = [case for case in cases if case.id in missed_ids] - planned_case_ids = [case.id for case in cases] + selected_case_ids = [case.id for case in cases] if not cases: print("0 missed cases selected.", file=sys.stderr) @@ -800,9 +800,11 @@ def handle_batch(args) -> int: ended_at = datetime.datetime.utcnow() duration_ms = int((ended_at - started_at).total_seconds() * 1000) executed_results = {res.id: res for res in results} - planned_total = 
len(planned_case_ids) + planned_total = len(selected_case_ids) executed_total = len(results) - missed_total = len(_missed_case_ids(planned_case_ids, executed_results)) + missed_total = len(_missed_case_ids(selected_case_ids, executed_results)) + suite_planned_total = len(suite_case_ids) + suite_missed_total = len(_missed_case_ids(suite_case_ids, executed_results)) summary = { "run_id": run_id, "started_at": started_at.isoformat() + "Z", @@ -817,6 +819,8 @@ def handle_batch(args) -> int: "planned_total": planned_total, "executed_total": executed_total, "missed_total": missed_total, + "suite_planned_total": suite_planned_total, + "suite_missed_total": suite_missed_total, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, "tag": args.tag, @@ -885,7 +889,7 @@ def handle_batch(args) -> int: "data_dir": str(args.data), }, "suite_case_ids": suite_case_ids, - "planned_case_ids": planned_case_ids, + "selected_case_ids": selected_case_ids, "planned_total": planned_total, "selected_filters": { "include_tags": sorted(include_tags) if include_tags else None, From 941a0015c832c819d8768008cded0b53e9db30d0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 00:15:47 +0300 Subject: [PATCH 50/92] Add case history indexing and tag reports --- examples/demo_qa/batch.py | 284 +++++++++++++++++++++++++++++++++++++- examples/demo_qa/cli.py | 33 +++++ 2 files changed, 313 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 972cac9..24acef0 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -254,6 +254,115 @@ def _write_effective_results(results_path: Path, results: Mapping[str, RunResult write_results(results_path, ordered) +def _append_case_history( + artifacts_dir: Path, + result: RunResult, + *, + run_id: str, + tag: str | None, + note: str | None, + fail_on: str, + require_assert: bool, + scope_hash: str, + cases_hash: str, + git_sha: str | None, + run_dir: Path, + results_path: Path, +) -> None: + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True, exist_ok=True) + payload = { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "run_id": run_id, + "tag": tag, + "note": note, + "status": result.status, + "reason": _reason(result), + "duration_ms": result.duration_ms, + "artifacts_dir": result.artifacts_dir, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "cases_hash": cases_hash, + "git_sha": git_sha, + } + target = history_dir / f"{result.id}.jsonl" + with target.open("a", encoding="utf-8") as f: + f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") + + +def _build_effective_diff( + before: Mapping[str, RunResult], + after: Mapping[str, RunResult], + *, + fail_on: str, + require_assert: bool, + run_id: str, + tag: str, + note: str | None, + run_dir: Path, + results_path: Path, + scope_hash: str, +) -> dict[str, object]: + bad = bad_statuses(fail_on, require_assert) + before_bad = {cid for cid, res in before.items() if res.status in bad} + after_bad = {cid for cid, res in after.items() if res.status in bad} + ids = set(before) | set(after) + regressed: list[dict[str, object]] = [] + fixed: list[dict[str, object]] = [] + changed_bad: list[dict[str, object]] = [] + new_cases: list[dict[str, object]] = [] + other_changed: list[dict[str, object]] = [] + for cid in ids: + prev 
= before.get(cid) + cur = after.get(cid) + prev_status = prev.status if prev else None + cur_status = cur.status if cur else None + if prev is None and cur is not None: + new_cases.append({"id": cid, "to": cur_status}) + continue + if cur is None or prev is None: + continue + if prev_status == cur_status: + continue + entry = {"id": cid, "from": prev_status, "to": cur_status, "reason": _reason(cur)} + was_bad = cid in before_bad + now_bad = cid in after_bad + if not was_bad and now_bad: + regressed.append(entry) + elif was_bad and not now_bad: + fixed.append(entry) + elif was_bad and now_bad: + changed_bad.append(entry) + else: + other_changed.append(entry) + return { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "tag": tag, + "note": note, + "run_id": run_id, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "regressed": sorted(regressed, key=lambda r: r["id"]), + "fixed": sorted(fixed, key=lambda r: r["id"]), + "changed_bad": sorted(changed_bad, key=lambda r: r["id"]), + "changed_other": sorted(other_changed, key=lambda r: r["id"]), + "new_cases": sorted(new_cases, key=lambda r: r["id"]), + } + + +def _append_effective_diff(tag_dir: Path, diff_entry: Mapping[str, object]) -> None: + tag_dir.mkdir(parents=True, exist_ok=True) + changes_path = tag_dir / "effective_changes.jsonl" + with changes_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(diff_entry, ensure_ascii=False, sort_keys=True) + "\n") + + def _update_effective_snapshot( *, artifacts_dir: Path, @@ -265,7 +374,7 @@ def _update_effective_snapshot( run_folder: Path, scope: Mapping[str, object], scope_hash: str, -) -> tuple[Path, Path]: +) -> tuple[Path, Path, dict[str, RunResult], dict[str, RunResult]]: effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: raise ValueError( @@ -282,6 +391,7 @@ def _update_effective_snapshot( else: planned_pool = set(suite_case_ids) + before_effective = dict(effective_results) for res in executed_results: effective_results[res.id] = res _write_effective_results(effective_results_path, effective_results) @@ -309,7 +419,7 @@ def _update_effective_snapshot( } meta_path.parent.mkdir(parents=True, exist_ok=True) dump_json(meta_path, effective_meta_payload) - return effective_results_path, meta_path + return effective_results_path, meta_path, before_effective, effective_results def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: @@ -856,7 +966,7 @@ def handle_batch(args) -> int: effective_meta_path = None if args.tag: try: - effective_path, effective_meta_path = _update_effective_snapshot( + effective_path, effective_meta_path, prev_effective, new_effective = _update_effective_snapshot( artifacts_dir=artifacts_dir, tag=args.tag, cases_hash=cases_hash, @@ -867,12 +977,26 @@ def handle_batch(args) -> int: scope=scope, scope_hash=scope_id, ) + diff_entry = _build_effective_diff( + prev_effective, + new_effective, + fail_on=args.fail_on, + require_assert=args.require_assert, + run_id=run_id, + tag=args.tag, + note=args.note, + run_dir=run_folder, + results_path=results_path, + scope_hash=scope_id, + ) + _append_effective_diff(effective_path.parent, diff_entry) except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) config_hash = _hash_file(args.config) 
if args.config else None schema_hash = _hash_file(args.schema) data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) + git_sha = _git_sha() llm_settings = settings.llm run_meta = { "run_id": run_id, @@ -921,7 +1045,7 @@ def handle_batch(args) -> int: "base_url": llm_settings.base_url or "https://api.openai.com/v1", }, "enable_semantic": args.enable_semantic, - "git_sha": _git_sha(), + "git_sha": git_sha, "results_path": str(results_path), "summary_path": str(summary_path), "run_dir": str(run_folder), @@ -955,9 +1079,27 @@ def handle_batch(args) -> int: "planned_total": planned_total, "executed_total": executed_total, "missed_total": missed_total, + "suite_planned_total": suite_planned_total, + "suite_missed_total": suite_missed_total, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, + "scope_hash": scope_id, } + for res in results: + _append_case_history( + artifacts_dir, + res, + run_id=run_id, + tag=args.tag, + note=args.note, + fail_on=args.fail_on, + require_assert=args.require_assert, + scope_hash=scope_id, + cases_hash=cases_hash, + git_sha=git_sha, + run_dir=run_folder, + results_path=results_path, + ) history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: f.write(json.dumps(history_entry, ensure_ascii=False, sort_keys=True) + "\n") @@ -1169,11 +1311,145 @@ def handle_compare(args) -> int: return 0 +def _load_case_history(path: Path) -> list[dict]: + if not path.exists(): + return [] + entries: list[dict] = [] + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + entries.append(json.loads(line)) + except Exception: + continue + return entries + + +def handle_history_case(args) -> int: + artifacts_dir = args.data / ".runs" + path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" + entries = _load_case_history(path) + if args.tag: + entries = [e for e in entries if e.get("tag") == args.tag] + if not entries: + print(f"No history found for case {args.case_id}.") + return 0 + entries = list(reversed(entries))[: args.limit] + header = ( + f"{'timestamp':<25} {'run_id':<12} {'tag':<15} {'status':<10} " + f"{'reason':<30} {'note':<15} {'run_dir':<30}" + ) + print(header) + for e in entries: + ts = str(e.get("timestamp", ""))[:25] + print( + f"{ts:<25} {str(e.get('run_id','')):<12} {str(e.get('tag','')):<15} " + f"{str(e.get('status','')):<10} {str(e.get('reason','')):<30} {str(e.get('note','')):<15} " + f"{str(e.get('run_dir','')):<30}" + ) + return 0 + + +def _resolve_run_dir_arg(run_arg: Path, artifacts_dir: Path) -> Optional[Path]: + if run_arg.exists(): + return run_arg + candidate = artifacts_dir / "runs" / run_arg + if candidate.exists(): + return candidate + return None + + +def handle_report_run(args) -> int: + artifacts_dir = args.data / ".runs" + run_dir = _resolve_run_dir_arg(args.run, artifacts_dir) + if not run_dir: + print("Run directory not found.", file=sys.stderr) + return 2 + summary_path = run_dir / "summary.json" + if not summary_path.exists(): + print(f"summary.json not found in {run_dir}", file=sys.stderr) + return 2 + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read summary: {exc}", file=sys.stderr) + return 2 + print(f"Run: {run_dir}") + for key in ["run_id", "tag", "note", "exit_code", "interrupted", "interrupted_at_case_id", "results_path"]: + if key in summary: + print(f"{key}: {summary.get(key)}") + counts = 
summary.get("counts") or {} + if counts: + print("Counts:", counts) + return 0 + + +def _load_effective_diff(tag_dir: Path) -> Optional[dict]: + path = tag_dir / "effective_changes.jsonl" + if not path.exists(): + return None + last: Optional[dict] = None + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + last = json.loads(line) + except Exception: + continue + return last + + +def handle_report_tag(args) -> int: + artifacts_dir = args.data / ".runs" + eff_results_path, eff_meta_path = _effective_paths(artifacts_dir, args.tag) + if not eff_results_path.exists() or not eff_meta_path.exists(): + print(f"No effective snapshot found for tag {args.tag!r}.", file=sys.stderr) + return 2 + try: + meta = json.loads(eff_meta_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read effective_meta.json: {exc}", file=sys.stderr) + return 2 + try: + results = load_results(eff_results_path) + except Exception as exc: + print(f"Failed to read effective results: {exc}", file=sys.stderr) + return 2 + counts = meta.get("counts") or summarize(results.values()) + print(f"Tag: {args.tag}") + print(f"Planned: {meta.get('planned_total')} | Executed: {meta.get('executed_total')} | Missed: {meta.get('missed_total')}") + print("Counts:", counts) + bad = bad_statuses("bad", False) + failing = [res for res in results.values() if res.status in bad] + failing = sorted(failing, key=lambda r: r.id)[:10] + if failing: + print("Failing cases (top 10):") + for res in failing: + print(f"- {res.id}: {res.status} ({_reason(res)}) [{res.artifacts_dir}]") + diff_entry = _load_effective_diff(eff_results_path.parent) + if diff_entry: + print("Last effective change:") + for key in ["timestamp", "run_id", "note"]: + if key in diff_entry: + print(f" {key}: {diff_entry.get(key)}") + for label in ["regressed", "fixed", "changed_bad", "new_cases"]: + items = diff_entry.get(label) or [] + print(f" {label}: {len(items)}") + return 0 + + __all__ = [ "handle_batch", "handle_case_open", "handle_case_run", "handle_chat", + "handle_history_case", + "handle_report_run", + "handle_report_tag", "bad_statuses", "is_failure", "write_results", diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 3a431d5..d228a1c 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -146,6 +146,23 @@ def build_parser() -> argparse.ArgumentParser: ) compare_p.add_argument("--require-assert", action="store_true", help="Treat unchecked cases as failures when diffing") + history_p = sub.add_parser("history", help="History utilities") + history_sub = history_p.add_subparsers(dest="history_command", required=True) + case_hist = history_sub.add_parser("case", help="Show history for a case id") + case_hist.add_argument("case_id") + case_hist.add_argument("--data", type=Path, required=True, help="Data dir containing .runs") + case_hist.add_argument("--tag", type=str, default=None, help="Filter by tag") + case_hist.add_argument("--limit", type=int, default=20, help="Limit rows") + + report_p = sub.add_parser("report", help="Reports over runs/effective snapshots") + report_sub = report_p.add_subparsers(dest="report_command", required=True) + tag_report = report_sub.add_parser("tag", help="Report current effective snapshot for a tag") + tag_report.add_argument("--data", type=Path, required=True, help="Data dir containing .runs") + tag_report.add_argument("--tag", type=str, required=True, help="Tag to report") + run_report = report_sub.add_parser("run", 
help="Report a specific run folder or run_id") + run_report.add_argument("--data", type=Path, required=True, help="Data dir containing .runs") + run_report.add_argument("--run", type=Path, required=True, help="Run dir or run_id under runs/") + return parser @@ -173,6 +190,22 @@ def main() -> None: code = handle_stats(args) elif args.command == "compare": code = handle_compare(args) + elif args.command == "history": + from .batch import handle_history_case + + if args.history_command == "case": + code = handle_history_case(args) + else: + code = 1 + elif args.command == "report": + from .batch import handle_report_run, handle_report_tag + + if args.report_command == "tag": + code = handle_report_tag(args) + elif args.report_command == "run": + code = handle_report_run(args) + else: + code = 1 else: code = 0 raise SystemExit(code) From 7d7bb86bff33e6701cd44e542b8059c8a6f01d0f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 00:20:37 +0300 Subject: [PATCH 51/92] Clarify only-missed baseline requirement --- examples/demo_qa/batch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 24acef0..dce089c 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -788,7 +788,11 @@ def handle_batch(args) -> int: print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 else: - print("No baseline results found for --only-missed; running all filtered cases.", file=sys.stderr) + print( + "No baseline found for --only-missed. Provide --only-missed-from or run a tagged batch first.", + file=sys.stderr, + ) + return 2 baseline_meta = _load_run_meta(missed_baseline_run) if isinstance(baseline_meta, dict): planned_from_meta = baseline_meta.get("planned_case_ids") From 303a601e36c00110cff24daba4871f12423a42d5 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Mon, 22 Dec 2025 01:57:38 +0300 Subject: [PATCH 52/92] Align report policy and clean helper imports --- examples/demo_qa/batch.py | 464 +------------------------- examples/demo_qa/cli.py | 6 +- examples/demo_qa/commands/__init__.py | 6 + examples/demo_qa/commands/history.py | 31 ++ examples/demo_qa/commands/report.py | 99 ++++++ examples/demo_qa/runs/__init__.py | 3 + examples/demo_qa/runs/case_history.py | 77 +++++ examples/demo_qa/runs/coverage.py | 19 ++ examples/demo_qa/runs/effective.py | 203 +++++++++++ examples/demo_qa/runs/io.py | 17 + examples/demo_qa/runs/layout.py | 102 ++++++ examples/demo_qa/runs/scope.py | 30 ++ tests/test_demo_qa_batch.py | 13 +- tests/test_demo_qa_commands.py | 18 + 14 files changed, 627 insertions(+), 461 deletions(-) create mode 100644 examples/demo_qa/commands/__init__.py create mode 100644 examples/demo_qa/commands/history.py create mode 100644 examples/demo_qa/commands/report.py create mode 100644 examples/demo_qa/runs/__init__.py create mode 100644 examples/demo_qa/runs/case_history.py create mode 100644 examples/demo_qa/runs/coverage.py create mode 100644 examples/demo_qa/runs/effective.py create mode 100644 examples/demo_qa/runs/io.py create mode 100644 examples/demo_qa/runs/layout.py create mode 100644 examples/demo_qa/runs/scope.py create mode 100644 tests/test_demo_qa_commands.py diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index dce089c..8fa3266 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -28,17 
+28,23 @@ save_status, summarize, ) +from .runs.case_history import _append_case_history +from .runs.coverage import _missed_case_ids +from .runs.effective import _append_effective_diff, _build_effective_diff, _load_effective_results, _update_effective_snapshot +from .runs.io import write_results +from .runs.layout import ( + _latest_markers, + _load_latest_results, + _load_latest_run, + _load_run_meta, + _run_dir_from_results_path, + _update_latest_markers, +) +from .runs.scope import _scope_hash, _scope_payload from .settings import load_settings from .utils import dump_json -def write_results(out_path: Path, results: Iterable[RunResult]) -> None: - out_path.parent.mkdir(parents=True, exist_ok=True) - with out_path.open("w", encoding="utf-8") as f: - for res in results: - f.write(json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n") - - def write_summary(out_path: Path, summary: dict) -> Path: summary_path = out_path.with_name("summary.json") dump_json(summary_path, summary) @@ -116,310 +122,6 @@ def _git_sha() -> Optional[str]: return result.stdout.strip() or None -def _sanitize_tag(tag: str) -> str: - cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "_" for ch in tag) - return cleaned or "tag" - - -def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: - base = artifacts_dir / "runs" / "tags" / _sanitize_tag(tag) - return base / "effective_results.jsonl", base / "effective_meta.json" - - -def _scope_payload( - *, - cases_hash: str, - include_tags: set[str] | None, - exclude_tags: set[str] | None, - include_ids: set[str] | None, - exclude_ids: set[str] | None, -) -> dict[str, object]: - return { - "cases_hash": cases_hash, - "include_tags": sorted(include_tags) if include_tags else None, - "exclude_tags": sorted(exclude_tags) if exclude_tags else None, - "include_ids": sorted(include_ids) if include_ids else None, - "exclude_ids": sorted(exclude_ids) if exclude_ids else None, - } - - -def _scope_hash(scope: Mapping[str, object]) -> str: - payload = json.dumps(scope, sort_keys=True, ensure_ascii=False) - return hashlib.sha256(payload.encode("utf-8")).hexdigest() - - -def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: - runs_dir = artifacts_dir / "runs" - if tag: - slug = _sanitize_tag(tag) - return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt" - return runs_dir / "latest.txt", runs_dir / "latest_results.txt" - - -def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - latest_file, _ = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - return None - - -def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - _, latest_file = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - latest_run = _load_latest_run(artifacts_dir, tag) - if latest_run: - summary_path = latest_run / "summary.json" - if summary_path.exists(): - try: - summary = json.loads(summary_path.read_text(encoding="utf-8")) - results_path = summary.get("results_path") - if results_path: - return Path(results_path) - except Exception: - pass - return None - - -def _load_run_meta(run_path: Path | None) -> Optional[dict]: - if run_path is None: - return None - meta_path = run_path / "run_meta.json" - if not 
meta_path.exists(): - return None - try: - return json.loads(meta_path.read_text(encoding="utf-8")) - except Exception: - return None - - -def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: - if results_path is None: - return None - run_dir = results_path.parent - summary_path = run_dir / "summary.json" - if summary_path.exists(): - try: - summary = json.loads(summary_path.read_text(encoding="utf-8")) - run_dir_from_summary = summary.get("run_dir") - if run_dir_from_summary: - return Path(run_dir_from_summary) - except Exception: - pass - return run_dir - - -def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: - planned_set = set(planned_case_ids) - if not executed_results: - return planned_set - try: - executed_ids = set(executed_results.keys()) - except Exception: - executed_ids = set() - return planned_set - executed_ids - - -def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: - marker_pairs = {_latest_markers(artifacts_dir, None)} - if tag: - marker_pairs.add(_latest_markers(artifacts_dir, tag)) - for latest_path, latest_results_path in marker_pairs: - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - latest_results_path.write_text(str(results_path), encoding="utf-8") - - -def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: - results_path, meta_path = _effective_paths(artifacts_dir, tag) - meta: Optional[dict] = None - results: dict[str, RunResult] = {} - if results_path.exists(): - results = load_results(results_path) - if meta_path.exists(): - try: - meta = json.loads(meta_path.read_text(encoding="utf-8")) - except Exception: - meta = None - return results, meta, results_path - - -def _write_effective_results(results_path: Path, results: Mapping[str, RunResult]) -> None: - results_path.parent.mkdir(parents=True, exist_ok=True) - ordered = [results[cid] for cid in sorted(results)] - write_results(results_path, ordered) - - -def _append_case_history( - artifacts_dir: Path, - result: RunResult, - *, - run_id: str, - tag: str | None, - note: str | None, - fail_on: str, - require_assert: bool, - scope_hash: str, - cases_hash: str, - git_sha: str | None, - run_dir: Path, - results_path: Path, -) -> None: - history_dir = artifacts_dir / "runs" / "cases" - history_dir.mkdir(parents=True, exist_ok=True) - payload = { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", - "run_id": run_id, - "tag": tag, - "note": note, - "status": result.status, - "reason": _reason(result), - "duration_ms": result.duration_ms, - "artifacts_dir": result.artifacts_dir, - "run_dir": str(run_dir), - "results_path": str(results_path), - "fail_on": fail_on, - "require_assert": require_assert, - "scope_hash": scope_hash, - "cases_hash": cases_hash, - "git_sha": git_sha, - } - target = history_dir / f"{result.id}.jsonl" - with target.open("a", encoding="utf-8") as f: - f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") - - -def _build_effective_diff( - before: Mapping[str, RunResult], - after: Mapping[str, RunResult], - *, - fail_on: str, - require_assert: bool, - run_id: str, - tag: str, - note: str | None, - run_dir: Path, - results_path: Path, - scope_hash: str, -) -> dict[str, object]: - bad = bad_statuses(fail_on, require_assert) - before_bad = {cid for cid, res in before.items() if res.status in bad} - after_bad 
= {cid for cid, res in after.items() if res.status in bad} - ids = set(before) | set(after) - regressed: list[dict[str, object]] = [] - fixed: list[dict[str, object]] = [] - changed_bad: list[dict[str, object]] = [] - new_cases: list[dict[str, object]] = [] - other_changed: list[dict[str, object]] = [] - for cid in ids: - prev = before.get(cid) - cur = after.get(cid) - prev_status = prev.status if prev else None - cur_status = cur.status if cur else None - if prev is None and cur is not None: - new_cases.append({"id": cid, "to": cur_status}) - continue - if cur is None or prev is None: - continue - if prev_status == cur_status: - continue - entry = {"id": cid, "from": prev_status, "to": cur_status, "reason": _reason(cur)} - was_bad = cid in before_bad - now_bad = cid in after_bad - if not was_bad and now_bad: - regressed.append(entry) - elif was_bad and not now_bad: - fixed.append(entry) - elif was_bad and now_bad: - changed_bad.append(entry) - else: - other_changed.append(entry) - return { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", - "tag": tag, - "note": note, - "run_id": run_id, - "run_dir": str(run_dir), - "results_path": str(results_path), - "fail_on": fail_on, - "require_assert": require_assert, - "scope_hash": scope_hash, - "regressed": sorted(regressed, key=lambda r: r["id"]), - "fixed": sorted(fixed, key=lambda r: r["id"]), - "changed_bad": sorted(changed_bad, key=lambda r: r["id"]), - "changed_other": sorted(other_changed, key=lambda r: r["id"]), - "new_cases": sorted(new_cases, key=lambda r: r["id"]), - } - - -def _append_effective_diff(tag_dir: Path, diff_entry: Mapping[str, object]) -> None: - tag_dir.mkdir(parents=True, exist_ok=True) - changes_path = tag_dir / "effective_changes.jsonl" - with changes_path.open("a", encoding="utf-8") as f: - f.write(json.dumps(diff_entry, ensure_ascii=False, sort_keys=True) + "\n") - - -def _update_effective_snapshot( - *, - artifacts_dir: Path, - tag: str, - cases_hash: str, - cases_path: Path, - suite_case_ids: list[str], - executed_results: list[RunResult], - run_folder: Path, - scope: Mapping[str, object], - scope_hash: str, -) -> tuple[Path, Path, dict[str, RunResult], dict[str, RunResult]]: - effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) - if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: - raise ValueError( - f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." - ) - if effective_meta and effective_meta.get("scope_hash") and effective_meta["scope_hash"] != scope_hash: - raise ValueError( - f"Existing effective results for tag {tag!r} have a different scope; refusing to merge." 
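(Aside for readers of this hunk: `_build_effective_diff`, which is being moved out of `batch.py` here, only compares per-case status strings. A minimal sketch of its regressed/fixed/new bucketing, with plain dicts standing in for `RunResult` objects and an invented bad-status set; the `changed_bad`/`changed_other` buckets are omitted for brevity.)

```python
# Toy sketch of the bucketing in _build_effective_diff. Plain status strings stand in
# for RunResult objects; the statuses and the "bad" set below are invented for illustration.
before = {"case_1": "ok", "case_2": "error", "case_3": "ok"}
after = {"case_1": "mismatch", "case_2": "ok", "case_4": "ok"}
bad = {"error", "mismatch", "unchecked"}

common = before.keys() & after.keys()
regressed = sorted(cid for cid in common if before[cid] not in bad and after[cid] in bad)
fixed = sorted(cid for cid in common if before[cid] in bad and after[cid] not in bad)
new_cases = sorted(after.keys() - before.keys())

print(regressed)   # ['case_1']
print(fixed)       # ['case_2']
print(new_cases)   # ['case_4']
```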
- ) - - planned_pool: set[str] - if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): - planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} - else: - planned_pool = set(suite_case_ids) - - before_effective = dict(effective_results) - for res in executed_results: - effective_results[res.id] = res - _write_effective_results(effective_results_path, effective_results) - - summary_counts = summarize(effective_results.values()) - executed_total = len(effective_results) - missed_total = len(_missed_case_ids(planned_pool, effective_results)) - meta_path = effective_results_path.with_name("effective_meta.json") - built_from = set(effective_meta.get("built_from_runs", [])) if effective_meta else set() - built_from.add(str(run_folder)) - effective_meta_payload = { - "tag": tag, - "cases_hash": cases_hash, - "cases_path": str(cases_path), - "planned_case_ids": sorted(planned_pool), - "planned_total": len(planned_pool), - "executed_total": executed_total, - "missed_total": missed_total, - "counts": summary_counts, - "updated_at": datetime.datetime.utcnow().isoformat() + "Z", - "built_from_runs": sorted(built_from), - "effective_results_path": str(effective_results_path), - "scope": scope, - "scope_hash": scope_hash, - } - meta_path.parent.mkdir(parents=True, exist_ok=True) - dump_json(meta_path, effective_meta_payload) - return effective_results_path, meta_path, before_effective, effective_results def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: @@ -980,6 +682,8 @@ def handle_batch(args) -> int: run_folder=run_folder, scope=scope, scope_hash=scope_id, + fail_on=args.fail_on, + require_assert=args.require_assert, ) diff_entry = _build_effective_diff( prev_effective, @@ -1315,149 +1019,15 @@ def handle_compare(args) -> int: return 0 -def _load_case_history(path: Path) -> list[dict]: - if not path.exists(): - return [] - entries: list[dict] = [] - with path.open("r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - entries.append(json.loads(line)) - except Exception: - continue - return entries - - -def handle_history_case(args) -> int: - artifacts_dir = args.data / ".runs" - path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" - entries = _load_case_history(path) - if args.tag: - entries = [e for e in entries if e.get("tag") == args.tag] - if not entries: - print(f"No history found for case {args.case_id}.") - return 0 - entries = list(reversed(entries))[: args.limit] - header = ( - f"{'timestamp':<25} {'run_id':<12} {'tag':<15} {'status':<10} " - f"{'reason':<30} {'note':<15} {'run_dir':<30}" - ) - print(header) - for e in entries: - ts = str(e.get("timestamp", ""))[:25] - print( - f"{ts:<25} {str(e.get('run_id','')):<12} {str(e.get('tag','')):<15} " - f"{str(e.get('status','')):<10} {str(e.get('reason','')):<30} {str(e.get('note','')):<15} " - f"{str(e.get('run_dir','')):<30}" - ) - return 0 - - -def _resolve_run_dir_arg(run_arg: Path, artifacts_dir: Path) -> Optional[Path]: - if run_arg.exists(): - return run_arg - candidate = artifacts_dir / "runs" / run_arg - if candidate.exists(): - return candidate - return None - - -def handle_report_run(args) -> int: - artifacts_dir = args.data / ".runs" - run_dir = _resolve_run_dir_arg(args.run, artifacts_dir) - if not run_dir: - print("Run directory not found.", file=sys.stderr) - return 2 - summary_path = run_dir / "summary.json" - if not summary_path.exists(): - print(f"summary.json not found in {run_dir}", file=sys.stderr) - 
return 2 - try: - summary = json.loads(summary_path.read_text(encoding="utf-8")) - except Exception as exc: - print(f"Failed to read summary: {exc}", file=sys.stderr) - return 2 - print(f"Run: {run_dir}") - for key in ["run_id", "tag", "note", "exit_code", "interrupted", "interrupted_at_case_id", "results_path"]: - if key in summary: - print(f"{key}: {summary.get(key)}") - counts = summary.get("counts") or {} - if counts: - print("Counts:", counts) - return 0 - - -def _load_effective_diff(tag_dir: Path) -> Optional[dict]: - path = tag_dir / "effective_changes.jsonl" - if not path.exists(): - return None - last: Optional[dict] = None - with path.open("r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - try: - last = json.loads(line) - except Exception: - continue - return last - - -def handle_report_tag(args) -> int: - artifacts_dir = args.data / ".runs" - eff_results_path, eff_meta_path = _effective_paths(artifacts_dir, args.tag) - if not eff_results_path.exists() or not eff_meta_path.exists(): - print(f"No effective snapshot found for tag {args.tag!r}.", file=sys.stderr) - return 2 - try: - meta = json.loads(eff_meta_path.read_text(encoding="utf-8")) - except Exception as exc: - print(f"Failed to read effective_meta.json: {exc}", file=sys.stderr) - return 2 - try: - results = load_results(eff_results_path) - except Exception as exc: - print(f"Failed to read effective results: {exc}", file=sys.stderr) - return 2 - counts = meta.get("counts") or summarize(results.values()) - print(f"Tag: {args.tag}") - print(f"Planned: {meta.get('planned_total')} | Executed: {meta.get('executed_total')} | Missed: {meta.get('missed_total')}") - print("Counts:", counts) - bad = bad_statuses("bad", False) - failing = [res for res in results.values() if res.status in bad] - failing = sorted(failing, key=lambda r: r.id)[:10] - if failing: - print("Failing cases (top 10):") - for res in failing: - print(f"- {res.id}: {res.status} ({_reason(res)}) [{res.artifacts_dir}]") - diff_entry = _load_effective_diff(eff_results_path.parent) - if diff_entry: - print("Last effective change:") - for key in ["timestamp", "run_id", "note"]: - if key in diff_entry: - print(f" {key}: {diff_entry.get(key)}") - for label in ["regressed", "fixed", "changed_bad", "new_cases"]: - items = diff_entry.get(label) or [] - print(f" {label}: {len(items)}") - return 0 - - __all__ = [ "handle_batch", "handle_case_open", "handle_case_run", "handle_chat", - "handle_history_case", - "handle_report_run", - "handle_report_tag", + "handle_stats", + "handle_compare", "bad_statuses", "is_failure", "write_results", "write_summary", - "_load_latest_run", - "_find_case_artifact", ] diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index d228a1c..c8ae01c 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -24,6 +24,8 @@ def ensure_repo_imports() -> None: handle_compare, handle_stats, ) # noqa: E402 +from .commands.history import handle_history_case # noqa: E402 +from .commands.report import handle_report_run, handle_report_tag # noqa: E402 from .data_gen import generate_and_save # noqa: E402 @@ -191,15 +193,11 @@ def main() -> None: elif args.command == "compare": code = handle_compare(args) elif args.command == "history": - from .batch import handle_history_case - if args.history_command == "case": code = handle_history_case(args) else: code = 1 elif args.command == "report": - from .batch import handle_report_run, handle_report_tag - if args.report_command == "tag": code = 
handle_report_tag(args) elif args.report_command == "run": diff --git a/examples/demo_qa/commands/__init__.py b/examples/demo_qa/commands/__init__.py new file mode 100644 index 0000000..4794645 --- /dev/null +++ b/examples/demo_qa/commands/__init__.py @@ -0,0 +1,6 @@ +"""Lightweight command entrypoints for demo QA CLI.""" + +from .history import handle_history_case +from .report import handle_report_run, handle_report_tag + +__all__ = ["handle_history_case", "handle_report_run", "handle_report_tag"] diff --git a/examples/demo_qa/commands/history.py b/examples/demo_qa/commands/history.py new file mode 100644 index 0000000..376f90d --- /dev/null +++ b/examples/demo_qa/commands/history.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from ..runs.case_history import _load_case_history + + +def handle_history_case(args) -> int: + artifacts_dir = args.data / ".runs" + path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" + entries = _load_case_history(path) + if args.tag: + entries = [e for e in entries if e.get("tag") == args.tag] + if not entries: + print(f"No history found for case {args.case_id}.") + return 0 + entries = list(reversed(entries))[: args.limit] + header = ( + f"{'timestamp':<25} {'run_id':<12} {'tag':<15} {'status':<10} " + f"{'reason':<30} {'note':<15} {'run_dir':<30}" + ) + print(header) + for e in entries: + ts = str(e.get("timestamp", ""))[:25] + print( + f"{ts:<25} {str(e.get('run_id','')):<12} {str(e.get('tag','')):<15} " + f"{str(e.get('status','')):<10} {str(e.get('reason','')):<30} {str(e.get('note','')):<15} " + f"{str(e.get('run_dir','')):<30}" + ) + return 0 + + +__all__ = ["handle_history_case"] diff --git a/examples/demo_qa/commands/report.py b/examples/demo_qa/commands/report.py new file mode 100644 index 0000000..c588a31 --- /dev/null +++ b/examples/demo_qa/commands/report.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Optional + +from ..runner import bad_statuses, load_results, summarize +from ..runs.effective import _load_effective_diff +from ..runs.layout import _effective_paths + + +def _resolve_run_dir_arg(run_arg: Path, artifacts_dir: Path) -> Optional[Path]: + if run_arg.exists(): + return run_arg + candidate = artifacts_dir / "runs" / run_arg + if candidate.exists(): + return candidate + return None + + +def handle_report_run(args) -> int: + artifacts_dir = args.data / ".runs" + run_dir = _resolve_run_dir_arg(args.run, artifacts_dir) + if not run_dir: + print("Run directory not found.", file=sys.stderr) + return 2 + summary_path = run_dir / "summary.json" + if not summary_path.exists(): + print(f"summary.json not found in {run_dir}", file=sys.stderr) + return 2 + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read summary: {exc}", file=sys.stderr) + return 2 + print(f"Run: {run_dir}") + for key in ["run_id", "tag", "note", "exit_code", "interrupted", "interrupted_at_case_id", "results_path"]: + if key in summary: + print(f"{key}: {summary.get(key)}") + counts = summary.get("counts") or {} + if counts: + print("Counts:", counts) + return 0 + + +def _reason_text(res) -> str: + if getattr(res, "reason", None): + return res.reason + if getattr(res, "error", None): + return res.error + expected = getattr(res, "expected_check", None) + if expected and getattr(expected, "detail", None): + return expected.detail + return "" + + +def handle_report_tag(args) -> int: + artifacts_dir = args.data 
/ ".runs" + eff_results_path, eff_meta_path = _effective_paths(artifacts_dir, args.tag) + if not eff_results_path.exists() or not eff_meta_path.exists(): + print(f"No effective snapshot found for tag {args.tag!r}.", file=sys.stderr) + return 2 + try: + meta = json.loads(eff_meta_path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"Failed to read effective_meta.json: {exc}", file=sys.stderr) + return 2 + try: + results = load_results(eff_results_path) + except Exception as exc: + print(f"Failed to read effective results: {exc}", file=sys.stderr) + return 2 + counts = meta.get("counts") or summarize(results.values()) + fail_on = meta.get("fail_on", "bad") + require_assert = bool(meta.get("require_assert", False)) + print(f"Tag: {args.tag}") + print(f"Planned: {meta.get('planned_total')} | Executed: {meta.get('executed_total')} | Missed: {meta.get('missed_total')}") + print("Counts:", counts) + bad = bad_statuses(str(fail_on), require_assert) + failing = [res for res in results.values() if res.status in bad] + failing = sorted(failing, key=lambda r: r.id)[:10] + if failing: + print("Failing cases (top 10):") + for res in failing: + print(f"- {res.id}: {res.status} ({_reason_text(res)}) [{res.artifacts_dir}]") + diff_entry = _load_effective_diff(eff_results_path.parent) + if diff_entry: + print("Last effective change:") + for key in ["timestamp", "run_id", "note"]: + if key in diff_entry: + print(f" {key}: {diff_entry.get(key)}") + for label in ["regressed", "fixed", "changed_bad", "new_cases"]: + items = diff_entry.get(label) or [] + print(f" {label}: {len(items)}") + return 0 + + +__all__ = ["handle_report_run", "handle_report_tag", "_resolve_run_dir_arg"] diff --git a/examples/demo_qa/runs/__init__.py b/examples/demo_qa/runs/__init__.py new file mode 100644 index 0000000..1a2dfbb --- /dev/null +++ b/examples/demo_qa/runs/__init__.py @@ -0,0 +1,3 @@ +"""Utilities for managing demo QA run artifacts.""" + +__all__ = [] diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py new file mode 100644 index 0000000..7f14267 --- /dev/null +++ b/examples/demo_qa/runs/case_history.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import datetime +import json +from pathlib import Path +from typing import Optional + +from ..runner import RunResult + + +def _reason_text(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + expected = getattr(res, "expected_check", None) + if expected and getattr(expected, "detail", None): + return expected.detail + return "" + + +def _append_case_history( + artifacts_dir: Path, + result: RunResult, + *, + run_id: str, + tag: str | None, + note: str | None, + fail_on: str, + require_assert: bool, + scope_hash: str, + cases_hash: str, + git_sha: str | None, + run_dir: Path, + results_path: Path, +) -> None: + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True, exist_ok=True) + payload = { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "run_id": run_id, + "tag": tag, + "note": note, + "status": result.status, + "reason": _reason_text(result), + "duration_ms": result.duration_ms, + "artifacts_dir": result.artifacts_dir, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "cases_hash": cases_hash, + "git_sha": git_sha, + } + target = history_dir / f"{result.id}.jsonl" + with target.open("a", encoding="utf-8") as f: + 
f.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n") + + +def _load_case_history(path: Path) -> list[dict]: + if not path.exists(): + return [] + entries: list[dict] = [] + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + entries.append(json.loads(line)) + except Exception: + continue + return entries + + +__all__ = ["_append_case_history", "_load_case_history"] diff --git a/examples/demo_qa/runs/coverage.py b/examples/demo_qa/runs/coverage.py new file mode 100644 index 0000000..7f21a47 --- /dev/null +++ b/examples/demo_qa/runs/coverage.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from typing import Iterable, Mapping, Optional + +from ..runner import RunResult + + +def _missed_case_ids(planned_case_ids: Iterable[str], executed_results: Mapping[str, RunResult] | None) -> set[str]: + planned_set = set(planned_case_ids) + if not executed_results: + return planned_set + try: + executed_ids = set(executed_results.keys()) + except Exception: + executed_ids = set() + return planned_set - executed_ids + + +__all__ = ["_missed_case_ids"] diff --git a/examples/demo_qa/runs/effective.py b/examples/demo_qa/runs/effective.py new file mode 100644 index 0000000..e7c2b3a --- /dev/null +++ b/examples/demo_qa/runs/effective.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +import datetime +import json +from pathlib import Path +from typing import Mapping, Optional + +from ..runner import RunResult, bad_statuses, load_results, summarize +from ..utils import dump_json +from .coverage import _missed_case_ids +from .layout import _effective_paths +from .io import write_results + + +def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: + results_path, meta_path = _effective_paths(artifacts_dir, tag) + meta: Optional[dict] = None + results: dict[str, RunResult] = {} + if results_path.exists(): + results = load_results(results_path) + if meta_path.exists(): + try: + meta = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + meta = None + return results, meta, results_path + + +def _write_effective_results(results_path: Path, results: Mapping[str, RunResult]) -> None: + results_path.parent.mkdir(parents=True, exist_ok=True) + ordered = [results[cid] for cid in sorted(results)] + write_results(results_path, ordered) + + +def _reason_text(res: RunResult) -> str: + if res.reason: + return res.reason + if res.error: + return res.error + expected = getattr(res, "expected_check", None) + if expected and getattr(expected, "detail", None): + return expected.detail + return "" + + +def _build_effective_diff( + before: Mapping[str, RunResult], + after: Mapping[str, RunResult], + *, + fail_on: str, + require_assert: bool, + run_id: str, + tag: str, + note: str | None, + run_dir: Path, + results_path: Path, + scope_hash: str, +) -> dict[str, object]: + bad = bad_statuses(fail_on, require_assert) + before_bad = {cid for cid, res in before.items() if res.status in bad} + after_bad = {cid for cid, res in after.items() if res.status in bad} + ids = set(before) | set(after) + regressed: list[dict[str, object]] = [] + fixed: list[dict[str, object]] = [] + changed_bad: list[dict[str, object]] = [] + new_cases: list[dict[str, object]] = [] + other_changed: list[dict[str, object]] = [] + for cid in ids: + prev = before.get(cid) + cur = after.get(cid) + prev_status = prev.status if prev else None + cur_status = cur.status if cur else None + if prev 
is None and cur is not None: + new_cases.append({"id": cid, "to": cur_status}) + continue + if cur is None or prev is None: + continue + if prev_status == cur_status: + continue + entry = {"id": cid, "from": prev_status, "to": cur_status, "reason": _reason_text(cur)} + was_bad = cid in before_bad + now_bad = cid in after_bad + if not was_bad and now_bad: + regressed.append(entry) + elif was_bad and not now_bad: + fixed.append(entry) + elif was_bad and now_bad: + changed_bad.append(entry) + else: + other_changed.append(entry) + return { + "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "tag": tag, + "note": note, + "run_id": run_id, + "run_dir": str(run_dir), + "results_path": str(results_path), + "fail_on": fail_on, + "require_assert": require_assert, + "scope_hash": scope_hash, + "regressed": sorted(regressed, key=lambda r: r["id"]), + "fixed": sorted(fixed, key=lambda r: r["id"]), + "changed_bad": sorted(changed_bad, key=lambda r: r["id"]), + "changed_other": sorted(other_changed, key=lambda r: r["id"]), + "new_cases": sorted(new_cases, key=lambda r: r["id"]), + } + + +def _append_effective_diff(tag_dir: Path, diff_entry: Mapping[str, object]) -> None: + tag_dir.mkdir(parents=True, exist_ok=True) + changes_path = tag_dir / "effective_changes.jsonl" + with changes_path.open("a", encoding="utf-8") as f: + f.write(json.dumps(diff_entry, ensure_ascii=False, sort_keys=True) + "\n") + + +def _load_effective_diff(tag_dir: Path) -> Optional[dict]: + path = tag_dir / "effective_changes.jsonl" + if not path.exists(): + return None + last: Optional[dict] = None + with path.open("r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + last = json.loads(line) + except Exception: + continue + return last + + +def _update_effective_snapshot( + *, + artifacts_dir: Path, + tag: str, + cases_hash: str, + cases_path: Path, + suite_case_ids: list[str], + executed_results: list[RunResult], + run_folder: Path, + scope: Mapping[str, object], + scope_hash: str, + fail_on: str, + require_assert: bool, +) -> tuple[Path, Path, dict[str, RunResult], dict[str, RunResult]]: + effective_results, effective_meta, effective_results_path = _load_effective_results(artifacts_dir, tag) + if effective_meta and effective_meta.get("cases_hash") and effective_meta["cases_hash"] != cases_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} use a different cases_hash; refusing to merge." + ) + if effective_meta and effective_meta.get("scope_hash") and effective_meta["scope_hash"] != scope_hash: + raise ValueError( + f"Existing effective results for tag {tag!r} have a different scope; refusing to merge." 
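(The cases_hash/scope_hash guard above refuses to merge runs that were produced with different filters. The hash itself mirrors `_scope_payload`/`_scope_hash` from `runs/scope.py` further down in this patch: a SHA-256 over a canonical JSON payload. A standalone sketch of that idea, with made-up filter values:)

```python
import hashlib
import json

# Canonical-JSON hash of a filter scope; same filters -> same hash, so runs merge,
# while any change to the filters yields a different hash and the merge is refused.
def scope_hash(payload: dict) -> str:
    canonical = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

scope_a = {"cases_hash": "abc123", "include_tags": ["retail"], "exclude_tags": None,
           "include_ids": None, "exclude_ids": None}
scope_b = dict(scope_a, include_tags=["retail", "smoke"])

assert scope_hash(scope_a) == scope_hash(dict(scope_a))  # identical scope merges
assert scope_hash(scope_a) != scope_hash(scope_b)        # different scope is rejected
```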
+ ) + + planned_pool: set[str] + if effective_meta and isinstance(effective_meta.get("planned_case_ids"), list): + planned_pool = {str(cid) for cid in effective_meta["planned_case_ids"]} + else: + planned_pool = set(suite_case_ids) + + before_effective = dict(effective_results) + for res in executed_results: + effective_results[res.id] = res + _write_effective_results(effective_results_path, effective_results) + + summary_counts = summarize(effective_results.values()) + executed_total = len(effective_results) + missed_total = len(_missed_case_ids(planned_pool, effective_results)) + meta_path = effective_results_path.with_name("effective_meta.json") + built_from = set(effective_meta.get("built_from_runs", [])) if effective_meta else set() + built_from.add(str(run_folder)) + effective_meta_payload = { + "tag": tag, + "cases_hash": cases_hash, + "cases_path": str(cases_path), + "planned_case_ids": sorted(planned_pool), + "planned_total": len(planned_pool), + "executed_total": executed_total, + "missed_total": missed_total, + "counts": summary_counts, + "updated_at": datetime.datetime.utcnow().isoformat() + "Z", + "built_from_runs": sorted(built_from), + "effective_results_path": str(effective_results_path), + "scope": scope, + "scope_hash": scope_hash, + "fail_on": fail_on, + "require_assert": require_assert, + } + meta_path.parent.mkdir(parents=True, exist_ok=True) + dump_json(meta_path, effective_meta_payload) + return effective_results_path, meta_path, before_effective, effective_results + + +__all__ = [ + "_append_effective_diff", + "_build_effective_diff", + "_load_effective_results", + "_load_effective_diff", + "_update_effective_snapshot", + "_write_effective_results", +] diff --git a/examples/demo_qa/runs/io.py b/examples/demo_qa/runs/io.py new file mode 100644 index 0000000..54db989 --- /dev/null +++ b/examples/demo_qa/runs/io.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Iterable + +from ..runner import RunResult + + +def write_results(out_path: Path, results: Iterable[RunResult]) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open("w", encoding="utf-8") as f: + for res in results: + f.write(json.dumps(res.to_json(), ensure_ascii=False, sort_keys=True, separators=(",", ":")) + "\n") + + +__all__ = ["write_results"] diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py new file mode 100644 index 0000000..704f643 --- /dev/null +++ b/examples/demo_qa/runs/layout.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Optional + + +def _sanitize_tag(tag: str) -> str: + cleaned = "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "_" for ch in tag) + return cleaned or "tag" + + +def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: + base = artifacts_dir / "runs" / "tags" / _sanitize_tag(tag) + return base / "effective_results.jsonl", base / "effective_meta.json" + + +def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: + runs_dir = artifacts_dir / "runs" + if tag: + slug = _sanitize_tag(tag) + return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt" + return runs_dir / "latest.txt", runs_dir / "latest_results.txt" + + +def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + latest_file, _ = _latest_markers(artifacts_dir, tag) + if latest_file.exists(): + content = 
latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + return None + + +def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + _, latest_file = _latest_markers(artifacts_dir, tag) + if latest_file.exists(): + content = latest_file.read_text(encoding="utf-8").strip() + if content: + return Path(content) + latest_run = _load_latest_run(artifacts_dir, tag) + if latest_run: + summary_path = latest_run / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + results_path = summary.get("results_path") + if results_path: + return Path(results_path) + except Exception: + pass + return None + + +def _load_run_meta(run_path: Path | None) -> Optional[dict]: + if run_path is None: + return None + meta_path = run_path / "run_meta.json" + if not meta_path.exists(): + return None + try: + return json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return None + + +def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: + if results_path is None: + return None + run_dir = results_path.parent + summary_path = run_dir / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + run_dir_from_summary = summary.get("run_dir") + if run_dir_from_summary: + return Path(run_dir_from_summary) + except Exception: + pass + return run_dir + + +def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: + marker_pairs = {_latest_markers(artifacts_dir, None)} + if tag: + marker_pairs.add(_latest_markers(artifacts_dir, tag)) + for latest_path, latest_results_path in marker_pairs: + latest_path.parent.mkdir(parents=True, exist_ok=True) + latest_path.write_text(str(run_folder), encoding="utf-8") + latest_results_path.write_text(str(results_path), encoding="utf-8") + + +__all__ = [ + "_effective_paths", + "_latest_markers", + "_load_latest_results", + "_load_latest_run", + "_load_run_meta", + "_run_dir_from_results_path", + "_sanitize_tag", + "_update_latest_markers", +] diff --git a/examples/demo_qa/runs/scope.py b/examples/demo_qa/runs/scope.py new file mode 100644 index 0000000..571ace4 --- /dev/null +++ b/examples/demo_qa/runs/scope.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import hashlib +import json +from typing import Mapping, Optional, Set + + +def _scope_payload( + *, + cases_hash: str, + include_tags: Set[str] | None, + exclude_tags: Set[str] | None, + include_ids: Set[str] | None, + exclude_ids: Set[str] | None, +) -> dict[str, object]: + return { + "cases_hash": cases_hash, + "include_tags": sorted(include_tags) if include_tags else None, + "exclude_tags": sorted(exclude_tags) if exclude_tags else None, + "include_ids": sorted(include_ids) if include_ids else None, + "exclude_ids": sorted(exclude_ids) if exclude_ids else None, + } + + +def _scope_hash(scope: Mapping[str, object]) -> str: + payload = json.dumps(scope, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +__all__ = ["_scope_hash", "_scope_payload"] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index e9d92fa..5efb13d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -8,16 +8,9 @@ import pytest -from examples.demo_qa.batch import ( - _fingerprint_dir, - _latest_markers, - _missed_case_ids, - _update_latest_markers, - bad_statuses, - is_failure, - 
render_markdown, - write_results, -) +from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +from examples.demo_qa.runs.coverage import _missed_case_ids +from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers from examples.demo_qa.runner import RunResult, diff_runs diff --git a/tests/test_demo_qa_commands.py b/tests/test_demo_qa_commands.py new file mode 100644 index 0000000..a0afba0 --- /dev/null +++ b/tests/test_demo_qa_commands.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +import subprocess +import sys + + +def test_commands_report_import_is_lightweight() -> None: + script = """ +import sys + +import examples.demo_qa.commands.report # noqa: F401 + +heavy = [name for name in sys.modules if name.startswith("examples.demo_qa.llm") or name.startswith("examples.demo_qa.provider")] +if heavy: + raise SystemExit(f"Heads up: heavy deps imported: {heavy}") +""" + result = subprocess.run([sys.executable, "-c", script], capture_output=True, text=True) + assert result.returncode == 0, result.stderr or result.stdout From cf9d33d0b39ead42bfce5e4f34e23ebd5dca0592 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 22 Dec 2025 02:23:41 +0300 Subject: [PATCH 53/92] doc update --- README_demo_qa.md | 89 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/README_demo_qa.md b/README_demo_qa.md index 2acb56f..23cea09 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -55,7 +55,14 @@ python -m examples.demo_qa.cli chat --data demo_data --schema demo_data/schema.j ## Batch -Запустить пакетный прогон вопросов из `cases.jsonl` (по одному JSON на строку, поля `id`, `question`, опционально `expected`/`expected_regex`/`expected_contains` и `skip`): +Запустить пакетный прогон вопросов из файла кейсов (`cases.jsonl` или `cases.json`). + +Поддерживаемые форматы: + +* **JSONL**: по одному JSON-объекту на строку. +* **JSON**: массив объектов. + +Поля кейса: `id`, `question`, опционально `expected`/`expected_regex`/`expected_contains` и `skip`. ```bash python -m examples.demo_qa.cli batch \ @@ -65,12 +72,80 @@ python -m examples.demo_qa.cli batch \ --out results.jsonl ``` -* Артефакты по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). -* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов и, при наличии `--compare-to`, diff по прогрессу. -* Флаги `--fail-on (error|bad|unchecked|any|skipped)`, `--max-fails`, `--fail-fast`, `--require-assert`, `--compare-to`, `--only-failed-from/--only-failed` и `--plan-only` управляют выбором кейсов, остановкой и кодом выхода (0/1/2). -* Без `--out` результаты складываются в `/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска. -* Быстрый фокус на упавших: `--only-failed` возьмёт `runs/latest/results.jsonl`, `--show-artifacts` печатает пути, репро-команды выводятся для каждого FAIL. -* Команды уровня кейса: `demo_qa case run --cases ...` и `demo_qa case open --run runs/latest` для быстрого воспроизведения. +Что сохраняется: + +* Артефакты по кейсам по умолчанию пишутся в `/.runs/runs/_/cases/_/` (`plan.json`, `context.json`, `answer.txt`, `raw_synth.txt`, `error.txt`). +* `results.jsonl` содержит по строке на кейс, рядом сохраняется `summary.json` с агрегацией статусов. 
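For reference, a minimal cases file in the format described above might look like this (the ids, questions and expectations here are invented for illustration):

```jsonl
{"id": "orders_total", "question": "How many orders were placed in total?", "expected_regex": "\\b\\d+\\b"}
{"id": "top_product", "question": "Which product has the highest revenue?", "expected_contains": "Widget"}
{"id": "flaky_case", "question": "What is the average basket size?", "skip": true}
```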
+* Без `--out` результаты складываются в `/.runs/runs/_/results.jsonl`, а `runs/latest.txt` указывает на последнюю папку запуска, `runs/latest_results.txt` — на путь к results. +* При `Ctrl-C` сохраняются частичные результаты: уже пройденные кейсы попадают в `results.jsonl/summary.json`, а прогон помечается как `interrupted`. + +Ключевые флаги: + +* `--fail-on (error|bad|unchecked|any|skipped)`, `--max-fails`, `--fail-fast`, `--require-assert` — остановка/код выхода (0/1/2) и строгость проверок. +* `--only-failed` / `--only-failed-from PATH` — перепрогон только плохих кейсов (baseline = latest либо явно заданный results). +* `--only-missed` / `--only-missed-from PATH` — “добить” только те кейсы, которые отсутствуют в baseline results (удобно после Ctrl-C). +* `--tag TAG` / `--note "..."` — пометить прогон как часть эксперимента. Для `--tag` поддерживается “effective snapshot”: результаты по тегу накапливаются инкрементально, так что `--only-failed/--only-missed` по тегу корректно работают даже после частичных прогонов. +* `--plan-only` — строить планы без выполнения. + +Команды уровня кейса: + +* `python -m examples.demo_qa.cli case run --cases ...` — прогнать один кейс. +* `python -m examples.demo_qa.cli case open --data ...` — открыть папку артефактов кейса. + +Отчёты и история: + +* `python -m examples.demo_qa.cli stats --data --last 10` — последние прогоны. +* `python -m examples.demo_qa.cli report tag --data --tag ` — сводка по “effective” результатам тега. +* `python -m examples.demo_qa.cli report run --data --run runs/latest` — сводка по конкретному run. +* `python -m examples.demo_qa.cli history case --data [--tag ]` — история по кейсу. + +### Удобные алиасы (bash/zsh) + +Добавьте в `~/.bashrc` или `~/.zshrc` и перезапустите shell. + +```bash +# 1) Настройте свои дефолты под проект/датасет +export DQ_DATA="./_demo_data/shop" +export DQ_SCHEMA="$DQ_DATA/schema.yaml" +export DQ_CASES="./examples/demo_qa/cases/retail_cases.json" +export DQ_OUT="$DQ_DATA/.runs/results.jsonl" +export DQ_TAG="retail-iter1" + +# 2) Базовая команда +dq() { python -m examples.demo_qa.cli "$@"; } + +# 3) Самые частые сценарии +dq-batch() { dq batch --data "$DQ_DATA" --schema "$DQ_SCHEMA" --cases "$DQ_CASES" --out "$DQ_OUT" "$@"; } +dq-failed() { dq-batch --only-failed "$@"; } +dq-missed() { dq-batch --only-missed "$@"; } + +# Tagged (effective) workflow +dq-batch-tag() { dq-batch --tag "$DQ_TAG" "$@"; } +dq-failed-tag() { dq-batch --tag "$DQ_TAG" --only-failed "$@"; } +dq-missed-tag() { dq-batch --tag "$DQ_TAG" --only-missed "$@"; } + +# Отчёты +dq-stats() { dq stats --data "$DQ_DATA" "$@"; } +dq-report() { dq report tag --data "$DQ_DATA" --tag "$DQ_TAG" "$@"; } +dq-run() { dq report run --data "$DQ_DATA" --run "${1:-runs/latest}"; } +dq-hist() { dq history case "$1" --data "$DQ_DATA" --tag "$DQ_TAG" "${@:2}"; } + +# Дебаг кейса +dq-case() { dq case run "$1" --cases "$DQ_CASES" --data "$DQ_DATA" --schema "$DQ_SCHEMA" "${@:2}"; } +dq-open() { dq case open "$1" --data "$DQ_DATA" "${@:2}"; } +``` + +Минимальный набор, если не хочется “тегов”: + +```bash +dq() { python -m examples.demo_qa.cli "$@"; } +dq-batch() { dq batch --data "$DQ_DATA" --schema "$DQ_SCHEMA" --cases "$DQ_CASES" --out "$DQ_OUT" "$@"; } +dq-failed() { dq-batch --only-failed "$@"; } +dq-missed() { dq-batch --only-missed "$@"; } +dq-stats() { dq stats --data "$DQ_DATA" --last 10; } +``` + + ## Local proxy Для OpenAI-совместимых серверов (например, LM Studio) укажите `base_url` с `.../v1` и From 9a4a9f3aad1adab9cd35b60ba90c99184d494c8c Mon Sep 17 
00:00:00 2001 From: Alex Date: Mon, 22 Dec 2025 02:48:03 +0300 Subject: [PATCH 54/92] =?UTF-8?q?=D0=B0=D0=BF=D0=B3=D1=80=D0=B5=D0=B9?= =?UTF-8?q?=D0=B4=20=D0=B2=D0=B5=D1=80=D1=81=D0=B8=D0=B8=20=D0=B4=D0=BB?= =?UTF-8?q?=D1=8F=20=D0=BF=D1=80=D0=B0=D0=B2=D0=B8=D0=BB=D1=8C=D0=BD=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D0=BC=D0=B5=D1=80=D0=B4=D0=B6=D0=B0=20=D1=81=20?= =?UTF-8?q?main?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 710ebde..4a9053c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "fetchgraph" -version = "0.1.2" +version = "0.1.3" description = "Graph-like planning → context fetching → synthesis agent (library-style)." readme = "README.md" requires-python = ">=3.11" From aa62371a19e1e30174176f2e83b5e246c1358797 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 25 Dec 2025 20:36:33 +0300 Subject: [PATCH 55/92] =?UTF-8?q?demo=5Faq=20(make)=20-=20=D0=B0=D0=BB?= =?UTF-8?q?=D0=B8=D0=B0=D1=81=D1=8B=20=D0=B4=D0=BB=D1=8F=20=D0=BA=D0=BE?= =?UTF-8?q?=D0=BC=D0=B0=D0=BD=D0=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0491f57 --- /dev/null +++ b/Makefile @@ -0,0 +1,299 @@ +# Makefile — алиасы для examples.demo_qa (без ~/.bashrc / ~/.zshrc) +# +# Быстрый старт: +# make init +# make chat +# make batch +# make help +# +# Примечание про venv: +# - Makefile НЕ "активирует" venv в текущем терминале (это невозможно из make). +# - Но он автоматически использует .venv/bin/python, если он существует. 
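(For orientation, a typical local loop with the targets defined below might look as follows; the tag value is invented and the target names are the ones documented in this Makefile's help text.)

```bash
make init                              # one-time: write .demo_qa.mk with DATA/SCHEMA/CASES
make batch-tag TAG=retail-iter1 NOTE="baseline run"
make batch-failed                      # re-run only the failing cases from the latest run
make report-tag TAG=retail-iter1       # summary over the tag's effective snapshot
```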
+ +SHELL := /bin/bash + +# ============================================================================== +# 1) Локальный конфиг (не коммитить; удобно добавить в .gitignore) +# ============================================================================== +CONFIG ?= .demo_qa.mk +-include $(CONFIG) + +# ============================================================================== +# 2) Значения по умолчанию (для make init) +# ============================================================================== +DEFAULT_DATA := _demo_data/shop +DEFAULT_SCHEMA := _demo_data/shop/schema.yaml +DEFAULT_CASES := examples/demo_qa/cases/retail_cases.json + +# ============================================================================== +# 3) Python / CLI +# ============================================================================== +VENV ?= .venv +PYTHON ?= $(if $(wildcard $(VENV)/bin/python),$(VENV)/bin/python,python) +CLI := $(PYTHON) -m examples.demo_qa.cli + +# ============================================================================== +# 4) Пути demo_qa (можно переопределять через CLI или в $(CONFIG)) +# ============================================================================== +DATA ?= +SCHEMA ?= +CASES ?= +OUT ?= $(DATA)/.runs/results.jsonl + +# ============================================================================== +# 5) Параметры команд +# ============================================================================== +TAG ?= +NOTE ?= +CASE ?= +LIMIT ?= 50 + +ONLY_FAILED_FROM ?= +ONLY_MISSED_FROM ?= + +BASE ?= +NEW ?= +DIFF_OUT ?= $(DATA)/.runs/diff.md +JUNIT ?= $(DATA)/.runs/diff.junit.xml + +MAX_FAILS ?= 5 + +# ============================================================================== +# 6) Настройки LLM-конфига (редактирование/просмотр) +# ============================================================================== +# Если у тебя конфиг лежит иначе — переопредели: +# make llm-edit LLM_TOML=path/to/demo_qa.toml +LLM_TOML ?= demo_qa.toml +LLM_TOML_EXAMPLE ?= demo_qa.toml.example + +# macOS: открываем в TextEdit +OPEN ?= open +EDITOR_APP ?= TextEdit + +# ============================================================================== +# 7) Вспомогательные флаги (не передавать пустые) +# ============================================================================== +TAG_FLAG := $(if $(strip $(TAG)),--tag "$(TAG)",) +NOTE_FLAG := $(if $(strip $(NOTE)),--note "$(NOTE)",) +LIMIT_FLAG := $(if $(strip $(LIMIT)),--limit $(LIMIT),) + +# ============================================================================== +# 8) PHONY +# ============================================================================== +.PHONY: help init show-config check ensure-runs-dir venv-check \ + llm-init llm-show llm-edit \ + chat \ + batch batch-tag batch-failed batch-failed-from \ + batch-missed batch-missed-from batch-fail-fast batch-max-fails \ + stats history-case report-tag case-run case-open compare + +# ============================================================================== +# help (на русском) +# ============================================================================== +help: + @echo "" + @echo "DemoQA: Makefile-алиасы (без ~/.bashrc или ~/.zshrc)" + @echo "===================================================" + @echo "" + @echo "Быстрый старт:" + @echo " make init" + @echo " make chat" + @echo "" + @echo "Конфигурация:" + @echo " Настройки хранятся в: $(CONFIG)" + @echo " Можно переопределять переменные так:" + @echo " make chat DATA=_demo_data/shop SCHEMA=_demo_data/shop/schema.yaml" 
+ @echo "" + @echo "Основные переменные:" + @echo " DATA - путь к датасету (например: _demo_data/shop)" + @echo " SCHEMA - путь к schema.yaml" + @echo " CASES - путь к cases.json" + @echo " OUT - куда писать results.jsonl (по умолчанию: \$$DATA/.runs/results.jsonl)" + @echo "" + @echo "Команды:" + @echo " make chat - интерактивный чат" + @echo " make batch - полный прогон всего набора" + @echo " make batch-tag TAG=... NOTE='...' - полный прогон с тегом и заметкой" + @echo " make batch-failed - перепрогон только упавших (baseline = latest)" + @echo " make batch-failed-from ONLY_FAILED_FROM=path/results.jsonl - only-failed от явного baseline" + @echo " make batch-missed [TAG=...] - добить missed (если TAG задан — относительно effective по тегу)" + @echo " make batch-missed-from ONLY_MISSED_FROM=path/results.jsonl - добить missed от явного baseline" + @echo " make batch-fail-fast - быстрый smoke (остановиться на первом фейле)" + @echo " make batch-max-fails MAX_FAILS=5 - остановиться после N фейлов" + @echo " make stats - stats по последним 10 прогонов" + @echo "" + @echo "Диагностика / анализ:" + @echo " make history-case CASE=case_42 [TAG=...] [LIMIT=50] - история по кейсу" + @echo " make report-tag TAG=... - сводка по тегу (effective snapshot)" + @echo " make case-run CASE=case_42 - прогнать один кейс" + @echo " make case-open CASE=case_42 - открыть артефакты кейса" + @echo "" + @echo "Сравнение результатов:" + @echo " make compare BASE=... NEW=... [DIFF_OUT=...] [JUNIT=...]" + @echo "" + @echo "LLM конфиг:" + @echo " make llm-init - создать $(LLM_TOML) из $(LLM_TOML_EXAMPLE)" + @echo " make llm-show - показать первые ~200 строк $(LLM_TOML)" + @echo " make llm-edit - открыть $(LLM_TOML) в TextEdit (macOS)" + @echo "" + @echo "Сервисные:" + @echo " make venv-check - показать, какой python будет использоваться" + @echo " make show-config - показать текущие значения переменных" + @echo "" + +# ============================================================================== +# Конфиг проекта +# ============================================================================== +init: + @set -euo pipefail; \ + if [ -f "$(CONFIG)" ] && [ "$${FORCE:-0}" != "1" ]; then \ + echo "Файл $(CONFIG) уже существует. Чтобы перезаписать: FORCE=1 make init"; \ + exit 1; \ + fi; \ + DATA="$${DATA:-$(DEFAULT_DATA)}"; \ + SCHEMA="$${SCHEMA:-$(DEFAULT_SCHEMA)}"; \ + CASES="$${CASES:-$(DEFAULT_CASES)}"; \ + mkdir -p "$$DATA/.runs"; \ + { \ + echo "# Локальные настройки demo_qa (генерируется командой: make init)"; \ + echo "# Можно редактировать руками. 
Рекомендуется добавить в .gitignore."; \ + echo "DATA=$$DATA"; \ + echo "SCHEMA=$$SCHEMA"; \ + echo "CASES=$$CASES"; \ + echo "# OUT можно не задавать: по умолчанию OUT=\$${DATA}/.runs/results.jsonl"; \ + echo "# OUT=$$DATA/.runs/results.jsonl"; \ + } > "$(CONFIG)"; \ + echo "Ок: создан $(CONFIG)"; \ + echo "Создана папка: $$DATA/.runs"; \ + echo "Дальше: make chat / make batch / make help" + +show-config: + @echo "CONFIG = $(CONFIG)" + @echo "VENV = $(VENV)" + @echo "PYTHON = $(PYTHON)" + @echo "DATA = $(DATA)" + @echo "SCHEMA = $(SCHEMA)" + @echo "CASES = $(CASES)" + @echo "OUT = $(OUT)" + @echo "LLM_TOML= $(LLM_TOML)" + @echo "TAG = $(TAG)" + @echo "NOTE = $(NOTE)" + @echo "CASE = $(CASE)" + @echo "LIMIT = $(LIMIT)" + +venv-check: + @if [ -x "$(VENV)/bin/python" ]; then \ + echo "OK: venv найден: $(VENV) (использую $(VENV)/bin/python)"; \ + else \ + echo "INFO: venv не найден: $(VENV) (использую системный python: $$(command -v $(PYTHON) || echo 'python'))"; \ + fi + +check: + @test -n "$(strip $(DATA))" || (echo "DATA не задан. Запусти: make init (или передай DATA=...)" && exit 1) + @test -n "$(strip $(SCHEMA))" || (echo "SCHEMA не задан. Запусти: make init (или передай SCHEMA=...)" && exit 1) + @test -n "$(strip $(CASES))" || (echo "CASES не задан. Запусти: make init (или передай CASES=...)" && exit 1) + +ensure-runs-dir: check + @mkdir -p "$(DATA)/.runs" + +# ============================================================================== +# LLM конфиг (без проверок доступности — это задача приложения) +# ============================================================================== +llm-init: + @set -euo pipefail; \ + if [ -f "$(LLM_TOML)" ]; then \ + echo "Файл уже существует: $(LLM_TOML)"; \ + exit 0; \ + fi; \ + if [ -f "$(LLM_TOML_EXAMPLE)" ]; then \ + cp "$(LLM_TOML_EXAMPLE)" "$(LLM_TOML)"; \ + echo "Ок: создан $(LLM_TOML) из $(LLM_TOML_EXAMPLE)"; \ + else \ + echo "Не найден пример: $(LLM_TOML_EXAMPLE). Создай $(LLM_TOML) вручную."; \ + exit 1; \ + fi + +llm-show: + @echo "LLM config: $(LLM_TOML)" + @echo "----------------------------------------" + @sed -n '1,200p' "$(LLM_TOML)" 2>/dev/null || (echo "Файл не найден: $(LLM_TOML). Сделай: make llm-init" && exit 1) + +llm-edit: + @$(OPEN) -a "$(EDITOR_APP)" "$(LLM_TOML)" + +# ============================================================================== +# Алиасы под команды CLI +# ============================================================================== +chat: check + @$(CLI) chat --data "$(DATA)" --schema "$(SCHEMA)" + +# 1) Полный прогон всего набора +batch: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" + +# 2) Полный прогон с тегом + заметка +batch-tag: ensure-runs-dir + @test -n "$(strip $(TAG))" || (echo "TAG обязателен: make batch-tag TAG=..." 
&& exit 1) + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" $(TAG_FLAG) $(NOTE_FLAG) + +# 3) only-failed от latest +batch-failed: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" --only-failed + +# 4) only-failed от явного baseline +batch-failed-from: ensure-runs-dir + @test -n "$(strip $(ONLY_FAILED_FROM))" || (echo "Нужно задать ONLY_FAILED_FROM=.../results.jsonl" && exit 1) + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" \ + --only-failed-from "$(ONLY_FAILED_FROM)" + +# 5) only-missed (relative to effective по TAG или latest) +batch-missed: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" \ + $(TAG_FLAG) --only-missed + +# 6) only-missed от явного baseline +batch-missed-from: ensure-runs-dir + @test -n "$(strip $(ONLY_MISSED_FROM))" || (echo "Нужно задать ONLY_MISSED_FROM=.../results.jsonl" && exit 1) + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" \ + --only-missed --only-missed-from "$(ONLY_MISSED_FROM)" + +# 7) fail-fast / max-fails +batch-fail-fast: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" --fail-fast + +batch-max-fails: ensure-runs-dir + @$(CLI) batch --data "$(DATA)" --schema "$(SCHEMA)" --cases "$(CASES)" --out "$(OUT)" --max-fails "$(MAX_FAILS)" + +# stats (последние 10) +stats: check + @$(CLI) stats --data "$(DATA)" --last 10 + +# 8) История по кейсу (TAG опционален) +history-case: check + @test -n "$(strip $(CASE))" || (echo "Нужно задать CASE=case_42" && exit 1) + @$(CLI) history case "$(CASE)" --data "$(DATA)" $(TAG_FLAG) $(LIMIT_FLAG) + +# 9) Сводка по тегу +report-tag: check + @test -n "$(strip $(TAG))" || (echo "TAG обязателен: make report-tag TAG=..." 
&& exit 1) + @$(CLI) report tag --data "$(DATA)" --tag "$(TAG)" + +# 10) Дебаг 1 кейса +case-run: check + @test -n "$(strip $(CASE))" || (echo "Нужно задать CASE=case_42" && exit 1) + @$(CLI) case run "$(CASE)" --cases "$(CASES)" --data "$(DATA)" --schema "$(SCHEMA)" + +case-open: check + @test -n "$(strip $(CASE))" || (echo "Нужно задать CASE=case_42" && exit 1) + @$(CLI) case open "$(CASE)" --data "$(DATA)" + +# compare (diff.md + junit) +compare: check + @test -n "$(strip $(BASE))" || (echo "Нужно задать BASE=.../results_prev.jsonl" && exit 1) + @test -n "$(strip $(NEW))" || (echo "Нужно задать NEW=.../results.jsonl" && exit 1) + @mkdir -p "$(DATA)/.runs" + @$(CLI) compare \ + --base "$(BASE)" \ + --new "$(NEW)" \ + --out "$(DIFF_OUT)" \ + --junit "$(JUNIT)" From daae6a3dd93b9214b5473020556d74d9f4e903ff Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Fri, 26 Dec 2025 23:25:02 +0300 Subject: [PATCH 56/92] Handle JSON array case files in demo QA runner --- examples/demo_qa/runner.py | 96 ++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index f0575dd..151c4df 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -399,46 +399,64 @@ def load_cases(path: Path) -> List[Case]: raise FileNotFoundError(f"Cases file not found: {path}") cases: List[Case] = [] seen_ids: set[str] = set() - with path.open("r", encoding="utf-8") as f: - for lineno, line in enumerate(f, start=1): - line = line.strip() - if not line: - continue + text = path.read_text(encoding="utf-8") + stripped = text.lstrip() + + def add_case(payload: Mapping[str, object], location: str) -> None: + if not isinstance(payload, Mapping): + raise ValueError(f"Case on {location} must be an object") + if "id" not in payload or "question" not in payload: + raise ValueError(f"Case on {location} missing required fields 'id' and 'question'") + case_id = str(payload["id"]) + if case_id in seen_ids: + raise ValueError(f"Duplicate case id {case_id!r} on {location}") + seen_ids.add(case_id) + expected = payload.get("expected") + expected_regex = payload.get("expected_regex") + expected_contains = payload.get("expected_contains") + for field_name, val in [ + ("expected", expected), + ("expected_regex", expected_regex), + ("expected_contains", expected_contains), + ]: + if val is not None and str(val).strip() == "": + raise ValueError(f"{field_name} must not be empty on {location}") + if expected_regex is not None: try: - payload = json.loads(line) - except json.JSONDecodeError as exc: - raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc - if "id" not in payload or "question" not in payload: - raise ValueError(f"Case on line {lineno} missing required fields 'id' and 'question'") - case_id = str(payload["id"]) - if case_id in seen_ids: - raise ValueError(f"Duplicate case id {case_id!r} on line {lineno}") - seen_ids.add(case_id) - expected = payload.get("expected") - expected_regex = payload.get("expected_regex") - expected_contains = payload.get("expected_contains") - for field_name, val in [ - ("expected", expected), - ("expected_regex", expected_regex), - ("expected_contains", expected_contains), - ]: - if val is not None and str(val).strip() == "": - raise ValueError(f"{field_name} must not be empty on line {lineno}") - if expected_regex is not None: - try: - re.compile(expected_regex) - except re.error as exc: - raise ValueError(f"Invalid 
expected_regex on line {lineno}: {exc}") from exc - case = Case( - id=case_id, - question=str(payload["question"]), - expected=expected, - expected_regex=expected_regex, - expected_contains=expected_contains, - tags=list(payload.get("tags", []) or []), - skip=bool(payload.get("skip", False)), - ) - cases.append(case) + re.compile(expected_regex) + except re.error as exc: + raise ValueError(f"Invalid expected_regex on {location}: {exc}") from exc + case = Case( + id=case_id, + question=str(payload["question"]), + expected=expected, + expected_regex=expected_regex, + expected_contains=expected_contains, + tags=list(payload.get("tags", []) or []), + skip=bool(payload.get("skip", False)), + ) + cases.append(case) + + if stripped.startswith("["): + try: + payloads = json.loads(text) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON array: {exc}") from exc + if not isinstance(payloads, list): + raise ValueError("Cases JSON must be an array of objects") + for index, payload in enumerate(payloads, start=1): + add_case(payload, f"array index {index}") + return cases + + for lineno, line in enumerate(text.splitlines(), start=1): + line = line.strip() + if not line: + continue + try: + payload = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc + add_case(payload, f"line {lineno}") return cases From 14d4e183b26d0e277e83d9d51331ccfc1f3f8db1 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 26 Dec 2025 23:56:48 +0300 Subject: [PATCH 57/92] =?UTF-8?q?llm(fix)=20=D1=84=D0=B8=D0=BA=D1=81=D0=B8?= =?UTF-8?q?=D0=BC=20=D1=81=D0=BA=D1=80=D1=8B=D1=82=D1=8B=D0=B9=20=D0=B1?= =?UTF-8?q?=D0=B0=D0=B3=20-=20=D0=B5=D1=81=D0=BB=D0=B8=20=D1=84=D0=B0?= =?UTF-8?q?=D0=B9=D0=BB=20=D0=BE=D1=82=D1=81=D1=83=D1=82=D1=81=D1=82=D0=B2?= =?UTF-8?q?=D1=83=D0=B5=D1=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/batch.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 8fa3266..91ab75e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -7,7 +7,7 @@ import sys import uuid from pathlib import Path -from typing import Iterable, Mapping, Optional +from typing import Mapping, Optional from .llm.factory import build_llm from .logging_config import configure_logging @@ -30,10 +30,14 @@ ) from .runs.case_history import _append_case_history from .runs.coverage import _missed_case_ids -from .runs.effective import _append_effective_diff, _build_effective_diff, _load_effective_results, _update_effective_snapshot +from .runs.effective import ( + _append_effective_diff, + _build_effective_diff, + _load_effective_results, + _update_effective_snapshot, +) from .runs.io import write_results from .runs.layout import ( - _latest_markers, _load_latest_results, _load_latest_run, _load_run_meta, @@ -319,7 +323,6 @@ def handle_batch(args) -> int: run_id = uuid.uuid4().hex[:8] interrupted = False interrupted_at_case_id: str | None = None - cases_hash = _hash_file(args.cases) try: settings = load_settings(config_path=args.config, data_dir=args.data) @@ -328,6 +331,7 @@ def handle_batch(args) -> int: return 2 try: cases = load_cases(args.cases) + cases_hash = _hash_file(args.cases) except Exception as exc: print(f"Cases error: {exc}", file=sys.stderr) return 2 From d425cc5506d00cb2d4b10d92b7b76b2b81feec2f Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko 
<74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:07:54 +0300 Subject: [PATCH 58/92] Improve demo QA typing and timestamps --- examples/demo_qa/batch.py | 178 +++++++++++++++----------- examples/demo_qa/data_gen.py | 5 +- examples/demo_qa/runner.py | 24 +++- examples/demo_qa/runs/case_history.py | 2 +- examples/demo_qa/runs/effective.py | 4 +- 5 files changed, 128 insertions(+), 85 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 8fa3266..6a19f71 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -14,6 +14,7 @@ from .provider_factory import build_provider from .runner import ( Case, + DiffReport, EventLogger, RunResult, RunTimings, @@ -51,13 +52,40 @@ def write_summary(out_path: Path, summary: dict) -> Path: return summary_path +def _coerce_number(value: object | None) -> float | None: + if isinstance(value, bool): + return float(value) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value) + except ValueError: + return None + return None + + +def _coerce_int(value: object | None) -> int: + number = _coerce_number(value) + if number is None: + return 0 + return int(number) + + +def _isoformat_utc(dt: datetime.datetime) -> str: + if dt.tzinfo is None: + dt = dt.replace(tzinfo=datetime.timezone.utc) + return dt.astimezone(datetime.timezone.utc).isoformat().replace("+00:00", "Z") + + def _pass_rate(counts: Mapping[str, object]) -> Optional[float]: - total = int(counts.get("total", 0) or 0) - skipped = int(counts.get("skipped", 0) or 0) + total = _coerce_int(counts.get("total")) + skipped = _coerce_int(counts.get("skipped")) denom = total - skipped if denom <= 0: return None - return (counts.get("ok", 0) or 0) / denom + ok = _coerce_number(counts.get("ok")) + return None if ok is None else ok / denom def _hash_file(path: Path) -> str: @@ -185,34 +213,28 @@ def handle_chat(args) -> int: return 0 -def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_assert: bool) -> dict[str, object]: +def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_assert: bool) -> DiffReport: base = load_results(base_path) new = load_results(new_path) return diff_runs(base.values(), new.values(), fail_on=fail_on, require_assert=require_assert) -def render_markdown(compare: dict[str, object], out_path: Optional[Path]) -> str: +def render_markdown(compare: DiffReport, out_path: Optional[Path]) -> str: lines: list[str] = [] - base_counts = compare["base_counts"] # type: ignore[index] - new_counts = compare["new_counts"] # type: ignore[index] - fail_on = compare.get("fail_on", "bad") # type: ignore[assignment] - require_assert = bool(compare.get("require_assert", False)) - - def _bad_total(counts: dict) -> int: - bad_from_compare = compare.get("base_bad_total") if counts is base_counts else compare.get("new_bad_total") - if isinstance(bad_from_compare, int): - return bad_from_compare - bad_set = bad_statuses(str(fail_on), require_assert) + base_counts = compare["base_counts"] + new_counts = compare["new_counts"] + fail_on = compare.get("fail_on", "bad") + require_assert = compare.get("require_assert", False) + + def _bad_total(counts: Mapping[str, object], *, fallback: int) -> int: + bad_set = bad_statuses(str(fail_on), bool(require_assert)) total = 0 for status in bad_set: - try: - total += int(counts.get(status, 0) or 0) - except Exception: - continue - return total + total += _coerce_int(counts.get(status)) + return total or 
fallback - base_bad = _bad_total(base_counts) # type: ignore[arg-type] - new_bad = _bad_total(new_counts) # type: ignore[arg-type] + base_bad = _bad_total(base_counts, fallback=compare.get("base_bad_total", 0)) + new_bad = _bad_total(new_counts, fallback=compare.get("new_bad_total", 0)) lines.append("# Batch comparison report") lines.append("") lines.append("## Summary") @@ -224,7 +246,7 @@ def _bad_total(counts: dict) -> int: lines.append(f"- Median total time: base {base_med:.2f}s → new {new_med:.2f}s (Δ {new_med - base_med:+.2f}s)") lines.append("") - def table(title: str, rows: list[dict]) -> None: + def table(title: str, rows: list[Mapping[str, object]]) -> None: lines.append(f"## {title}") if not rows: lines.append("None") @@ -233,7 +255,8 @@ def table(title: str, rows: list[dict]) -> None: lines.append("| id | status | reason | artifacts |") lines.append("|---|---|---|---|") for row in sorted(rows, key=lambda r: r.get("id", "")): - artifacts = row.get("artifacts", {}) + artifacts_val = row.get("artifacts", {}) + artifacts = artifacts_val if isinstance(artifacts_val, Mapping) else {} links = ", ".join(f"[{k}]({v})" for k, v in sorted(artifacts.items())) lines.append( f"| {row['id']} | {row['from']} → {row['to']} | {row.get('reason','')} | {links or ''} |" @@ -250,13 +273,13 @@ def table(title: str, rows: list[dict]) -> None: return content -def write_junit(compare: dict[str, object], out_path: Path) -> None: +def write_junit(compare: DiffReport, out_path: Path) -> None: import xml.etree.ElementTree as ET suite = ET.Element("testsuite", name="demo_qa_compare") - bad = compare["new_fail"] + compare["still_fail"] # type: ignore[operator] - fixed = compare["fixed"] # type: ignore[assignment] - all_ids_list = list(compare.get("all_ids", []) or []) # type: ignore[arg-type] + bad = compare["new_fail"] + compare["still_fail"] + fixed = compare["fixed"] + all_ids_list = list(compare.get("all_ids", []) or []) all_ids = sorted(all_ids_list) cases_total = len(all_ids) suite.set("tests", str(cases_total)) @@ -267,7 +290,8 @@ def write_junit(compare: dict[str, object], out_path: Path) -> None: tc = ET.SubElement(suite, "testcase", name=row["id"]) msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) - artifacts = row.get("artifacts", {}) + artifacts_val = row.get("artifacts", {}) + artifacts = artifacts_val if isinstance(artifacts_val, Mapping) else {} if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) @@ -315,19 +339,23 @@ def _select_cases_for_rerun( def handle_batch(args) -> int: - started_at = datetime.datetime.utcnow() + started_at = datetime.datetime.now(datetime.timezone.utc) run_id = uuid.uuid4().hex[:8] interrupted = False interrupted_at_case_id: str | None = None - cases_hash = _hash_file(args.cases) + data_dir = Path(args.data) + schema_path = Path(args.schema) + cases_path = Path(args.cases) + config_path = Path(args.config) if args.config else None + cases_hash = _hash_file(cases_path) try: - settings = load_settings(config_path=args.config, data_dir=args.data) + settings = load_settings(config_path=config_path, data_dir=data_dir) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 try: - cases = load_cases(args.cases) + cases = load_cases(cases_path) except Exception as exc: print(f"Cases error: {exc}", file=sys.stderr) return 2 @@ -335,9 +363,7 @@ def handle_batch(args) -> int: baseline_for_filter: Optional[Mapping[str, RunResult]] = None 
baseline_for_compare: Optional[Mapping[str, RunResult]] = None - artifacts_dir = args.artifacts_dir - if artifacts_dir is None: - artifacts_dir = args.data / ".runs" + artifacts_dir = Path(args.artifacts_dir) if args.artifacts_dir else data_dir / ".runs" include_tags = _split_csv(args.include_tags) exclude_tags = _split_csv(args.exclude_tags) @@ -352,9 +378,8 @@ def handle_batch(args) -> int: ) scope_id = _scope_hash(scope) - baseline_filter_path = args.only_failed_from + baseline_filter_path = Path(args.only_failed_from) if args.only_failed_from else None only_failed_baseline_kind: str | None = None - effective_results_path: Path | None = None if args.only_failed_from: only_failed_baseline_kind = "path" elif args.tag and args.only_failed: @@ -373,7 +398,6 @@ def handle_batch(args) -> int: return 2 baseline_for_filter = effective_results baseline_filter_path = eff_path - effective_results_path = eff_path only_failed_baseline_kind = "effective" elif args.only_failed: latest_results = _load_latest_results(artifacts_dir, args.tag) @@ -397,7 +421,7 @@ def handle_batch(args) -> int: print("No baseline found for --only-failed.", file=sys.stderr) return 2 - compare_path = args.compare_to + compare_path: Path | None = Path(args.compare_to) if args.compare_to else None if compare_path is None and args.only_failed and baseline_filter_path: compare_path = baseline_filter_path if compare_path: @@ -520,15 +544,15 @@ def handle_batch(args) -> int: print("0 missed cases selected.", file=sys.stderr) timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" - results_path = args.out or (run_folder / "results.jsonl") + run_folder = artifacts_dir / "runs" / f"{timestamp}_{cases_path.stem}" + results_path = Path(args.out) if args.out else (run_folder / "results.jsonl") artifacts_root = run_folder / "cases" results_path.parent.mkdir(parents=True, exist_ok=True) summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) - history_path = args.history or (args.data / ".runs" / "history.jsonl") + history_path = Path(args.history) if args.history else (data_dir / ".runs" / "history.jsonl") - log_dir = args.log_dir or args.data / ".runs" / "logs" + log_dir = Path(args.log_dir) if args.log_dir else data_dir / ".runs" / "logs" configure_logging( level=args.log_level, log_dir=log_dir, @@ -537,7 +561,7 @@ def handle_batch(args) -> int: run_id=None, ) - provider, _ = build_provider(args.data, args.schema, enable_semantic=args.enable_semantic) + provider, _ = build_provider(data_dir, schema_path, enable_semantic=args.enable_semantic) llm = build_llm(settings) runner = build_agent(llm, provider) events_path = None @@ -595,10 +619,10 @@ def handle_batch(args) -> int: write_results(results_path, results) counts = summarize(results) - diff_block: dict | None = None + diff_block: DiffReport | None = None baseline_path: Path | None = None if baseline_for_compare: - baseline_path = args.compare_to or baseline_filter_path + baseline_path = compare_path or baseline_filter_path diff = diff_runs( baseline_for_compare.values(), results, @@ -610,10 +634,10 @@ def handle_batch(args) -> int: diff_block = diff policy_bad = bad_statuses(args.fail_on, args.require_assert) - bad_count = sum(int(counts.get(status, 0) or 0) for status in policy_bad) + bad_count = sum(_coerce_int(counts.get(status)) for status in policy_bad) exit_code = 130 if interrupted else (1 if bad_count else 0) - ended_at = datetime.datetime.utcnow() + 
ended_at = datetime.datetime.now(datetime.timezone.utc) duration_ms = int((ended_at - started_at).total_seconds() * 1000) executed_results = {res.id: res for res in results} planned_total = len(selected_case_ids) @@ -623,8 +647,8 @@ def handle_batch(args) -> int: suite_missed_total = len(_missed_case_ids(suite_case_ids, executed_results)) summary = { "run_id": run_id, - "started_at": started_at.isoformat() + "Z", - "ended_at": ended_at.isoformat() + "Z", + "started_at": _isoformat_utc(started_at), + "ended_at": _isoformat_utc(ended_at), "duration_ms": duration_ms, "counts": counts, "summary_by_tag": counts.get("summary_by_tag"), @@ -676,7 +700,7 @@ def handle_batch(args) -> int: artifacts_dir=artifacts_dir, tag=args.tag, cases_hash=cases_hash, - cases_path=args.cases, + cases_path=cases_path, suite_case_ids=suite_case_ids, executed_results=results, run_folder=run_folder, @@ -701,24 +725,24 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) - config_hash = _hash_file(args.config) if args.config else None - schema_hash = _hash_file(args.schema) - data_fingerprint = _fingerprint_dir(args.data, verbose=args.fingerprint_verbose) + config_hash = _hash_file(config_path) if config_path else None + schema_hash = _hash_file(schema_path) + data_fingerprint = _fingerprint_dir(data_dir, verbose=args.fingerprint_verbose) git_sha = _git_sha() llm_settings = settings.llm run_meta = { "run_id": run_id, - "timestamp": started_at.isoformat() + "Z", + "timestamp": _isoformat_utc(started_at), "tag": args.tag, "note": args.note, "inputs": { - "cases_path": str(args.cases), + "cases_path": str(cases_path), "cases_hash": cases_hash, - "config_path": str(args.config) if args.config else None, + "config_path": str(config_path) if config_path else None, "config_hash": config_hash, - "schema_path": str(args.schema), + "schema_path": str(schema_path), "schema_hash": schema_hash, - "data_dir": str(args.data), + "data_dir": str(data_dir), }, "suite_case_ids": suite_case_ids, "selected_case_ids": selected_case_ids, @@ -763,7 +787,7 @@ def handle_batch(args) -> int: prate = _pass_rate(counts) history_entry = { "run_id": run_id, - "timestamp": started_at.isoformat() + "Z", + "timestamp": _isoformat_utc(started_at), "config_hash": config_hash, "schema_hash": schema_hash, "cases_hash": cases_hash, @@ -962,15 +986,17 @@ def _print_stats(entries: list[dict]) -> None: print(header) prev = None for entry in entries: - pass_rate = entry.get("pass_rate") - median = entry.get("median_total_s") + pass_rate = _coerce_number(entry.get("pass_rate")) + median = _coerce_number(entry.get("median_total_s")) delta_pass = None delta_median = None if prev: - if pass_rate is not None and prev.get("pass_rate") is not None: - delta_pass = pass_rate - prev.get("pass_rate") - if median is not None and prev.get("median_total_s") is not None: - delta_median = median - prev.get("median_total_s") + prev_pass_rate = _coerce_number(prev.get("pass_rate")) + if pass_rate is not None and prev_pass_rate is not None: + delta_pass = pass_rate - prev_pass_rate + prev_median = _coerce_number(prev.get("median_total_s")) + if median is not None and prev_median is not None: + delta_median = median - prev_median pr_display = f"{pass_rate*100:.1f}%" if pass_rate is not None else "n/a" median_display = f"{median:.2f}" if median is not None else "n/a" dp = f"{delta_pass*100:+.1f}pp" if delta_pass is not None else "n/a" @@ -986,12 +1012,12 @@ def _print_stats(entries: 
list[dict]) -> None: def handle_stats(args) -> int: - history_path: Optional[Path] = args.history + history_path: Path | None = args.history if history_path is None: if not args.data: print("Provide --data or --history to locate history.jsonl", file=sys.stderr) return 2 - history_path = args.data / ".runs" / "history.jsonl" + history_path = Path(args.data) / ".runs" / "history.jsonl" entries = _load_history(history_path) if args.group_by == "config_hash": grouped: dict[str, list[dict]] = {} @@ -1007,15 +1033,19 @@ def handle_stats(args) -> int: def handle_compare(args) -> int: - if not args.base.exists() or not args.new.exists(): + base_path = Path(args.base) + new_path = Path(args.new) + if not base_path.exists() or not new_path.exists(): print("Base or new results file not found.", file=sys.stderr) return 2 - comparison = compare_runs(args.base, args.new, fail_on=args.fail_on, require_assert=args.require_assert) - report = render_markdown(comparison, args.out) + comparison = compare_runs(base_path, new_path, fail_on=args.fail_on, require_assert=args.require_assert) + out_path = Path(args.out) if args.out is not None else None + report = render_markdown(comparison, out_path) print(report) if args.junit: - write_junit(comparison, args.junit) - print(f"JUnit written to {args.junit}") + junit_path = Path(args.junit) + write_junit(comparison, junit_path) + print(f"JUnit written to {junit_path}") return 0 diff --git a/examples/demo_qa/data_gen.py b/examples/demo_qa/data_gen.py index 8fc21dd..4732ee1 100644 --- a/examples/demo_qa/data_gen.py +++ b/examples/demo_qa/data_gen.py @@ -3,7 +3,7 @@ import json import random from dataclasses import asdict, dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Dict, List @@ -294,7 +294,8 @@ def generate_and_save(out_dir: Path, *, rows: int = 1000, seed: int | None = Non save_dataset(dataset, out_dir) schema = default_schema(enable_semantic=enable_semantic) save_schema(schema, out_dir / "schema.json") - meta = MetaInfo(seed=seed, rows=rows, created_at=datetime.utcnow().isoformat()) + created_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + meta = MetaInfo(seed=seed, rows=rows, created_at=created_at) write_meta(out_dir / "meta.json", meta) # Simple statistics diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 151c4df..0218acf 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -8,7 +8,7 @@ import uuid from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, Iterable, List, Mapping, TypedDict +from typing import Dict, Iterable, List, Mapping, NotRequired, TypedDict from fetchgraph.core import create_generic_agent from fetchgraph.core.models import TaskProfile @@ -599,14 +599,24 @@ def _median_duration(results: Mapping[str, RunResult]) -> float | None: return (durations[mid - 1] + durations[mid]) / 2000 +def _coerce_int(value: object | None) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, (int, float)): + return int(value) + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + def _count_bad_from_summary(counts: Mapping[str, object], fail_on: str, require_assert: bool) -> int: bad = bad_statuses(fail_on, require_assert) total = 0 for status in bad: - try: - total += int(counts.get(status, 0) or 0) - except Exception: - continue + total += _coerce_int(counts.get(status, 0)) return 
total @@ -751,7 +761,8 @@ def __init__(self, path: Path | None, run_id: str): def emit(self, event: Dict[str, object]) -> None: if not self.path: return - payload = {"timestamp": datetime.datetime.utcnow().isoformat() + "Z", "run_id": self.run_id, **event} + now = datetime.datetime.now(datetime.timezone.utc) + payload = {"timestamp": now.isoformat().replace("+00:00", "Z"), "run_id": self.run_id, **event} with self.path.open("a", encoding="utf-8") as f: f.write(json.dumps(payload, ensure_ascii=False) + "\n") @@ -781,6 +792,7 @@ class DiffReport(TypedDict): new_bad_total: int fail_on: str require_assert: bool + baseline_path: NotRequired[str] __all__ = [ diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py index 7f14267..860483c 100644 --- a/examples/demo_qa/runs/case_history.py +++ b/examples/demo_qa/runs/case_history.py @@ -37,7 +37,7 @@ def _append_case_history( history_dir = artifacts_dir / "runs" / "cases" history_dir.mkdir(parents=True, exist_ok=True) payload = { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), "run_id": run_id, "tag": tag, "note": note, diff --git a/examples/demo_qa/runs/effective.py b/examples/demo_qa/runs/effective.py index e7c2b3a..ca2457d 100644 --- a/examples/demo_qa/runs/effective.py +++ b/examples/demo_qa/runs/effective.py @@ -89,7 +89,7 @@ def _build_effective_diff( else: other_changed.append(entry) return { - "timestamp": datetime.datetime.utcnow().isoformat() + "Z", + "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), "tag": tag, "note": note, "run_id": run_id, @@ -180,7 +180,7 @@ def _update_effective_snapshot( "executed_total": executed_total, "missed_total": missed_total, "counts": summary_counts, - "updated_at": datetime.datetime.utcnow().isoformat() + "Z", + "updated_at": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), "built_from_runs": sorted(built_from), "effective_results_path": str(effective_results_path), "scope": scope, From 7c3f753ffbc3b7ea77b18711e1dbe5755813c196 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:18:45 +0300 Subject: [PATCH 59/92] Improve batch typing safety --- examples/demo_qa/batch.py | 44 +++++++++++++++++++++++-------------- tests/test_demo_qa_batch.py | 10 ++++++--- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 246be54..ded52c1 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -7,7 +7,7 @@ import sys import uuid from pathlib import Path -from typing import Mapping, Optional +from typing import Mapping, Optional, cast from .llm.factory import build_llm from .logging_config import configure_logging @@ -223,6 +223,15 @@ def compare_runs(base_path: Path, new_path: Path, *, fail_on: str, require_asser return diff_runs(base.values(), new.values(), fail_on=fail_on, require_assert=require_assert) +def _id_sort_key(row: Mapping[str, object]) -> str: + identifier = row.get("id") + if isinstance(identifier, str): + return identifier + if identifier is None: + return "" + return str(identifier) + + def render_markdown(compare: DiffReport, out_path: Optional[Path]) -> str: lines: list[str] = [] base_counts = compare["base_counts"] @@ -258,7 +267,7 @@ def table(title: str, rows: list[Mapping[str, object]]) -> None: return lines.append("| 
id | status | reason | artifacts |") lines.append("|---|---|---|---|") - for row in sorted(rows, key=lambda r: r.get("id", "")): + for row in sorted(rows, key=_id_sort_key): artifacts_val = row.get("artifacts", {}) artifacts = artifacts_val if isinstance(artifacts_val, Mapping) else {} links = ", ".join(f"[{k}]({v})" for k, v in sorted(artifacts.items())) @@ -290,7 +299,7 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: suite.set("failures", str(len(bad))) suite.set("errors", "0") - for row in sorted(bad, key=lambda r: r.get("id", "")): + for row in sorted(bad, key=_id_sort_key): tc = ET.SubElement(suite, "testcase", name=row["id"]) msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) @@ -299,7 +308,7 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) - for row in sorted(fixed, key=lambda r: r.get("id", "")): + for row in sorted(fixed, key=_id_sort_key): ET.SubElement(suite, "testcase", name=row["id"]) bad_ids = {row["id"] for row in bad} @@ -382,9 +391,10 @@ def handle_batch(args) -> int: ) scope_id = _scope_hash(scope) - baseline_filter_path = Path(args.only_failed_from) if args.only_failed_from else None + baseline_filter_path_arg = cast(Optional[Path], args.only_failed_from) + baseline_filter_path: Path | None = Path(baseline_filter_path_arg) if baseline_filter_path_arg else None only_failed_baseline_kind: str | None = None - if args.only_failed_from: + if baseline_filter_path_arg: only_failed_baseline_kind = "path" elif args.tag and args.only_failed: effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) @@ -415,7 +425,7 @@ def handle_batch(args) -> int: if candidate.exists(): baseline_filter_path = candidate only_failed_baseline_kind = "latest" - if baseline_filter_path and baseline_for_filter is None: + if baseline_filter_path is not None and baseline_for_filter is None: try: baseline_for_filter = load_results(baseline_filter_path) except Exception as exc: @@ -425,12 +435,13 @@ def handle_batch(args) -> int: print("No baseline found for --only-failed.", file=sys.stderr) return 2 - compare_path: Path | None = Path(args.compare_to) if args.compare_to else None + compare_to_arg = cast(Optional[Path], args.compare_to) + compare_path: Path | None = Path(compare_to_arg) if compare_to_arg else None if compare_path is None and args.only_failed and baseline_filter_path: compare_path = baseline_filter_path - if compare_path: + if compare_path is not None: try: - if baseline_filter_path and compare_path.resolve() == baseline_filter_path.resolve(): + if baseline_filter_path is not None and compare_path.resolve() == baseline_filter_path.resolve(): baseline_for_compare = baseline_for_filter else: baseline_for_compare = load_results(compare_path) @@ -466,8 +477,9 @@ def handle_batch(args) -> int: missed_baseline_run: Path | None = None only_missed_baseline_kind: str | None = None if args.only_missed: - if args.only_missed_from: - missed_baseline_path = args.only_missed_from + only_missed_from_arg = cast(Optional[Path], args.only_missed_from) + if only_missed_from_arg: + missed_baseline_path = only_missed_from_arg only_missed_baseline_kind = "path" try: missed_baseline_results = load_results(missed_baseline_path) @@ -503,15 +515,15 @@ def handle_batch(args) -> int: ) baseline_planned_ids = set(suite_case_ids) else: - missed_baseline_path = args.only_missed_from or 
_load_latest_results(artifacts_dir, args.tag) - if args.only_missed_from: + missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) + if only_missed_from_arg: only_missed_baseline_kind = "path" - elif missed_baseline_path: + elif missed_baseline_path is not None: only_missed_baseline_kind = "latest" missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) if missed_baseline_run is None: missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) - if missed_baseline_path: + if missed_baseline_path is not None: try: missed_baseline_results = load_results(missed_baseline_path) except Exception as exc: diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 5efb13d..cdde099 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -5,13 +5,14 @@ import os import time from pathlib import Path +from typing import cast import pytest from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers -from examples.demo_qa.runner import RunResult, diff_runs +from examples.demo_qa.runner import DiffReport, RunResult, diff_runs @pytest.mark.parametrize( @@ -27,7 +28,9 @@ def test_is_failure_matches_bad_statuses(fail_on: str, require_assert: bool) -> def test_render_markdown_uses_fail_policy() -> None: - compare = { + compare = cast( + DiffReport, + { "base_counts": {"ok": 0, "mismatch": 2, "error": 1, "failed": 0}, "new_counts": {"ok": 1, "mismatch": 0, "error": 0, "failed": 0}, "base_bad_total": 1, @@ -38,7 +41,8 @@ def test_render_markdown_uses_fail_policy() -> None: "fixed": [], "still_fail": [], "all_ids": [], - } + }, + ) report = render_markdown(compare, None) assert "- Base OK: 0, Bad: 1" in report assert "- New OK: 1, Bad: 0" in report From 832edad06f0bf6f9d46ce46c835328ecaa182413 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:31:04 +0300 Subject: [PATCH 60/92] Improve typed diff structures for demo QA reports --- examples/demo_qa/batch.py | 11 ++++----- examples/demo_qa/runner.py | 47 ++++++++++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index ded52c1..afc1f4f 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -290,9 +290,9 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: import xml.etree.ElementTree as ET suite = ET.Element("testsuite", name="demo_qa_compare") - bad = compare["new_fail"] + compare["still_fail"] - fixed = compare["fixed"] - all_ids_list = list(compare.get("all_ids", []) or []) + bad: list[DiffCaseChange] = compare["new_fail"] + compare["still_fail"] + fixed: list[DiffCaseChange] = compare["fixed"] + all_ids_list: list[str] = list(compare.get("all_ids", []) or []) all_ids = sorted(all_ids_list) cases_total = len(all_ids) suite.set("tests", str(cases_total)) @@ -301,10 +301,9 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: for row in sorted(bad, key=_id_sort_key): tc = ET.SubElement(suite, "testcase", name=row["id"]) - msg = row.get("reason", "") or f"{row.get('from')} → {row.get('to')}" + msg: str = row["reason"] or f"{row.get('from')} → {row.get('to')}" failure = ET.SubElement(tc, "failure", message=msg) - artifacts_val = row.get("artifacts", {}) - artifacts 
= artifacts_val if isinstance(artifacts_val, Mapping) else {} + artifacts = row.get("artifacts", {}) if artifacts: failure.text = "\n".join(f"{k}: {v}" for k, v in sorted(artifacts.items())) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 0218acf..03906c9 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -636,19 +636,24 @@ def diff_runs( def _is_bad(res: RunResult | None) -> bool: return bool(res and res.status in bad) - def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult | None) -> dict[str, object]: + def _entry(case_id: str, base_res: RunResult | None, new_res: RunResult | None) -> DiffCaseChange: + artifacts: dict[str, str] + if new_res is None: + artifacts = {} + else: + artifacts = _artifact_links(new_res) return { "id": case_id, "from": base_res.status if base_res else None, "to": new_res.status if new_res else "missing", "reason": _reason(new_res) if new_res else "missing in new results", - "artifacts": _artifact_links(new_res) if new_res else {}, + "artifacts": artifacts, } - new_fail: list[dict[str, object]] = [] - fixed: list[dict[str, object]] = [] - still_fail: list[dict[str, object]] = [] - changed_status: list[dict[str, str | None]] = [] + new_fail: list[DiffCaseChange] = [] + fixed: list[DiffCaseChange] = [] + still_fail: list[DiffCaseChange] = [] + changed_status: list[DiffStatusChange] = [] new_cases: list[str] = [] for case_id in all_ids: @@ -772,12 +777,34 @@ def for_case(self, case_id: str, path: Path | None = None) -> "EventLogger": return EventLogger(path, self.run_id) +DiffCaseChange = TypedDict( + "DiffCaseChange", + { + "id": str, + "from": str | None, + "to": str | None, + "reason": str, + "artifacts": Mapping[str, str], + }, +) + + +DiffStatusChange = TypedDict( + "DiffStatusChange", + { + "id": str, + "from": str | None, + "to": str | None, + }, +) + + class DiffReport(TypedDict): all_ids: list[str] - new_fail: list[dict[str, object]] - fixed: list[dict[str, object]] - still_fail: list[dict[str, object]] - changed_status: list[dict[str, str | None]] + new_fail: list[DiffCaseChange] + fixed: list[DiffCaseChange] + still_fail: list[DiffCaseChange] + changed_status: list[DiffStatusChange] new_cases: list[str] base_counts: Dict[str, object] new_counts: Dict[str, object] From c04903e45ffb598efa7010b943845327a44fffa1 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 27 Dec 2025 00:40:06 +0300 Subject: [PATCH 61/92] import fix --- examples/demo_qa/batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index afc1f4f..a3c7f27 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -15,6 +15,7 @@ from .runner import ( Case, DiffReport, + DiffCaseChange, EventLogger, RunResult, RunTimings, From 07cd3c03209b5bc992a2187844238f1eb24cca75 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 27 Dec 2025 00:54:01 +0300 Subject: [PATCH 62/92] Fix history case to honor artifacts dir --- examples/demo_qa/cli.py | 1 + examples/demo_qa/commands/history.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index c8ae01c..87d940c 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -153,6 +153,7 @@ def build_parser() -> argparse.ArgumentParser: case_hist = history_sub.add_parser("case", help="Show history for a case id") case_hist.add_argument("case_id") case_hist.add_argument("--data", 
type=Path, required=True, help="Data dir containing .runs") + case_hist.add_argument("--artifacts-dir", type=Path, default=None, help="Base artifacts dir (default: /.runs)") case_hist.add_argument("--tag", type=str, default=None, help="Filter by tag") case_hist.add_argument("--limit", type=int, default=20, help="Limit rows") diff --git a/examples/demo_qa/commands/history.py b/examples/demo_qa/commands/history.py index 376f90d..a60f7c1 100644 --- a/examples/demo_qa/commands/history.py +++ b/examples/demo_qa/commands/history.py @@ -1,10 +1,12 @@ from __future__ import annotations +from pathlib import Path + from ..runs.case_history import _load_case_history def handle_history_case(args) -> int: - artifacts_dir = args.data / ".runs" + artifacts_dir = Path(args.artifacts_dir) if args.artifacts_dir else args.data / ".runs" path = artifacts_dir / "runs" / "cases" / f"{args.case_id}.jsonl" entries = _load_case_history(path) if args.tag: From d3b2863697ae68ce4be884d3202ea2503973dfce Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 04:48:52 +0300 Subject: [PATCH 63/92] Handle non-string expectations in demo QA runner --- examples/demo_qa/runner.py | 29 +++++++++++++++++++---------- tests/test_demo_qa_runner.py | 13 +++++++++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/examples/demo_qa/runner.py b/examples/demo_qa/runner.py index 03906c9..0a8029b 100644 --- a/examples/demo_qa/runner.py +++ b/examples/demo_qa/runner.py @@ -204,25 +204,34 @@ def save_status(result: RunResult) -> None: _save_json(status_path, result.to_json()) +def _stringify(value: object | None) -> str | None: + if value is None: + return None + return str(value) + + def _match_expected(case: Case, answer: str | None) -> ExpectedCheck | None: if not case.has_asserts: return None - expected_value = case.expected or case.expected_regex or case.expected_contains or "" + expected_value = _stringify(case.expected) or _stringify(case.expected_regex) or _stringify(case.expected_contains) or "" if answer is None: return ExpectedCheck(mode="none", expected=expected_value, passed=False, detail="no answer") if case.expected is not None: - passed = answer.strip() == case.expected.strip() - detail = None if passed else f"expected={case.expected!r}, got={answer!r}" - return ExpectedCheck(mode="exact", expected=case.expected, passed=passed, detail=detail) + expected_str = _stringify(case.expected) or "" + passed = answer.strip() == expected_str.strip() + detail = None if passed else f"expected={expected_str!r}, got={answer!r}" + return ExpectedCheck(mode="exact", expected=expected_str, passed=passed, detail=detail) if case.expected_regex is not None: - pattern = re.compile(case.expected_regex) + expected_regex = _stringify(case.expected_regex) or "" + pattern = re.compile(expected_regex) passed = bool(pattern.search(answer)) - detail = None if passed else f"regex {case.expected_regex!r} not found" - return ExpectedCheck(mode="regex", expected=case.expected_regex, passed=passed, detail=detail) + detail = None if passed else f"regex {expected_regex!r} not found" + return ExpectedCheck(mode="regex", expected=expected_regex, passed=passed, detail=detail) if case.expected_contains is not None: - passed = case.expected_contains in answer - detail = None if passed else f"expected to contain {case.expected_contains!r}" - return ExpectedCheck(mode="contains", expected=case.expected_contains, passed=passed, detail=detail) + expected_contains = 
_stringify(case.expected_contains) or "" + passed = expected_contains in answer + detail = None if passed else f"expected to contain {expected_contains!r}" + return ExpectedCheck(mode="contains", expected=expected_contains, passed=passed, detail=detail) return None diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 31773cd..0888dfe 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -8,6 +8,19 @@ def test_match_expected_unchecked_when_no_expectations() -> None: assert _match_expected(case, "anything") is None +def test_match_expected_coerces_non_string_expected_values() -> None: + case = Case(id="c1", question="What is foo?", expected=42) + + mismatch = _match_expected(case, "43") + assert mismatch is not None + assert mismatch.passed is False + assert "expected='42'" in (mismatch.detail or "") + + match = _match_expected(case, "42") + assert match is not None + assert match.passed is True + + def test_match_expected_contains_pass_and_fail() -> None: case = Case(id="c2", question="Q", expected_contains="bar") From e4b697e3159f4d907ae9652082a4905d11841015 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 05:23:16 +0300 Subject: [PATCH 64/92] Default batch history to artifacts directory --- examples/demo_qa/batch.py | 2 +- examples/demo_qa/cli.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a3c7f27..bc3cf66 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -566,7 +566,7 @@ def handle_batch(args) -> int: results_path.parent.mkdir(parents=True, exist_ok=True) summary_path = results_path.with_name("summary.json") artifacts_dir.mkdir(parents=True, exist_ok=True) - history_path = Path(args.history) if args.history else (data_dir / ".runs" / "history.jsonl") + history_path = Path(args.history) if args.history else (artifacts_dir / "history.jsonl") log_dir = Path(args.log_dir) if args.log_dir else data_dir / ".runs" / "logs" configure_logging( diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 87d940c..78cc697 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -95,7 +95,7 @@ def build_parser() -> argparse.ArgumentParser: batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") batch_p.add_argument("--show-artifacts", action="store_true", help="Show artifact paths for failures") - batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /.runs/history.jsonl)") + batch_p.add_argument("--history", type=Path, default=None, help="Path to history.jsonl (default: /history.jsonl)") batch_p.add_argument("--include-tags", type=str, default=None, help="Comma-separated tags to include") batch_p.add_argument("--exclude-tags", type=str, default=None, help="Comma-separated tags to exclude") batch_p.add_argument("--include-ids", type=Path, default=None, help="Path to file with ids to include (one per line)") From 4f4aac0f706351fa66762e7550bb6dbaafed5db7 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 05:39:12 +0300 Subject: [PATCH 65/92] Update quick-start aliases for schema.json, allow default model usage --- README_demo_qa.md | 2 +- examples/demo_qa/settings.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README_demo_qa.md 
b/README_demo_qa.md index 23cea09..8a8cc07 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -106,7 +106,7 @@ python -m examples.demo_qa.cli batch \ ```bash # 1) Настройте свои дефолты под проект/датасет export DQ_DATA="./_demo_data/shop" -export DQ_SCHEMA="$DQ_DATA/schema.yaml" +export DQ_SCHEMA="$DQ_DATA/schema.json" export DQ_CASES="./examples/demo_qa/cases/retail_cases.json" export DQ_OUT="$DQ_DATA/.runs/results.jsonl" export DQ_TAG="retail-iter1" diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 064e179..be3f03a 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -20,8 +20,8 @@ class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) model: str | None = None - plan_model: str = "gpt-4o-mini" - synth_model: str = "gpt-4o-mini" + plan_model: str = "default" + synth_model: str = "default" plan_temperature: float = 0.0 synth_temperature: float = 0.2 timeout_s: float | None = None From b565b7a6bb281f5efcd27b67c90661c6f4f27669 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 06:08:17 +0300 Subject: [PATCH 66/92] Handle baseline overlay selection for partial runs --- examples/demo_qa/batch.py | 248 +++++++++++++++++++------------- examples/demo_qa/cli.py | 5 + examples/demo_qa/runs/layout.py | 124 +++++++++++----- tests/test_demo_qa_batch.py | 66 +++++++-- 4 files changed, 302 insertions(+), 141 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index bc3cf66..26e07ee 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -7,7 +7,7 @@ import sys import uuid from pathlib import Path -from typing import Mapping, Optional, cast +from typing import Iterable, Mapping, Optional, cast from .llm.factory import build_llm from .logging_config import configure_logging @@ -41,8 +41,10 @@ from .runs.io import write_results from .runs.layout import ( _load_latest_results, + _load_latest_any_results, _load_latest_run, _load_run_meta, + _resolve_results_path_for_run, _run_dir_from_results_path, _update_latest_markers, ) @@ -116,6 +118,44 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: return ids +def _only_failed_selection( + baseline_results: Mapping[str, RunResult] | None, + overlay_results: Mapping[str, RunResult] | None, + *, + fail_on: str, + require_assert: bool, +) -> tuple[set[str], dict[str, object]]: + baseline = baseline_results or {} + overlay = overlay_results or {} + bad = bad_statuses(fail_on, require_assert) + baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} + overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} + overlay_good = {cid for cid, res in overlay.items() if res.status not in bad} + + healed = baseline_bad & overlay_good + selection = (baseline_bad - healed) | overlay_bad + breakdown = { + "baseline_failures": baseline_bad, + "healed": healed, + "new_failures": overlay_bad, + } + return selection, breakdown + + +def _only_missed_selection( + selected_case_ids: Iterable[str], + baseline_results: Mapping[str, RunResult] | None, + overlay_results: Mapping[str, RunResult] | None, +) -> tuple[set[str], dict[str, object]]: + selected = set(selected_case_ids) + baseline_ids = set(baseline_results.keys()) if baseline_results else set() + overlay_executed = set(overlay_results.keys()) if overlay_results else set() + missed_base = selected - baseline_ids + missed_final = missed_base - 
overlay_executed + breakdown = {"missed_base": missed_base, "overlay_executed": overlay_executed} + return missed_final, breakdown + + def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: entries: list[dict] = [] total_bytes = 0 @@ -170,7 +210,7 @@ def _find_case_artifact(run_path: Path, case_id: str) -> Optional[Path]: def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: if path is not None: return path - return _load_latest_run(artifacts_dir) + return _load_latest_run(artifacts_dir, kind="any") def handle_chat(args) -> int: @@ -322,10 +362,7 @@ def write_junit(compare: DiffReport, out_path: Path) -> None: def _select_cases_for_rerun( cases: list[Case], - baseline_for_filter: Optional[Mapping[str, RunResult]], *, - require_assert: bool, - fail_on: str, include_tags: set[str] | None, exclude_tags: set[str] | None, include_ids: set[str] | None, @@ -343,12 +380,7 @@ def _select_cases_for_rerun( if exclude_ids and case.id in exclude_ids: continue filtered.append(case) - if not baseline_for_filter: - return filtered - target_ids = { - case_id for case_id, res in baseline_for_filter.items() if res.status in bad_statuses(fail_on, require_assert) - } - return [case for case in filtered if case.id in target_ids] + return filtered def handle_batch(args) -> int: @@ -373,8 +405,14 @@ def handle_batch(args) -> int: print(f"Cases error: {exc}", file=sys.stderr) return 2 - baseline_for_filter: Optional[Mapping[str, RunResult]] = None baseline_for_compare: Optional[Mapping[str, RunResult]] = None + failed_baseline_results: Optional[Mapping[str, RunResult]] = None + failed_baseline_path: Path | None = None + missed_baseline_results: Optional[Mapping[str, RunResult]] = None + missed_baseline_path: Path | None = None + overlay_results: Optional[Mapping[str, RunResult]] = None + overlay_results_path: Path | None = None + overlay_run_path: Path | None = None artifacts_dir = Path(args.artifacts_dir) if args.artifacts_dir else data_dir / ".runs" @@ -410,28 +448,20 @@ def handle_batch(args) -> int: if effective_meta and effective_meta.get("scope_hash") not in (None, scope_id): print("Effective results scope does not match current selection; refusing to merge.", file=sys.stderr) return 2 - baseline_for_filter = effective_results + failed_baseline_results = effective_results baseline_filter_path = eff_path only_failed_baseline_kind = "effective" elif args.only_failed: - latest_results = _load_latest_results(artifacts_dir, args.tag) - if latest_results: - baseline_filter_path = latest_results - only_failed_baseline_kind = "latest" - else: - latest_run = _load_latest_run(artifacts_dir, args.tag) - if latest_run: - candidate = latest_run / "results.jsonl" - if candidate.exists(): - baseline_filter_path = candidate - only_failed_baseline_kind = "latest" - if baseline_filter_path is not None and baseline_for_filter is None: + baseline_filter_path = _load_latest_results(artifacts_dir, args.tag) + if baseline_filter_path: + only_failed_baseline_kind = "latest_complete" + if baseline_filter_path is not None and failed_baseline_results is None: try: - baseline_for_filter = load_results(baseline_filter_path) + failed_baseline_results = load_results(baseline_filter_path) except Exception as exc: print(f"Failed to read baseline for --only-failed-from: {exc}", file=sys.stderr) return 2 - if args.only_failed and baseline_for_filter is None: + if args.only_failed and failed_baseline_results is None: print("No baseline found for --only-failed.", file=sys.stderr) 
return 2 @@ -442,50 +472,61 @@ def handle_batch(args) -> int: if compare_path is not None: try: if baseline_filter_path is not None and compare_path.resolve() == baseline_filter_path.resolve(): - baseline_for_compare = baseline_for_filter + baseline_for_compare = failed_baseline_results else: baseline_for_compare = load_results(compare_path) except Exception as exc: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 + overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") + overlay_results_path = _resolve_results_path_for_run(overlay_run_path) or _load_latest_any_results( + artifacts_dir, args.tag + ) + if overlay_results_path and not args.no_overlay: + try: + overlay_results = load_results(overlay_results_path) + except Exception as exc: + print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) + overlay_results_path = None + overlay_results = None + filtered_cases = _select_cases_for_rerun( cases, - None, - require_assert=args.require_assert, - fail_on=args.fail_on, include_tags=include_tags, exclude_tags=exclude_tags, include_ids=include_ids, exclude_ids=exclude_ids, ) suite_case_ids = [case.id for case in filtered_cases] - cases = _select_cases_for_rerun( - filtered_cases, - baseline_for_filter, - require_assert=args.require_assert, - fail_on=args.fail_on, - include_tags=include_tags, - exclude_tags=exclude_tags, - include_ids=include_ids, - exclude_ids=exclude_ids, - ) + cases = filtered_cases + + if args.only_failed: + selection_ids, breakdown = _only_failed_selection( + failed_baseline_results, + overlay_results if not args.no_overlay else None, + fail_on=args.fail_on, + require_assert=args.require_assert, + ) + cases = [case for case in cases if case.id in selection_ids] + healed = breakdown.get("healed", set()) + baseline_fails = breakdown.get("baseline_failures", set()) + new_failures = breakdown.get("new_failures", set()) + baseline_label = str(_run_dir_from_results_path(baseline_filter_path) or baseline_filter_path or "n/a") + overlay_label = str(overlay_run_path or overlay_results_path or "n/a") + print(f"Baseline: {baseline_label}", file=sys.stderr) + print(f"Overlay: {overlay_label}", file=sys.stderr) + print(f"Baseline failures: {len(baseline_fails)}", file=sys.stderr) + print(f"Healed by overlay: {len(healed)}", file=sys.stderr) + print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) + print(f"Final only-failed selection: {len(selection_ids)}", file=sys.stderr) - baseline_planned_ids: set[str] | None = None - missed_baseline_results: Optional[Mapping[str, RunResult]] = None - missed_baseline_path: Path | None = None - missed_baseline_run: Path | None = None only_missed_baseline_kind: str | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) if only_missed_from_arg: missed_baseline_path = only_missed_from_arg only_missed_baseline_kind = "path" - try: - missed_baseline_results = load_results(missed_baseline_path) - except Exception as exc: - print(f"Failed to read baseline for --only-missed-from: {exc}", file=sys.stderr) - return 2 elif args.tag: effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) if not effective_results: @@ -503,62 +544,37 @@ def handle_batch(args) -> int: missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" - baseline_planned_ids = ( - {str(cid) for cid in effective_meta.get("planned_case_ids", [])} - if 
isinstance(effective_meta, dict) - else None - ) - if not baseline_planned_ids: - print( - "Effective results missing planned_case_ids; computing missed relative to current filtered cases.", - file=sys.stderr, - ) - baseline_planned_ids = set(suite_case_ids) else: missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) if only_missed_from_arg: only_missed_baseline_kind = "path" elif missed_baseline_path is not None: - only_missed_baseline_kind = "latest" - missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) - if missed_baseline_run is None: - missed_baseline_run = _load_latest_run(artifacts_dir, args.tag) - if missed_baseline_path is not None: - try: - missed_baseline_results = load_results(missed_baseline_path) - except Exception as exc: - print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) - return 2 - else: - print( - "No baseline found for --only-missed. Provide --only-missed-from or run a tagged batch first.", - file=sys.stderr, - ) + only_missed_baseline_kind = "latest_complete" + if missed_baseline_path is not None and missed_baseline_results is None: + try: + missed_baseline_results = load_results(missed_baseline_path) + except Exception as exc: + print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 - baseline_meta = _load_run_meta(missed_baseline_run) - if isinstance(baseline_meta, dict): - planned_from_meta = baseline_meta.get("planned_case_ids") - if isinstance(planned_from_meta, list): - baseline_planned_ids = {str(cid) for cid in planned_from_meta} - else: - print( - "Baseline run meta missing planned_case_ids; computing missed relative to current filtered cases.", - file=sys.stderr, - ) - baseline_planned_ids = set(suite_case_ids) if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - - selected_case_ids = [case.id for case in cases] - if args.only_missed: - planned_pool = baseline_planned_ids or set(selected_case_ids) - missed_ids = _missed_case_ids(planned_pool, missed_baseline_results) - cases = [case for case in cases if case.id in missed_ids] selected_case_ids = [case.id for case in cases] + missed_ids, missed_breakdown = _only_missed_selection( + selected_case_ids, + missed_baseline_results, + overlay_results if not args.no_overlay else None, + ) + cases = [case for case in cases if case.id in missed_ids] + print(f"Baseline (missed) results: {missed_baseline_path}", file=sys.stderr) + print(f"Overlay executed: {len(missed_breakdown.get('overlay_executed', set()))}", file=sys.stderr) + print(f"Missed in baseline: {len(missed_breakdown.get('missed_base', set()))}", file=sys.stderr) + print(f"Final only-missed selection: {len(missed_ids)}", file=sys.stderr) if not cases: print("0 missed cases selected.", file=sys.stderr) + selected_case_ids = [case.id for case in cases] + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") run_folder = artifacts_dir / "runs" / f"{timestamp}_{cases_path.stem}" results_path = Path(args.out) if args.out else (run_folder / "results.jsonl") @@ -661,6 +677,15 @@ def handle_batch(args) -> int: missed_total = len(_missed_case_ids(selected_case_ids, executed_results)) suite_planned_total = len(suite_case_ids) suite_missed_total = len(_missed_case_ids(suite_case_ids, executed_results)) + results_complete = (planned_total == executed_total) and not interrupted + if interrupted: + run_status = "INTERRUPTED" + elif not results_complete: + run_status = 
"ERROR" + elif bad_count: + run_status = "FAILED" + else: + run_status = "SUCCESS" summary = { "run_id": run_id, "started_at": _isoformat_utc(started_at), @@ -681,6 +706,10 @@ def handle_batch(args) -> int: "interrupted_at_case_id": interrupted_at_case_id, "tag": args.tag, "note": args.note, + "run_status": run_status, + "results_complete": results_complete, + "total_selected": planned_total, + "total_executed": executed_total, } if diff_block: summary["diff"] = diff_block @@ -704,10 +733,12 @@ def handle_batch(args) -> int: "planned_total": planned_total, "executed_total": executed_total, "missed_total": missed_total, + "run_status": run_status, + "results_complete": results_complete, } ) - _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag) + _update_latest_markers(run_folder, results_path, artifacts_dir, args.tag, results_complete=results_complete) effective_path = None effective_meta_path = None if args.tag: @@ -763,6 +794,12 @@ def handle_batch(args) -> int: "suite_case_ids": suite_case_ids, "selected_case_ids": selected_case_ids, "planned_total": planned_total, + "executed_total": executed_total, + "run_status": run_status, + "results_complete": results_complete, + "exit_code": exit_code, + "total_selected": planned_total, + "total_executed": executed_total, "selected_filters": { "include_tags": sorted(include_tags) if include_tags else None, "exclude_tags": sorted(exclude_tags) if exclude_tags else None, @@ -774,6 +811,7 @@ def handle_batch(args) -> int: "only_missed": args.only_missed, "only_missed_from": str(missed_baseline_path) if missed_baseline_path else None, "only_missed_baseline_kind": only_missed_baseline_kind, + "overlay_results_path": str(overlay_results_path) if overlay_results_path else None, "baseline_tag": args.tag, "effective_path": str(effective_path) if effective_path else None, "scope_hash": scope_id, @@ -781,6 +819,7 @@ def handle_batch(args) -> int: "plan_only": args.plan_only, "fail_fast": args.fail_fast, "max_fails": args.max_fails, + "no_overlay": args.no_overlay, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -826,12 +865,17 @@ def handle_batch(args) -> int: "fail_count": bad_count, "planned_total": planned_total, "executed_total": executed_total, + "total_selected": planned_total, + "total_executed": executed_total, "missed_total": missed_total, "suite_planned_total": suite_planned_total, "suite_missed_total": suite_missed_total, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, "scope_hash": scope_id, + "run_status": run_status, + "results_complete": results_complete, + "exit_code": exit_code, } for res in results: _append_case_history( @@ -935,6 +979,10 @@ def handle_case_run(args) -> int: result = run_one(cases[args.case_id], runner, artifacts_root, plan_only=args.plan_only) write_results(results_path, [result]) counts = summarize([result]) + bad = bad_statuses("bad", False) + bad_count = sum(_coerce_int(counts.get(status)) for status in bad) + run_status = "FAILED" if bad_count else "SUCCESS" + exit_code = 1 if bad_count else 0 summary = { "run_id": run_folder.name, "timestamp": timestamp + "Z", @@ -942,17 +990,19 @@ def handle_case_run(args) -> int: "results_path": str(results_path), "fail_on": "bad", "require_assert": False, + "run_status": run_status, + "results_complete": True, + "total_selected": 1, + "total_executed": 1, + "exit_code": exit_code, } summary_path = write_summary(results_path, summary) - save_dir = run_folder.parent - save_dir.mkdir(parents=True, 
exist_ok=True) - (save_dir / "latest.txt").write_text(str(run_folder), encoding="utf-8") - (save_dir / "latest_results.txt").write_text(str(results_path), encoding="utf-8") + _update_latest_markers(run_folder, results_path, artifacts_dir, None, results_complete=True) print(format_status_line(result)) print(f"Artifacts: {result.artifacts_dir}") print(f"Summary: {summary_path}") - return 0 + return exit_code def handle_case_open(args) -> int: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 78cc697..6bcab36 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -91,6 +91,11 @@ def build_parser() -> argparse.ArgumentParser: help="Run only cases that failed/mismatched/errored in a previous results.jsonl", ) batch_p.add_argument("--only-failed", action="store_true", help="Use latest run for --only-failed-from automatically") + batch_p.add_argument( + "--no-overlay", + action="store_true", + help="Ignore latest partial run when selecting only-failed/only-missed (use baseline only)", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index 704f643..8c0d61e 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -2,7 +2,14 @@ import json from pathlib import Path -from typing import Optional +from typing import NamedTuple, Optional + + +class LatestMarkers(NamedTuple): + complete: Path + results: Path + any_run: Path + legacy_run: Path def _sanitize_tag(tag: str) -> str: @@ -15,43 +22,84 @@ def _effective_paths(artifacts_dir: Path, tag: str) -> tuple[Path, Path]: return base / "effective_results.jsonl", base / "effective_meta.json" -def _latest_markers(artifacts_dir: Path, tag: str | None) -> tuple[Path, Path]: +def _latest_markers(artifacts_dir: Path, tag: str | None) -> LatestMarkers: runs_dir = artifacts_dir / "runs" if tag: slug = _sanitize_tag(tag) - return runs_dir / f"tag-latest-{slug}.txt", runs_dir / f"tag-latest-results-{slug}.txt" - return runs_dir / "latest.txt", runs_dir / "latest_results.txt" - - -def _load_latest_run(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - latest_file, _ = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() + return LatestMarkers( + runs_dir / f"tag-latest-complete-{slug}.txt", + runs_dir / f"tag-latest-results-{slug}.txt", + runs_dir / f"tag-latest-any-{slug}.txt", + runs_dir / f"tag-latest-{slug}.txt", + ) + return LatestMarkers( + runs_dir / "latest_complete.txt", + runs_dir / "latest_results.txt", + runs_dir / "latest_any.txt", + runs_dir / "latest.txt", + ) + + +def _read_marker(path: Path) -> Optional[Path]: + if path.exists(): + content = path.read_text(encoding="utf-8").strip() if content: return Path(content) return None -def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: - _, latest_file = _latest_markers(artifacts_dir, tag) - if latest_file.exists(): - content = latest_file.read_text(encoding="utf-8").strip() - if content: - return Path(content) - latest_run = _load_latest_run(artifacts_dir, tag) - if latest_run: - summary_path = latest_run / "summary.json" - if summary_path.exists(): - try: - summary = 
json.loads(summary_path.read_text(encoding="utf-8")) - results_path = summary.get("results_path") - if results_path: - return Path(results_path) - except Exception: - pass +def _load_latest_run(artifacts_dir: Path, tag: str | None = None, *, kind: str = "complete") -> Optional[Path]: + markers = _latest_markers(artifacts_dir, tag) + candidates: list[Path] = [] + if kind == "any": + candidates.append(markers.any_run) + candidates.append(markers.complete) + candidates.append(markers.legacy_run) + for marker in candidates: + resolved = _read_marker(marker) + if resolved: + return resolved return None +def _resolve_results_path_for_run(run_path: Path | None) -> Optional[Path]: + if run_path is None: + return None + summary_path = run_path / "summary.json" + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + results_path = summary.get("results_path") + if results_path: + return Path(results_path) + except Exception: + pass + candidate = run_path / "results.jsonl" + if candidate.exists(): + return candidate + return None + + +def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + markers = _latest_markers(artifacts_dir, tag) + resolved = _read_marker(markers.results) + if resolved: + return resolved + latest_run = _load_latest_run(artifacts_dir, tag, kind="complete") + return _resolve_results_path_for_run(latest_run) + + +def _load_latest_any_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: + latest_run = _load_latest_run(artifacts_dir, tag, kind="any") + if latest_run is None: + return None + results = _resolve_results_path_for_run(latest_run) + if results: + return results + markers = _latest_markers(artifacts_dir, tag) + return _read_marker(markers.results) + + def _load_run_meta(run_path: Path | None) -> Optional[dict]: if run_path is None: return None @@ -80,22 +128,30 @@ def _run_dir_from_results_path(results_path: Path | None) -> Optional[Path]: return run_dir -def _update_latest_markers(run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None) -> None: - marker_pairs = {_latest_markers(artifacts_dir, None)} +def _update_latest_markers( + run_folder: Path, results_path: Path, artifacts_dir: Path, tag: str | None, *, results_complete: bool +) -> None: + marker_sets = {_latest_markers(artifacts_dir, None)} if tag: - marker_pairs.add(_latest_markers(artifacts_dir, tag)) - for latest_path, latest_results_path in marker_pairs: - latest_path.parent.mkdir(parents=True, exist_ok=True) - latest_path.write_text(str(run_folder), encoding="utf-8") - latest_results_path.write_text(str(results_path), encoding="utf-8") + marker_sets.add(_latest_markers(artifacts_dir, tag)) + for markers in marker_sets: + markers.complete.parent.mkdir(parents=True, exist_ok=True) + markers.any_run.write_text(str(run_folder), encoding="utf-8") + markers.legacy_run.write_text(str(run_folder), encoding="utf-8") + if results_complete: + markers.complete.write_text(str(run_folder), encoding="utf-8") + markers.results.write_text(str(results_path), encoding="utf-8") __all__ = [ + "LatestMarkers", "_effective_paths", + "_load_latest_any_results", "_latest_markers", "_load_latest_results", "_load_latest_run", "_load_run_meta", + "_resolve_results_path_for_run", "_run_dir_from_results_path", "_sanitize_tag", "_update_latest_markers", diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index cdde099..6226fc4 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -9,7 
+9,15 @@ import pytest -from examples.demo_qa.batch import _fingerprint_dir, bad_statuses, is_failure, render_markdown, write_results +from examples.demo_qa.batch import ( + _fingerprint_dir, + _only_failed_selection, + _only_missed_selection, + bad_statuses, + is_failure, + render_markdown, + write_results, +) from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers from examples.demo_qa.runner import DiffReport, RunResult, diff_runs @@ -106,6 +114,29 @@ def test_missed_case_ids_diff_planned_and_executed() -> None: assert _missed_case_ids(planned, executed) == {"a", "c"} +def test_only_failed_selection_uses_overlay_and_baseline() -> None: + baseline = {"a": _mk_result("a", "failed"), "b": _mk_result("b", "failed")} + overlay = {"a": _mk_result("a", "ok"), "c": _mk_result("c", "failed")} + + selection, breakdown = _only_failed_selection(baseline, overlay, fail_on="bad", require_assert=False) + + assert selection == {"b", "c"} + assert breakdown["healed"] == {"a"} + assert breakdown["baseline_failures"] == {"a", "b"} + assert breakdown["new_failures"] == {"c"} + + +def test_only_missed_selection_uses_overlay_executed() -> None: + baseline = {"a": _mk_result("a", "ok")} + overlay = {"c": _mk_result("c", "ok")} + + missed, breakdown = _only_missed_selection(["a", "b", "c"], baseline, overlay) + + assert missed == {"b"} + assert breakdown["missed_base"] == {"b", "c"} + assert breakdown["overlay_executed"] == {"c"} + + def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: artifacts_dir = tmp_path / "data" / ".runs" run_dir = artifacts_dir / "runs" / "20240101_cases" @@ -113,12 +144,31 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: run_dir.mkdir(parents=True) results_path.write_text("{}", encoding="utf-8") - _update_latest_markers(run_dir, results_path, artifacts_dir, "feature/beta") + _update_latest_markers(run_dir, results_path, artifacts_dir, "feature/beta", results_complete=True) + + latest_default = _latest_markers(artifacts_dir, None) + assert latest_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_default.results.read_text(encoding="utf-8").strip() == str(results_path) + assert latest_default.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + + latest_tag = _latest_markers(artifacts_dir, "feature/beta") + assert latest_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_tag.results.read_text(encoding="utf-8").strip() == str(results_path) + assert latest_tag.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + + partial_dir = artifacts_dir / "runs" / "20240102_cases" + partial_results = partial_dir / "results.jsonl" + partial_dir.mkdir(parents=True) + partial_results.write_text("{}", encoding="utf-8") + + _update_latest_markers(partial_dir, partial_results, artifacts_dir, "feature/beta", results_complete=False) - latest_default, latest_results_default = _latest_markers(artifacts_dir, None) - assert latest_default.read_text(encoding="utf-8").strip() == str(run_dir) - assert latest_results_default.read_text(encoding="utf-8").strip() == str(results_path) + refreshed_default = _latest_markers(artifacts_dir, None) + assert refreshed_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert refreshed_default.results.read_text(encoding="utf-8").strip() == str(results_path) + assert refreshed_default.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) - 
latest_tag, latest_results_tag = _latest_markers(artifacts_dir, "feature/beta") - assert latest_tag.read_text(encoding="utf-8").strip() == str(run_dir) - assert latest_results_tag.read_text(encoding="utf-8").strip() == str(results_path) + refreshed_tag = _latest_markers(artifacts_dir, "feature/beta") + assert refreshed_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) + assert refreshed_tag.results.read_text(encoding="utf-8").strip() == str(results_path) + assert refreshed_tag.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) From 958f2ae7ad17de1ff65985e3e68c95ac8feda5bd Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:23 +0300 Subject: [PATCH 67/92] Prevent partial runs from becoming latest baseline --- examples/demo_qa/runs/layout.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index 8c0d61e..c527d04 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -62,7 +62,7 @@ def _load_latest_run(artifacts_dir: Path, tag: str | None = None, *, kind: str = return None -def _resolve_results_path_for_run(run_path: Path | None) -> Optional[Path]: +def _resolve_results_path_for_run(run_path: Path | None, *, require_complete: bool = False) -> Optional[Path]: if run_path is None: return None summary_path = run_path / "summary.json" @@ -70,12 +70,14 @@ def _resolve_results_path_for_run(run_path: Path | None) -> Optional[Path]: try: summary = json.loads(summary_path.read_text(encoding="utf-8")) results_path = summary.get("results_path") + if require_complete and summary.get("results_complete") is False: + return None if results_path: return Path(results_path) except Exception: pass candidate = run_path / "results.jsonl" - if candidate.exists(): + if candidate.exists() and not require_complete: return candidate return None @@ -86,7 +88,7 @@ def _load_latest_results(artifacts_dir: Path, tag: str | None = None) -> Optiona if resolved: return resolved latest_run = _load_latest_run(artifacts_dir, tag, kind="complete") - return _resolve_results_path_for_run(latest_run) + return _resolve_results_path_for_run(latest_run, require_complete=True) def _load_latest_any_results(artifacts_dir: Path, tag: str | None = None) -> Optional[Path]: @@ -137,8 +139,8 @@ def _update_latest_markers( for markers in marker_sets: markers.complete.parent.mkdir(parents=True, exist_ok=True) markers.any_run.write_text(str(run_folder), encoding="utf-8") - markers.legacy_run.write_text(str(run_folder), encoding="utf-8") if results_complete: + markers.legacy_run.write_text(str(run_folder), encoding="utf-8") markers.complete.write_text(str(run_folder), encoding="utf-8") markers.results.write_text(str(results_path), encoding="utf-8") From 2995f016fcd639110d3d5a63abfe42beb1e74276 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:27 +0300 Subject: [PATCH 68/92] Fix overlay lookup and only-missed base selection --- examples/demo_qa/batch.py | 15 +++++++++++++-- examples/demo_qa/runs/layout.py | 3 +-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 26e07ee..494881e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -499,7 +499,9 @@ def handle_batch(args) -> int: exclude_ids=exclude_ids, ) suite_case_ids = [case.id for case in 
filtered_cases] + filtered_case_lookup = {case.id: case for case in filtered_cases} cases = filtered_cases + failed_selection_ids: set[str] | None = None if args.only_failed: selection_ids, breakdown = _only_failed_selection( @@ -509,6 +511,7 @@ def handle_batch(args) -> int: require_assert=args.require_assert, ) cases = [case for case in cases if case.id in selection_ids] + failed_selection_ids = selection_ids healed = breakdown.get("healed", set()) baseline_fails = breakdown.get("baseline_failures", set()) new_failures = breakdown.get("new_failures", set()) @@ -559,13 +562,21 @@ def handle_batch(args) -> int: if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - selected_case_ids = [case.id for case in cases] + selected_case_ids = suite_case_ids missed_ids, missed_breakdown = _only_missed_selection( selected_case_ids, missed_baseline_results, overlay_results if not args.no_overlay else None, ) - cases = [case for case in cases if case.id in missed_ids] + base_pool = filtered_case_lookup + target_ids = missed_ids + if args.only_failed and failed_selection_ids is not None: + target_ids = target_ids & failed_selection_ids + print( + f"Combining --only-failed and --only-missed via intersection: {len(target_ids)} cases remain.", + file=sys.stderr, + ) + cases = [case for cid, case in base_pool.items() if cid in target_ids] print(f"Baseline (missed) results: {missed_baseline_path}", file=sys.stderr) print(f"Overlay executed: {len(missed_breakdown.get('overlay_executed', set()))}", file=sys.stderr) print(f"Missed in baseline: {len(missed_breakdown.get('missed_base', set()))}", file=sys.stderr) diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index c527d04..022fc08 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -98,8 +98,7 @@ def _load_latest_any_results(artifacts_dir: Path, tag: str | None = None) -> Opt results = _resolve_results_path_for_run(latest_run) if results: return results - markers = _latest_markers(artifacts_dir, tag) - return _read_marker(markers.results) + return None def _load_run_meta(run_path: Path | None) -> Optional[dict]: From ce0d9c799199e2db7ae2646cd230d9ec236d0be6 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:32 +0300 Subject: [PATCH 69/92] Add anti-flake passes and update messaging --- examples/demo_qa/batch.py | 62 +++++++++++++++++++++++++++++++++++++-- examples/demo_qa/cli.py | 6 ++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 494881e..f42f9de 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -30,7 +30,7 @@ save_status, summarize, ) -from .runs.case_history import _append_case_history +from .runs.case_history import _append_case_history, _load_case_history from .runs.coverage import _missed_case_ids from .runs.effective import ( _append_effective_diff, @@ -118,19 +118,70 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: return ids +def _consecutive_passes( + case_id: str, + overlay_result: RunResult, + artifacts_dir: Path, + *, + tag: str | None, + scope_hash: str, + passes_required: int, + fail_on: str, + require_assert: bool, +) -> bool: + if passes_required <= 1: + return True + bad = bad_statuses(fail_on, require_assert) + if overlay_result.status in bad: + return False + count = 1 + history_path = artifacts_dir / "runs" / 
"cases" / f"{case_id}.jsonl" + entries = list(reversed(_load_case_history(history_path))) + for entry in entries: + if tag is not None and entry.get("tag") != tag: + continue + if scope_hash and entry.get("scope_hash") not in (None, scope_hash): + continue + status = str(entry.get("status", "")) + if status in bad: + break + count += 1 + if count >= passes_required: + return True + return count >= passes_required + + def _only_failed_selection( baseline_results: Mapping[str, RunResult] | None, overlay_results: Mapping[str, RunResult] | None, *, fail_on: str, require_assert: bool, + artifacts_dir: Path, + tag: str | None, + scope_hash: str, + anti_flake_passes: int, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} bad = bad_statuses(fail_on, require_assert) baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} - overlay_good = {cid for cid, res in overlay.items() if res.status not in bad} + overlay_good = { + cid + for cid, res in overlay.items() + if res.status not in bad + and _consecutive_passes( + cid, + res, + artifacts_dir, + tag=tag, + scope_hash=scope_hash, + passes_required=anti_flake_passes, + fail_on=fail_on, + require_assert=require_assert, + ) + } healed = baseline_bad & overlay_good selection = (baseline_bad - healed) | overlay_bad @@ -509,6 +560,10 @@ def handle_batch(args) -> int: overlay_results if not args.no_overlay else None, fail_on=args.fail_on, require_assert=args.require_assert, + artifacts_dir=artifacts_dir, + tag=args.tag, + scope_hash=scope_id, + anti_flake_passes=max(1, int(args.anti_flake_passes)), ) cases = [case for case in cases if case.id in selection_ids] failed_selection_ids = selection_ids @@ -831,6 +886,7 @@ def handle_batch(args) -> int: "fail_fast": args.fail_fast, "max_fails": args.max_fails, "no_overlay": args.no_overlay, + "anti_flake_passes": args.anti_flake_passes, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -1020,7 +1076,7 @@ def handle_case_open(args) -> int: artifacts_dir = args.artifacts_dir or (args.data / ".runs") run_path = _resolve_run_path(args.run, artifacts_dir) if not run_path: - print("No run found. Provide --run or ensure runs/latest.txt exists.", file=sys.stderr) + print("No run found. 
Provide --run or ensure runs/latest_any.txt exists (run a batch first).", file=sys.stderr) return 2 case_dir = _find_case_artifact(run_path, args.case_id) if not case_dir: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 6bcab36..7acc050 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -96,6 +96,12 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Ignore latest partial run when selecting only-failed/only-missed (use baseline only)", ) + batch_p.add_argument( + "--anti-flake-passes", + type=int, + default=2, + help="Require N consecutive PASS results to consider a test healed (applies to --only-failed overlay logic)", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") From 6ec96a10ee0a76e42d6bac80f884ace07c8c4fa8 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:36 +0300 Subject: [PATCH 70/92] Improve overlay logging and baseline flag help --- examples/demo_qa/batch.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index f42f9de..4af12c2 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -485,6 +485,10 @@ def handle_batch(args) -> int: only_failed_baseline_kind: str | None = None if baseline_filter_path_arg: only_failed_baseline_kind = "path" + print( + "Using explicit baseline from --only-failed-from; overlay (latest any run) will still be considered unless --no-overlay is set.", + file=sys.stderr, + ) elif args.tag and args.only_failed: effective_results, effective_meta, eff_path = _load_effective_results(artifacts_dir, args.tag) if not effective_results: @@ -570,10 +574,23 @@ def handle_batch(args) -> int: healed = breakdown.get("healed", set()) baseline_fails = breakdown.get("baseline_failures", set()) new_failures = breakdown.get("new_failures", set()) - baseline_label = str(_run_dir_from_results_path(baseline_filter_path) or baseline_filter_path or "n/a") - overlay_label = str(overlay_run_path or overlay_results_path or "n/a") - print(f"Baseline: {baseline_label}", file=sys.stderr) - print(f"Overlay: {overlay_label}", file=sys.stderr) + baseline_meta = _load_run_meta(_run_dir_from_results_path(baseline_filter_path)) + baseline_label = baseline_meta.get("run_id") if isinstance(baseline_meta, dict) else None + baseline_status = baseline_meta.get("run_status") if isinstance(baseline_meta, dict) else None + overlay_meta = _load_run_meta(overlay_run_path) + overlay_label = overlay_meta.get("run_id") if isinstance(overlay_meta, dict) else None + overlay_status = overlay_meta.get("run_status") if isinstance(overlay_meta, dict) else None + baseline_complete = baseline_meta.get("results_complete") if isinstance(baseline_meta, dict) else None + overlay_complete = overlay_meta.get("results_complete") if isinstance(overlay_meta, dict) else None + scope_display = scope_id or "n/a" + print( + f"Baseline: run_id={baseline_label or 'n/a'} status={baseline_status or 'n/a'} complete={baseline_complete} scope={scope_display}", + file=sys.stderr, + ) + print( + f"Overlay: run_id={overlay_label or 'n/a'} status={overlay_status or 'n/a'} complete={overlay_complete} scope={scope_display}", 
+ file=sys.stderr, + ) print(f"Baseline failures: {len(baseline_fails)}", file=sys.stderr) print(f"Healed by overlay: {len(healed)}", file=sys.stderr) print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) From ef193309bcfc7bf591bf89beb1290c9603a6fe4a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:41 +0300 Subject: [PATCH 71/92] Harden selection API defaults and stabilize only-missed logging --- examples/demo_qa/batch.py | 35 +++++++++++++++++------------------ tests/test_demo_qa_batch.py | 9 ++++++++- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 4af12c2..d7a6012 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -121,16 +121,16 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: def _consecutive_passes( case_id: str, overlay_result: RunResult, - artifacts_dir: Path, + artifacts_dir: Path | None = None, *, - tag: str | None, - scope_hash: str, - passes_required: int, - fail_on: str, - require_assert: bool, + tag: str | None = None, + scope_hash: str = "", + passes_required: int = 1, + fail_on: str = "bad", + require_assert: bool = False, ) -> bool: - if passes_required <= 1: - return True + if passes_required <= 1 or artifacts_dir is None: + return overlay_result.status not in bad_statuses(fail_on, require_assert) bad = bad_statuses(fail_on, require_assert) if overlay_result.status in bad: return False @@ -155,12 +155,12 @@ def _only_failed_selection( baseline_results: Mapping[str, RunResult] | None, overlay_results: Mapping[str, RunResult] | None, *, - fail_on: str, - require_assert: bool, - artifacts_dir: Path, - tag: str | None, - scope_hash: str, - anti_flake_passes: int, + fail_on: str = "bad", + require_assert: bool = False, + artifacts_dir: Path | None = None, + tag: str | None = None, + scope_hash: str = "", + anti_flake_passes: int = 1, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} @@ -640,19 +640,18 @@ def handle_batch(args) -> int: missed_baseline_results, overlay_results if not args.no_overlay else None, ) - base_pool = filtered_case_lookup target_ids = missed_ids if args.only_failed and failed_selection_ids is not None: target_ids = target_ids & failed_selection_ids print( - f"Combining --only-failed and --only-missed via intersection: {len(target_ids)} cases remain.", + f"Combining --only-failed and --only-missed via intersection: {len(target_ids)} cases remain (missed={len(missed_ids)}).", file=sys.stderr, ) - cases = [case for cid, case in base_pool.items() if cid in target_ids] + cases = [case for case in filtered_cases if case.id in target_ids] print(f"Baseline (missed) results: {missed_baseline_path}", file=sys.stderr) print(f"Overlay executed: {len(missed_breakdown.get('overlay_executed', set()))}", file=sys.stderr) print(f"Missed in baseline: {len(missed_breakdown.get('missed_base', set()))}", file=sys.stderr) - print(f"Final only-missed selection: {len(missed_ids)}", file=sys.stderr) + print(f"Final only-missed selection: {len(target_ids)}", file=sys.stderr) if not cases: print("0 missed cases selected.", file=sys.stderr) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 6226fc4..ddf1b0f 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -118,7 +118,14 @@ def test_only_failed_selection_uses_overlay_and_baseline() -> None: baseline = {"a": 
_mk_result("a", "failed"), "b": _mk_result("b", "failed")} overlay = {"a": _mk_result("a", "ok"), "c": _mk_result("c", "failed")} - selection, breakdown = _only_failed_selection(baseline, overlay, fail_on="bad", require_assert=False) + selection, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=None, + anti_flake_passes=1, + ) assert selection == {"b", "c"} assert breakdown["healed"] == {"a"} From 59c1a3f18f4654097272bf30332216ff696f0638 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:41:46 +0300 Subject: [PATCH 72/92] Clarify scope handling and streamline overlay resolution --- examples/demo_qa/batch.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index d7a6012..a0f1281 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -140,6 +140,7 @@ def _consecutive_passes( for entry in entries: if tag is not None and entry.get("tag") != tag: continue + # Old history entries may not contain scope_hash; treat missing as compatible for migration. if scope_hash and entry.get("scope_hash") not in (None, scope_hash): continue status = str(entry.get("status", "")) @@ -535,9 +536,7 @@ def handle_batch(args) -> int: return 2 overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") - overlay_results_path = _resolve_results_path_for_run(overlay_run_path) or _load_latest_any_results( - artifacts_dir, args.tag - ) + overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) if overlay_results_path and not args.no_overlay: try: overlay_results = load_results(overlay_results_path) @@ -1092,7 +1091,10 @@ def handle_case_open(args) -> int: artifacts_dir = args.artifacts_dir or (args.data / ".runs") run_path = _resolve_run_path(args.run, artifacts_dir) if not run_path: - print("No run found. Provide --run or ensure runs/latest_any.txt exists (run a batch first).", file=sys.stderr) + print( + "No run found. 
Provide --run or ensure latest markers exist (latest_any/latest_complete); run a batch first.", + file=sys.stderr, + ) return 2 case_dir = _find_case_artifact(run_path, args.case_id) if not case_dir: From 17e8154dd8b014ab9ebf3a5f7ccf61cfbbf24de0 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:50:24 +0300 Subject: [PATCH 73/92] Refine overlay logging and handling when disabled --- examples/demo_qa/batch.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index a0f1281..49b8b54 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -535,8 +535,12 @@ def handle_batch(args) -> int: print(f"Failed to read baseline for --compare-to: {exc}", file=sys.stderr) return 2 - overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") - overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) + overlay_run_path = None + overlay_results_path = None + overlay_disabled = args.no_overlay + if not overlay_disabled: + overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") + overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) if overlay_results_path and not args.no_overlay: try: overlay_results = load_results(overlay_results_path) @@ -586,10 +590,14 @@ def handle_batch(args) -> int: f"Baseline: run_id={baseline_label or 'n/a'} status={baseline_status or 'n/a'} complete={baseline_complete} scope={scope_display}", file=sys.stderr, ) - print( - f"Overlay: run_id={overlay_label or 'n/a'} status={overlay_status or 'n/a'} complete={overlay_complete} scope={scope_display}", - file=sys.stderr, + overlay_line = ( + "Overlay: disabled (--no-overlay)" + if args.no_overlay + else f"Overlay: run_id={overlay_label or 'n/a'} status={overlay_status or 'n/a'} complete={overlay_complete} scope={scope_display}" ) + if overlay_results_path is None and not args.no_overlay: + overlay_line = "Overlay: none (no latest_any run)" + print(overlay_line, file=sys.stderr) print(f"Baseline failures: {len(baseline_fails)}", file=sys.stderr) print(f"Healed by overlay: {len(healed)}", file=sys.stderr) print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) From 5036908d24917f42b91f36814a865f2e2900eb05 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 10:54:28 +0300 Subject: [PATCH 74/92] Tighten anti-flake scope handling and marker coverage --- examples/demo_qa/batch.py | 21 ++++++++++++++------- examples/demo_qa/cli.py | 5 +++++ tests/test_demo_qa_batch.py | 4 ++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 49b8b54..fba9d11 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -44,7 +44,6 @@ _load_latest_any_results, _load_latest_run, _load_run_meta, - _resolve_results_path_for_run, _run_dir_from_results_path, _update_latest_markers, ) @@ -128,6 +127,7 @@ def _consecutive_passes( passes_required: int = 1, fail_on: str = "bad", require_assert: bool = False, + strict_scope_history: bool = False, ) -> bool: if passes_required <= 1 or artifacts_dir is None: return overlay_result.status not in bad_statuses(fail_on, require_assert) @@ -140,9 +140,11 @@ def _consecutive_passes( for entry in entries: if tag is not None and entry.get("tag") != tag: continue - # Old history entries may not contain 
scope_hash; treat missing as compatible for migration. - if scope_hash and entry.get("scope_hash") not in (None, scope_hash): - continue + # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict. + if scope_hash: + entry_scope = entry.get("scope_hash") + if entry_scope != scope_hash and (strict_scope_history or entry_scope is not None): + continue status = str(entry.get("status", "")) if status in bad: break @@ -162,6 +164,7 @@ def _only_failed_selection( tag: str | None = None, scope_hash: str = "", anti_flake_passes: int = 1, + strict_scope_history: bool = False, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} @@ -181,6 +184,7 @@ def _only_failed_selection( passes_required=anti_flake_passes, fail_on=fail_on, require_assert=require_assert, + strict_scope_history=strict_scope_history, ) } @@ -459,7 +463,6 @@ def handle_batch(args) -> int: baseline_for_compare: Optional[Mapping[str, RunResult]] = None failed_baseline_results: Optional[Mapping[str, RunResult]] = None - failed_baseline_path: Path | None = None missed_baseline_results: Optional[Mapping[str, RunResult]] = None missed_baseline_path: Path | None = None overlay_results: Optional[Mapping[str, RunResult]] = None @@ -571,6 +574,7 @@ def handle_batch(args) -> int: tag=args.tag, scope_hash=scope_id, anti_flake_passes=max(1, int(args.anti_flake_passes)), + strict_scope_history=args.strict_scope_history, ) cases = [case for case in cases if case.id in selection_ids] failed_selection_ids = selection_ids @@ -910,6 +914,7 @@ def handle_batch(args) -> int: "max_fails": args.max_fails, "no_overlay": args.no_overlay, "anti_flake_passes": args.anti_flake_passes, + "strict_scope_history": args.strict_scope_history, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -1055,7 +1060,8 @@ def handle_case_run(args) -> int: artifacts_dir = args.artifacts_dir or (args.data / ".runs") timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}" + run_id = uuid.uuid4().hex[:8] + run_folder = artifacts_dir / "runs" / f"{timestamp}_{args.cases.stem}_{run_id}" artifacts_root = run_folder / "cases" results_path = run_folder / "results.jsonl" @@ -1074,7 +1080,7 @@ def handle_case_run(args) -> int: run_status = "FAILED" if bad_count else "SUCCESS" exit_code = 1 if bad_count else 0 summary = { - "run_id": run_folder.name, + "run_id": run_id, "timestamp": timestamp + "Z", "counts": counts, "results_path": str(results_path), @@ -1085,6 +1091,7 @@ def handle_case_run(args) -> int: "total_selected": 1, "total_executed": 1, "exit_code": exit_code, + "run_dir": str(run_folder), } summary_path = write_summary(results_path, summary) _update_latest_markers(run_folder, results_path, artifacts_dir, None, results_complete=True) diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index 7acc050..cf66445 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -102,6 +102,11 @@ def build_parser() -> argparse.ArgumentParser: default=2, help="Require N consecutive PASS results to consider a test healed (applies to --only-failed overlay logic)", ) + batch_p.add_argument( + "--strict-scope-history", + action="store_true", + help="Require scope_hash match in history when counting consecutive passes (disable migration fallback)", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") 
batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index ddf1b0f..26791c5 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -157,11 +157,13 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: assert latest_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert latest_default.results.read_text(encoding="utf-8").strip() == str(results_path) assert latest_default.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_default.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) latest_tag = _latest_markers(artifacts_dir, "feature/beta") assert latest_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert latest_tag.results.read_text(encoding="utf-8").strip() == str(results_path) assert latest_tag.any_run.read_text(encoding="utf-8").strip() == str(run_dir) + assert latest_tag.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) partial_dir = artifacts_dir / "runs" / "20240102_cases" partial_results = partial_dir / "results.jsonl" @@ -174,8 +176,10 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: assert refreshed_default.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert refreshed_default.results.read_text(encoding="utf-8").strip() == str(results_path) assert refreshed_default.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) + assert refreshed_default.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) refreshed_tag = _latest_markers(artifacts_dir, "feature/beta") assert refreshed_tag.complete.read_text(encoding="utf-8").strip() == str(run_dir) assert refreshed_tag.results.read_text(encoding="utf-8").strip() == str(results_path) assert refreshed_tag.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) + assert refreshed_tag.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) From b894f8860d06e3f779a01db0ae2c498c175d7abd Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 11:02:54 +0300 Subject: [PATCH 75/92] =?UTF-8?q?=D0=BA=D0=BE=D1=81=D0=BC=D0=B5=D1=82?= =?UTF-8?q?=D0=B8=D0=BA=D0=B0:=20=D1=81=D0=BE=D0=BE=D0=B1=D1=89=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20=D0=BE=D0=B1=20=D0=BE=D1=88=D0=B8=D0=B1?= =?UTF-8?q?=D0=BA=D0=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index bc3cf66..c8ad04d 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -618,7 +618,7 @@ def handle_batch(args) -> int: ) save_status(stub) results.append(stub) - print("Interrupted during case execution; saved partial status.", file=sys.stderr) + print("\nInterrupted during case execution; saved partial status.", file=sys.stderr) break results.append(result) if not args.quiet: From b27e9d0dda964d29c74aaadb95a1a1da1ef7eb4a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:03:55 +0300 Subject: [PATCH 76/92] Clean unused imports and tighten marker handling --- examples/demo_qa/batch.py | 1 - examples/demo_qa/runs/layout.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index fba9d11..2fbd41e 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -560,7 +560,6 @@ def handle_batch(args) -> int: exclude_ids=exclude_ids, ) suite_case_ids = [case.id for case in filtered_cases] - filtered_case_lookup = {case.id: case for case in filtered_cases} cases = filtered_cases failed_selection_ids: set[str] | None = None diff --git a/examples/demo_qa/runs/layout.py b/examples/demo_qa/runs/layout.py index 022fc08..89bbc09 100644 --- a/examples/demo_qa/runs/layout.py +++ b/examples/demo_qa/runs/layout.py @@ -136,7 +136,8 @@ def _update_latest_markers( if tag: marker_sets.add(_latest_markers(artifacts_dir, tag)) for markers in marker_sets: - markers.complete.parent.mkdir(parents=True, exist_ok=True) + for path in [markers.complete, markers.results, markers.any_run, markers.legacy_run]: + path.parent.mkdir(parents=True, exist_ok=True) markers.any_run.write_text(str(run_folder), encoding="utf-8") if results_complete: markers.legacy_run.write_text(str(run_folder), encoding="utf-8") From 1e14d104d958a0b02ee7a9962624d038d86d6eeb Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:14:06 +0300 Subject: [PATCH 77/92] Document scope migration and include run_dir in case summaries --- examples/demo_qa/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 2fbd41e..fbb3918 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -140,7 +140,7 @@ def _consecutive_passes( for entry in entries: if tag is not None and entry.get("tag") != tag: continue - # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict. + # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict_scope_history is set. 
if scope_hash: entry_scope = entry.get("scope_hash") if entry_scope != scope_hash and (strict_scope_history or entry_scope is not None): From 9dfaec7b83aef7d16a6f23b2b7d91b9848e0a003 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:27:30 +0300 Subject: [PATCH 78/92] Avoid overlay logging when disabled and remove unused imports --- examples/demo_qa/batch.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index fbb3918..fac8c16 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -544,13 +544,13 @@ def handle_batch(args) -> int: if not overlay_disabled: overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) - if overlay_results_path and not args.no_overlay: - try: - overlay_results = load_results(overlay_results_path) - except Exception as exc: - print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) - overlay_results_path = None - overlay_results = None + if overlay_results_path: + try: + overlay_results = load_results(overlay_results_path) + except Exception as exc: + print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) + overlay_results_path = None + overlay_results = None filtered_cases = _select_cases_for_rerun( cases, From b65f81b3bdb01bc2f53eb32d5a4aca3d5e72d57d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 11:34:10 +0300 Subject: [PATCH 79/92] Fix anti-flake double counting and respect baseline planned pool --- examples/demo_qa/batch.py | 17 ++++++++++++++- tests/test_demo_qa_batch.py | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index fac8c16..0f1b9bd 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -137,7 +137,11 @@ def _consecutive_passes( count = 1 history_path = artifacts_dir / "runs" / "cases" / f"{case_id}.jsonl" entries = list(reversed(_load_case_history(history_path))) + skip_first = True # overlay_result already counted; skip the most recent history entry for entry in entries: + if skip_first: + skip_first = False + continue if tag is not None and entry.get("tag") != tag: continue # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict_scope_history is set. 
@@ -607,6 +611,7 @@ def handle_batch(args) -> int: print(f"Final only-failed selection: {len(selection_ids)}", file=sys.stderr) only_missed_baseline_kind: str | None = None + missed_planned_ids: set[str] | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) if only_missed_from_arg: @@ -629,6 +634,9 @@ def handle_batch(args) -> int: missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" + planned_ids_meta = effective_meta.get("planned_case_ids") if isinstance(effective_meta, dict) else None + if isinstance(planned_ids_meta, list): + missed_planned_ids = {str(cid) for cid in planned_ids_meta} else: missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) if only_missed_from_arg: @@ -641,10 +649,17 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 + if missed_baseline_path is not None and missed_planned_ids is None: + missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) + baseline_meta = _load_run_meta(missed_baseline_run) + if isinstance(baseline_meta, dict): + planned_from_meta = baseline_meta.get("planned_case_ids") or baseline_meta.get("selected_case_ids") + if isinstance(planned_from_meta, list): + missed_planned_ids = {str(cid) for cid in planned_from_meta} if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - selected_case_ids = suite_case_ids + selected_case_ids = missed_planned_ids or set(suite_case_ids) missed_ids, missed_breakdown = _only_missed_selection( selected_case_ids, missed_baseline_results, diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 26791c5..34669de 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -144,6 +144,49 @@ def test_only_missed_selection_uses_overlay_executed() -> None: assert breakdown["overlay_executed"] == {"c"} +def test_anti_flake_requires_two_passes_without_double_count(tmp_path: Path) -> None: + artifacts_dir = tmp_path + case_id = "x1" + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True) + history_file = history_dir / f"{case_id}.jsonl" + # most recent in history = overlay run we already count + history_file.write_text( + json.dumps({"status": "ok", "scope_hash": "s"}, ensure_ascii=False) + "\n" + + json.dumps({"status": "failed", "scope_hash": "s"}, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + overlay_res = _mk_result(case_id, "ok") + healed = _consecutive_passes( + case_id, + overlay_res, + artifacts_dir, + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + strict_scope_history=True, + ) + assert healed is False + + +def test_only_missed_uses_planned_pool_from_baseline_meta(tmp_path: Path) -> None: + artifacts_dir = tmp_path + run_dir = artifacts_dir / "runs" / "r1" + run_dir.mkdir(parents=True) + baseline_results = {"a": _mk_result("a", "ok")} + results_path = run_dir / "results.jsonl" + results_path.write_text("", encoding="utf-8") + meta = {"planned_case_ids": ["a", "b"], "selected_case_ids": ["a", "b"], "scope_hash": "s"} + (run_dir / "run_meta.json").write_text(json.dumps(meta), encoding="utf-8") + + overlay = {"c": _mk_result("c", "ok")} + planned_pool = {"a", "b"} + missed, _ = _only_missed_selection(planned_pool, baseline_results, overlay) + + assert missed == {"b"} + + def 
test_update_latest_markers_handles_tag(tmp_path: Path) -> None: artifacts_dir = tmp_path / "data" / ".runs" run_dir = artifacts_dir / "runs" / "20240101_cases" From be56c6b61fb6a82d424e34981ecc515878a5e578 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 12:00:28 +0300 Subject: [PATCH 80/92] Add planned-pool helper and anti-flake regression tests --- examples/demo_qa/batch.py | 32 +++++++++++++++++++++----------- tests/test_demo_qa_batch.py | 9 +++------ 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 0f1b9bd..da379da 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -216,6 +216,24 @@ def _only_missed_selection( return missed_final, breakdown +def _planned_pool_from_meta( + effective_meta: Mapping[str, object] | None, baseline_results_path: Path | None, suite_case_ids: Iterable[str] +) -> set[str]: + planned: set[str] | None = None + if effective_meta: + planned_from_eff = effective_meta.get("planned_case_ids") + if isinstance(planned_from_eff, list): + planned = {str(cid) for cid in planned_from_eff} + if planned is None and baseline_results_path is not None: + run_dir = _run_dir_from_results_path(baseline_results_path) + meta = _load_run_meta(run_dir) + if isinstance(meta, dict): + planned_from_meta = meta.get("planned_case_ids") or meta.get("selected_case_ids") + if isinstance(planned_from_meta, list): + planned = {str(cid) for cid in planned_from_meta} + return planned or set(suite_case_ids) + + def _fingerprint_dir(data_dir: Path, *, verbose: bool = False) -> Mapping[str, object]: entries: list[dict] = [] total_bytes = 0 @@ -612,6 +630,7 @@ def handle_batch(args) -> int: only_missed_baseline_kind: str | None = None missed_planned_ids: set[str] | None = None + missed_effective_meta: Mapping[str, object] | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) if only_missed_from_arg: @@ -634,9 +653,7 @@ def handle_batch(args) -> int: missed_baseline_path = eff_path missed_baseline_results = effective_results only_missed_baseline_kind = "effective" - planned_ids_meta = effective_meta.get("planned_case_ids") if isinstance(effective_meta, dict) else None - if isinstance(planned_ids_meta, list): - missed_planned_ids = {str(cid) for cid in planned_ids_meta} + missed_effective_meta = effective_meta else: missed_baseline_path = only_missed_from_arg or _load_latest_results(artifacts_dir, args.tag) if only_missed_from_arg: @@ -649,17 +666,10 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to read baseline for --only-missed: {exc}", file=sys.stderr) return 2 - if missed_baseline_path is not None and missed_planned_ids is None: - missed_baseline_run = _run_dir_from_results_path(missed_baseline_path) - baseline_meta = _load_run_meta(missed_baseline_run) - if isinstance(baseline_meta, dict): - planned_from_meta = baseline_meta.get("planned_case_ids") or baseline_meta.get("selected_case_ids") - if isinstance(planned_from_meta, list): - missed_planned_ids = {str(cid) for cid in planned_from_meta} if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 - selected_case_ids = missed_planned_ids or set(suite_case_ids) + selected_case_ids = _planned_pool_from_meta(missed_effective_meta, missed_baseline_path, suite_case_ids) missed_ids, missed_breakdown = _only_missed_selection( 
selected_case_ids, missed_baseline_results, diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 34669de..61eab1d 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -13,6 +13,7 @@ _fingerprint_dir, _only_failed_selection, _only_missed_selection, + _planned_pool_from_meta, bad_statuses, is_failure, render_markdown, @@ -174,17 +175,13 @@ def test_only_missed_uses_planned_pool_from_baseline_meta(tmp_path: Path) -> Non artifacts_dir = tmp_path run_dir = artifacts_dir / "runs" / "r1" run_dir.mkdir(parents=True) - baseline_results = {"a": _mk_result("a", "ok")} results_path = run_dir / "results.jsonl" results_path.write_text("", encoding="utf-8") meta = {"planned_case_ids": ["a", "b"], "selected_case_ids": ["a", "b"], "scope_hash": "s"} (run_dir / "run_meta.json").write_text(json.dumps(meta), encoding="utf-8") - overlay = {"c": _mk_result("c", "ok")} - planned_pool = {"a", "b"} - missed, _ = _only_missed_selection(planned_pool, baseline_results, overlay) - - assert missed == {"b"} + planned_pool = _planned_pool_from_meta(None, results_path, ["x", "y"]) + assert planned_pool == {"a", "b"} def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: From 9062c690decc8b33f55ac3ada0f6be3d7d019eba Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 13:16:52 +0300 Subject: [PATCH 81/92] Improve anti-flake history handling (#92) * Improve anti-flake history handling * Fix anti-flake history filtering and ts handling --- examples/demo_qa/batch.py | 152 +++++++++++++++++++------- examples/demo_qa/cli.py | 11 ++ examples/demo_qa/runs/case_history.py | 113 ++++++++++++++++++- tests/test_demo_qa_batch.py | 99 +++++++++++++++-- 4 files changed, 327 insertions(+), 48 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index da379da..b1d39a1 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -30,7 +30,7 @@ save_status, summarize, ) -from .runs.case_history import _append_case_history, _load_case_history +from .runs.case_history import _append_case_history, _iter_case_entries_newest_first from .runs.coverage import _missed_case_ids from .runs.effective import ( _append_effective_diff, @@ -119,8 +119,8 @@ def _load_ids(path: Optional[Path]) -> set[str] | None: def _consecutive_passes( case_id: str, - overlay_result: RunResult, - artifacts_dir: Path | None = None, + overlay_entry: Mapping[str, object] | None, + history_path: Path | None, *, tag: str | None = None, scope_hash: str = "", @@ -128,34 +128,38 @@ def _consecutive_passes( fail_on: str = "bad", require_assert: bool = False, strict_scope_history: bool = False, -) -> bool: - if passes_required <= 1 or artifacts_dir is None: - return overlay_result.status not in bad_statuses(fail_on, require_assert) + max_entries: int | None = None, +) -> tuple[bool, list[dict]]: bad = bad_statuses(fail_on, require_assert) - if overlay_result.status in bad: - return False - count = 1 - history_path = artifacts_dir / "runs" / "cases" / f"{case_id}.jsonl" - entries = list(reversed(_load_case_history(history_path))) - skip_first = True # overlay_result already counted; skip the most recent history entry - for entry in entries: - if skip_first: - skip_first = False - continue - if tag is not None and entry.get("tag") != tag: - continue - # Old history entries may not contain scope_hash; treat missing as compatible for migration unless strict_scope_history is set. 
- if scope_hash: - entry_scope = entry.get("scope_hash") - if entry_scope != scope_hash and (strict_scope_history or entry_scope is not None): - continue + if overlay_entry is None: + return False, [] + if passes_required <= 1: + return (overlay_entry.get("status") not in bad, [dict(overlay_entry)]) + if history_path is None: + return False, [dict(overlay_entry)] + entries: list[dict] = [] + passes_needed = max(passes_required, 1) + iterator = _iter_case_entries_newest_first( + history_path, + case_id, + tag, + scope_hash or None, + strict_scope=strict_scope_history, + fail_on=fail_on, + require_assert=require_assert, + overlay_entry=dict(overlay_entry) if overlay_entry else None, + max_entries=max_entries or (passes_needed + 5), + ) + consecutive = 0 + for entry in iterator: + entries.append(entry) status = str(entry.get("status", "")) if status in bad: - break - count += 1 - if count >= passes_required: - return True - return count >= passes_required + return False, entries + consecutive += 1 + if consecutive >= passes_needed: + return True, entries + return False, entries def _only_failed_selection( @@ -169,20 +173,45 @@ def _only_failed_selection( scope_hash: str = "", anti_flake_passes: int = 1, strict_scope_history: bool = False, + overlay_run_meta: Mapping[str, object] | None = None, + overlay_run_path: Path | None = None, + explain_selection: bool = False, + explain_limit: int = 20, ) -> tuple[set[str], dict[str, object]]: baseline = baseline_results or {} overlay = overlay_results or {} bad = bad_statuses(fail_on, require_assert) baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} - overlay_good = { - cid - for cid, res in overlay.items() - if res.status not in bad - and _consecutive_passes( + overlay_run_id = cast(Optional[str], overlay_run_meta.get("run_id") if isinstance(overlay_run_meta, Mapping) else None) + overlay_ts: Optional[object] = None + if isinstance(overlay_run_meta, Mapping): + overlay_ts = overlay_run_meta.get("ended_at") or overlay_run_meta.get("timestamp") or overlay_run_meta.get("started_at") + if overlay_ts is None and overlay_run_path and overlay_run_path.exists(): + try: + overlay_ts = overlay_run_path.stat().st_mtime + except OSError: + overlay_ts = None + overlay_entries: dict[str, dict] = {} + for cid, res in overlay.items(): + entry = { + "run_id": overlay_run_id or (str(overlay_run_path) if overlay_run_path else "overlay"), + "ts": overlay_ts, + "timestamp": overlay_ts, + "status": res.status, + "scope_hash": scope_hash, + "tag": tag, + "run_dir": str(overlay_run_path) if overlay_run_path else None, + } + overlay_entries[cid] = {k: v for k, v in entry.items() if v is not None} + + overlay_good: set[str] = set() + healed_details: dict[str, list[dict]] = {} + for cid, res in overlay.items(): + ok, history_entries = _consecutive_passes( cid, - res, - artifacts_dir, + overlay_entries.get(cid), + artifacts_dir / "runs" / "cases" / f"{cid}.jsonl" if artifacts_dir else None, tag=tag, scope_hash=scope_hash, passes_required=anti_flake_passes, @@ -190,7 +219,10 @@ def _only_failed_selection( require_assert=require_assert, strict_scope_history=strict_scope_history, ) - } + if ok: + overlay_good.add(cid) + if explain_selection: + healed_details[cid] = history_entries healed = baseline_bad & overlay_good selection = (baseline_bad - healed) | overlay_bad @@ -199,9 +231,36 @@ def _only_failed_selection( "healed": healed, "new_failures": overlay_bad, } + if explain_selection 
and healed_details: + limit = max(1, explain_limit) + breakdown["healed_details"] = {cid: healed_details[cid] for cid in list(sorted(healed_details))[:limit]} return selection, breakdown +def _format_healed_explain( + healed: Iterable[str], + healed_details: Mapping[str, list[dict]] | None, + *, + anti_flake_passes: int, + limit: int, +) -> list[str]: + details = healed_details or {} + max_entries = max(1, limit) + lines: list[str] = [] + healed_list = sorted(set(healed)) + for cid in healed_list[:max_entries]: + entries = details.get(cid, []) + lines.append(f"Healed because last {anti_flake_passes} results are PASS for case {cid}") + for entry in entries[:anti_flake_passes]: + rid = entry.get("run_id") + ts = entry.get("ts") or entry.get("timestamp") + status = entry.get("status") + lines.append(f" - run_id={rid} ts={ts} status={status}") + if len(healed_list) > max_entries: + lines.append(f"... {len(healed_list) - max_entries} more healed cases not shown (limit={max_entries})") + return lines + + def _only_missed_selection( selected_case_ids: Iterable[str], baseline_results: Mapping[str, RunResult] | None, @@ -563,6 +622,7 @@ def handle_batch(args) -> int: overlay_run_path = None overlay_results_path = None overlay_disabled = args.no_overlay + overlay_run_meta: Optional[Mapping[str, object]] = None if not overlay_disabled: overlay_run_path = _load_latest_run(artifacts_dir, args.tag, kind="any") overlay_results_path = _load_latest_any_results(artifacts_dir, args.tag) @@ -573,6 +633,8 @@ def handle_batch(args) -> int: print(f"Failed to read overlay results from latest run: {exc}", file=sys.stderr) overlay_results_path = None overlay_results = None + if overlay_run_path: + overlay_run_meta = _load_run_meta(overlay_run_path) filtered_cases = _select_cases_for_rerun( cases, @@ -596,6 +658,10 @@ def handle_batch(args) -> int: scope_hash=scope_id, anti_flake_passes=max(1, int(args.anti_flake_passes)), strict_scope_history=args.strict_scope_history, + overlay_run_meta=overlay_run_meta, + overlay_run_path=overlay_run_path, + explain_selection=args.explain_selection, + explain_limit=args.explain_limit, ) cases = [case for case in cases if case.id in selection_ids] failed_selection_ids = selection_ids @@ -605,7 +671,7 @@ def handle_batch(args) -> int: baseline_meta = _load_run_meta(_run_dir_from_results_path(baseline_filter_path)) baseline_label = baseline_meta.get("run_id") if isinstance(baseline_meta, dict) else None baseline_status = baseline_meta.get("run_status") if isinstance(baseline_meta, dict) else None - overlay_meta = _load_run_meta(overlay_run_path) + overlay_meta = overlay_run_meta if overlay_run_meta is not None else _load_run_meta(overlay_run_path) overlay_label = overlay_meta.get("run_id") if isinstance(overlay_meta, dict) else None overlay_status = overlay_meta.get("run_status") if isinstance(overlay_meta, dict) else None baseline_complete = baseline_meta.get("results_complete") if isinstance(baseline_meta, dict) else None @@ -627,6 +693,15 @@ def handle_batch(args) -> int: print(f"Healed by overlay: {len(healed)}", file=sys.stderr) print(f"New failures in overlay: {len(new_failures)}", file=sys.stderr) print(f"Final only-failed selection: {len(selection_ids)}", file=sys.stderr) + if args.explain_selection and healed: + healed_lines = _format_healed_explain( + healed, + breakdown.get("healed_details"), + anti_flake_passes=args.anti_flake_passes, + limit=args.explain_limit, + ) + for line in healed_lines: + print(line, file=sys.stderr) only_missed_baseline_kind: str | None = None 
missed_planned_ids: set[str] | None = None @@ -939,6 +1014,8 @@ def handle_batch(args) -> int: "no_overlay": args.no_overlay, "anti_flake_passes": args.anti_flake_passes, "strict_scope_history": args.strict_scope_history, + "explain_selection": args.explain_selection, + "explain_limit": args.explain_limit, }, "interrupted": interrupted, "interrupted_at_case_id": interrupted_at_case_id, @@ -1010,6 +1087,7 @@ def handle_batch(args) -> int: git_sha=git_sha, run_dir=run_folder, results_path=results_path, + run_ts=_isoformat_utc(ended_at), ) history_path.parent.mkdir(parents=True, exist_ok=True) with history_path.open("a", encoding="utf-8") as f: diff --git a/examples/demo_qa/cli.py b/examples/demo_qa/cli.py index cf66445..5938355 100644 --- a/examples/demo_qa/cli.py +++ b/examples/demo_qa/cli.py @@ -107,6 +107,17 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Require scope_hash match in history when counting consecutive passes (disable migration fallback)", ) + batch_p.add_argument( + "--explain-selection", + action="store_true", + help="Explain why cases were selected/healed when using --only-failed/--only-missed", + ) + batch_p.add_argument( + "--explain-limit", + type=int, + default=20, + help="Maximum number of cases to include in explain output", + ) batch_p.add_argument("--plan-only", action="store_true", help="Run planner only (no fetch/synthesize)") batch_p.add_argument("--quiet", action="store_true", help="Print only summary and exit code") batch_p.add_argument("--show-failures", type=int, default=10, help="How many failing cases to show") diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py index 860483c..55ac7fb 100644 --- a/examples/demo_qa/runs/case_history.py +++ b/examples/demo_qa/runs/case_history.py @@ -2,10 +2,15 @@ import datetime import json +import logging from pathlib import Path -from typing import Optional +from typing import Iterable, Mapping, Optional from ..runner import RunResult +from .layout import _load_run_meta + + +logger = logging.getLogger(__name__) def _reason_text(res: RunResult) -> str: @@ -33,11 +38,14 @@ def _append_case_history( git_sha: str | None, run_dir: Path, results_path: Path, + run_ts: str | None, ) -> None: history_dir = artifacts_dir / "runs" / "cases" history_dir.mkdir(parents=True, exist_ok=True) + ts = run_ts or datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z") payload = { - "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), + "timestamp": ts, + "ts": ts, "run_id": run_id, "tag": tag, "note": note, @@ -74,4 +82,103 @@ def _load_case_history(path: Path) -> list[dict]: return entries -__all__ = ["_append_case_history", "_load_case_history"] +def _parse_ts(value: object | None) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return datetime.datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp() + except Exception: + try: + return float(value) + except Exception: + return None + return None + + +def _entry_ts(entry: Mapping[str, object], *, run_dir: Path | None) -> tuple[float | None, str | None]: + ts = _parse_ts(entry.get("ts")) or _parse_ts(entry.get("timestamp")) + if ts is not None: + return ts, None + meta_ts: float | None = None + if run_dir: + meta = _load_run_meta(run_dir) + if isinstance(meta, dict): + meta_ts = _parse_ts(meta.get("ended_at") or meta.get("timestamp") or 
meta.get("started_at")) + if meta_ts is None: + try: + meta_ts = run_dir.stat().st_mtime + except OSError: + meta_ts = None + return meta_ts, "history order fallback used" + + +def _iter_case_entries_newest_first( + history_path: Path, + case_id: str, + tag: str | None, + scope_hash: str | None, + *, + strict_scope: bool, + fail_on: str, + require_assert: bool, + overlay_entry: dict | None, + max_entries: int, +) -> Iterable[dict]: + entries = list(_load_case_history(history_path)) if history_path.exists() else [] + overlay_index = None + if overlay_entry: + overlay_index = len(entries) + entries.append(dict(overlay_entry)) + + accepted: dict[str, dict] = {} + ts_map: dict[str, float | None] = {} + is_overlay_map: dict[str, bool] = {} + warnings_emitted = False + for idx, entry in enumerate(entries): + if tag is not None and entry.get("tag") != tag: + continue + entry_scope = entry.get("scope_hash") + if scope_hash: + if entry_scope != scope_hash and (strict_scope or entry_scope is not None): + continue + run_id = str(entry.get("run_id")) if entry.get("run_id") is not None else None + if not run_id: + continue + run_dir = None + if entry.get("run_dir"): + run_dir = Path(str(entry["run_dir"])) + ts_value, warn = _entry_ts(entry, run_dir=run_dir) + if ts_value is None and warn: + warnings_emitted = True + is_overlay = overlay_entry is not None and idx == overlay_index + current_ts = ts_map.get(run_id) + current_is_overlay = is_overlay_map.get(run_id, False) + candidate_ts = ts_value + should_replace = False + if run_id not in accepted: + should_replace = True + else: + if candidate_ts is not None: + if current_ts is None or candidate_ts > current_ts or ( + candidate_ts == current_ts and is_overlay and not current_is_overlay + ): + should_replace = True + else: + if current_ts is None and is_overlay and not current_is_overlay: + should_replace = True + if should_replace: + accepted[run_id] = entry + ts_map[run_id] = candidate_ts + is_overlay_map[run_id] = is_overlay + if warnings_emitted: + logger.warning("ts missing; history order fallback used for case %s", case_id) + + sorted_entries = sorted(accepted.items(), key=lambda kv: ts_map.get(kv[0], 0), reverse=True) + for _, entry in sorted_entries[:max_entries]: + yield entry + + +__all__ = ["_append_case_history", "_iter_case_entries_newest_first", "_load_case_history"] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 61eab1d..0c46705 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -10,6 +10,8 @@ import pytest from examples.demo_qa.batch import ( + _consecutive_passes, + _format_healed_explain, _fingerprint_dir, _only_failed_selection, _only_missed_selection, @@ -151,24 +153,92 @@ def test_anti_flake_requires_two_passes_without_double_count(tmp_path: Path) -> history_dir = artifacts_dir / "runs" / "cases" history_dir.mkdir(parents=True) history_file = history_dir / f"{case_id}.jsonl" - # most recent in history = overlay run we already count + now = "2024-01-02T00:00:00Z" + history_entries = [ + {"status": "ok", "scope_hash": "s", "run_id": "r1", "ts": now, "timestamp": now}, + ] + history_file.write_text("\n".join(json.dumps(e, ensure_ascii=False) for e in history_entries), encoding="utf-8") + overlay_entry = {"status": "ok", "scope_hash": "s", "run_id": "r1", "ts": now, "timestamp": now} + healed, _ = _consecutive_passes( + case_id, + overlay_entry, + artifacts_dir / "runs" / "cases" / f"{case_id}.jsonl", + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + 
strict_scope_history=True, + ) + assert healed is False + + +def test_anti_flake_order_independent(tmp_path: Path) -> None: + artifacts_dir = tmp_path + case_id = "case-1" + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True) + history_file = history_dir / f"{case_id}.jsonl" + entries = [ + {"status": "ok", "scope_hash": "s", "run_id": "r2", "ts": "2024-01-03T00:00:00Z"}, + {"status": "failed", "scope_hash": "s", "run_id": "r1", "ts": "2024-01-01T00:00:00Z"}, + {"status": "ok", "scope_hash": "s", "run_id": "r3", "ts": "2024-01-02T00:00:00Z"}, + ] + history_file.write_text("\n".join(json.dumps(e, ensure_ascii=False) for e in entries), encoding="utf-8") + overlay_entry = {"status": "ok", "scope_hash": "s", "run_id": "r4", "ts": "2024-01-04T00:00:00Z"} + + healed, used_entries = _consecutive_passes( + case_id, + overlay_entry, + history_file, + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + strict_scope_history=True, + ) + assert healed is True + assert used_entries[0]["run_id"] == "r4" + + +def test_anti_flake_respects_legacy_scope_when_not_strict(tmp_path: Path) -> None: + artifacts_dir = tmp_path + case_id = "case-legacy" + history_dir = artifacts_dir / "runs" / "cases" + history_dir.mkdir(parents=True) + history_file = history_dir / f"{case_id}.jsonl" history_file.write_text( - json.dumps({"status": "ok", "scope_hash": "s"}, ensure_ascii=False) + "\n" - + json.dumps({"status": "failed", "scope_hash": "s"}, ensure_ascii=False) + "\n", + "\n".join( + json.dumps(e, ensure_ascii=False) + for e in [ + {"status": "ok", "scope_hash": None, "run_id": "r1", "ts": "2024-01-01T00:00:00Z"}, + ] + ), encoding="utf-8", ) - overlay_res = _mk_result(case_id, "ok") - healed = _consecutive_passes( + overlay_entry = {"status": "ok", "scope_hash": "s", "run_id": "r2", "ts": "2024-01-02T00:00:00Z"} + + healed_strict, _ = _consecutive_passes( case_id, - overlay_res, - artifacts_dir, + overlay_entry, + history_file, scope_hash="s", passes_required=2, fail_on="bad", require_assert=False, strict_scope_history=True, ) - assert healed is False + healed_migrating, _ = _consecutive_passes( + case_id, + overlay_entry, + history_file, + scope_hash="s", + passes_required=2, + fail_on="bad", + require_assert=False, + strict_scope_history=False, + ) + assert healed_strict is False + assert healed_migrating is True def test_only_missed_uses_planned_pool_from_baseline_meta(tmp_path: Path) -> None: @@ -223,3 +293,16 @@ def test_update_latest_markers_handles_tag(tmp_path: Path) -> None: assert refreshed_tag.results.read_text(encoding="utf-8").strip() == str(results_path) assert refreshed_tag.any_run.read_text(encoding="utf-8").strip() == str(partial_dir) assert refreshed_tag.legacy_run.read_text(encoding="utf-8").strip() == str(run_dir) + + +def test_format_healed_explain_includes_key_lines() -> None: + healed = {"a", "b"} + healed_details = { + "a": [ + {"run_id": "r2", "ts": "2024-01-02T00:00:00Z", "status": "ok"}, + {"run_id": "r1", "ts": "2024-01-01T00:00:00Z", "status": "ok"}, + ] + } + lines = _format_healed_explain(healed, healed_details, anti_flake_passes=2, limit=2) + assert any("Healed because last 2 results are PASS for case a" in line for line in lines) + assert any("run_id=r2" in line and "status=ok" in line for line in lines) From b58a4ce6c65e58e9da2b9fb558f52b3eda5bc63b Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 14:22:40 +0300 Subject: [PATCH 82/92] ruff fixes --- examples/demo_qa/batch.py | 4 ++-- tests/test_demo_qa_batch.py | 4 ++-- 
2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index b1d39a1..bd8fe39 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -14,8 +14,8 @@ from .provider_factory import build_provider from .runner import ( Case, - DiffReport, DiffCaseChange, + DiffReport, EventLogger, RunResult, RunTimings, @@ -40,8 +40,8 @@ ) from .runs.io import write_results from .runs.layout import ( - _load_latest_results, _load_latest_any_results, + _load_latest_results, _load_latest_run, _load_run_meta, _run_dir_from_results_path, diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 0c46705..f294848 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -11,8 +11,8 @@ from examples.demo_qa.batch import ( _consecutive_passes, - _format_healed_explain, _fingerprint_dir, + _format_healed_explain, _only_failed_selection, _only_missed_selection, _planned_pool_from_meta, @@ -21,9 +21,9 @@ render_markdown, write_results, ) +from examples.demo_qa.runner import DiffReport, RunResult, diff_runs from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers -from examples.demo_qa.runner import DiffReport, RunResult, diff_runs @pytest.mark.parametrize( From cdd6f35b1e70ed4fa20c2095d98f92e0d6a99726 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 14:49:50 +0300 Subject: [PATCH 83/92] Fix overlay scope handling for strict scope healing (#93) * Fix overlay scope handling for strict scope healing * Fix pyright typing issues in tests --- examples/demo_qa/batch.py | 50 +++++++++++++++++++++--- tests/test_demo_qa_batch.py | 74 ++++++++++++++++++++++++++++++++++++ tests/test_demo_qa_runner.py | 4 +- 3 files changed, 122 insertions(+), 6 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index bd8fe39..f92d9a2 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -184,14 +184,39 @@ def _only_failed_selection( baseline_bad = {cid for cid, res in baseline.items() if res.status in bad} overlay_bad = {cid for cid, res in overlay.items() if res.status in bad} overlay_run_id = cast(Optional[str], overlay_run_meta.get("run_id") if isinstance(overlay_run_meta, Mapping) else None) + overlay_scope_hash = cast(Optional[str], overlay_run_meta.get("scope_hash") if isinstance(overlay_run_meta, Mapping) else None) + overlay_tag = cast(Optional[str], overlay_run_meta.get("tag") if isinstance(overlay_run_meta, Mapping) else None) overlay_ts: Optional[object] = None if isinstance(overlay_run_meta, Mapping): - overlay_ts = overlay_run_meta.get("ended_at") or overlay_run_meta.get("timestamp") or overlay_run_meta.get("started_at") + overlay_ts = ( + overlay_run_meta.get("ended_at") + or overlay_run_meta.get("started_at") + or overlay_run_meta.get("ts") + or overlay_run_meta.get("timestamp") + ) if overlay_ts is None and overlay_run_path and overlay_run_path.exists(): try: overlay_ts = overlay_run_path.stat().st_mtime except OSError: overlay_ts = None + + current_scope_hash = scope_hash or None + overlay_scope_matches_current = True + if strict_scope_history and current_scope_hash: + overlay_scope_matches_current = overlay_scope_hash == current_scope_hash + + explain_lines: list[str] = [] + if explain_selection: + explain_lines.append( + f"current_scope_hash={current_scope_hash} overlay_scope_hash={overlay_scope_hash} " + 
f"overlay_scope_matches_current={overlay_scope_matches_current}" + ) + if overlay_tag is None and tag is not None: + explain_lines.append(f"Overlay tag missing; using current tag={tag} for overlay entries") + elif overlay_tag is not None and tag is not None and overlay_tag != tag: + explain_lines.append( + f"Overlay tag differs from current selection; using overlay tag={overlay_tag} (current tag={tag})" + ) overlay_entries: dict[str, dict] = {} for cid, res in overlay.items(): entry = { @@ -199,18 +224,20 @@ def _only_failed_selection( "ts": overlay_ts, "timestamp": overlay_ts, "status": res.status, - "scope_hash": scope_hash, - "tag": tag, + "scope_hash": overlay_scope_hash, + "tag": overlay_tag if overlay_tag is not None else tag, "run_dir": str(overlay_run_path) if overlay_run_path else None, } overlay_entries[cid] = {k: v for k, v in entry.items() if v is not None} overlay_good: set[str] = set() healed_details: dict[str, list[dict]] = {} + scope_mismatch_warned = False for cid, res in overlay.items(): + overlay_entry_for_history = overlay_entries.get(cid) if overlay_scope_matches_current else None ok, history_entries = _consecutive_passes( cid, - overlay_entries.get(cid), + overlay_entry_for_history, artifacts_dir / "runs" / "cases" / f"{cid}.jsonl" if artifacts_dir else None, tag=tag, scope_hash=scope_hash, @@ -219,6 +246,18 @@ def _only_failed_selection( require_assert=require_assert, strict_scope_history=strict_scope_history, ) + if explain_selection and strict_scope_history and not overlay_scope_matches_current: + if res.status not in bad: + explain_lines.append( + f"Overlay PASS for case {cid} ignored due to strict scope mismatch " + f"(overlay_scope_hash={overlay_scope_hash}, current_scope_hash={current_scope_hash})" + ) + elif not scope_mismatch_warned: + explain_lines.append( + f"Overlay scope mismatch; overlay failures still counted (overlay_scope_hash={overlay_scope_hash}, " + f"current_scope_hash={current_scope_hash})" + ) + scope_mismatch_warned = True if ok: overlay_good.add(cid) if explain_selection: @@ -234,6 +273,8 @@ def _only_failed_selection( if explain_selection and healed_details: limit = max(1, explain_limit) breakdown["healed_details"] = {cid: healed_details[cid] for cid in list(sorted(healed_details))[:limit]} + if explain_selection and explain_lines: + breakdown["explain"] = explain_lines return selection, breakdown @@ -704,7 +745,6 @@ def handle_batch(args) -> int: print(line, file=sys.stderr) only_missed_baseline_kind: str | None = None - missed_planned_ids: set[str] | None = None missed_effective_meta: Mapping[str, object] | None = None if args.only_missed: only_missed_from_arg = cast(Optional[Path], args.only_missed_from) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index f294848..832d3f1 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -147,6 +147,80 @@ def test_only_missed_selection_uses_overlay_executed() -> None: assert breakdown["overlay_executed"] == {"c"} +def test_only_failed_strict_scope_ignores_overlay_pass(tmp_path: Path) -> None: + baseline = {"A": _mk_result("A", "failed")} + overlay = {"A": _mk_result("A", "ok")} + overlay_meta = {"run_id": "overlay", "scope_hash": "scope_overlay", "ended_at": "2024-01-01T00:00:00Z"} + + selection, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=tmp_path, + tag="t1", + scope_hash="scope_current", + anti_flake_passes=1, + strict_scope_history=True, + overlay_run_meta=overlay_meta, + 
overlay_run_path=tmp_path, + explain_selection=True, + ) + + assert selection == {"A"} + assert breakdown["healed"] == set() + explain_lines = cast(list[str], breakdown.get("explain", []) or []) + assert any("overlay_scope_matches_current=False" in line for line in explain_lines) + + +def test_only_failed_strict_scope_allows_overlay_pass_when_scope_matches(tmp_path: Path) -> None: + baseline = {"A": _mk_result("A", "failed")} + overlay = {"A": _mk_result("A", "ok")} + overlay_meta = {"run_id": "overlay", "scope_hash": "scope_current", "ended_at": "2024-01-01T00:00:00Z"} + + selection, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=tmp_path, + tag="t1", + scope_hash="scope_current", + anti_flake_passes=1, + strict_scope_history=True, + overlay_run_meta=overlay_meta, + overlay_run_path=tmp_path, + explain_selection=True, + ) + + assert selection == set() + assert breakdown["healed"] == {"A"} + + +def test_only_failed_explain_notes_scope_mismatch(tmp_path: Path) -> None: + baseline = {"A": _mk_result("A", "failed")} + overlay = {"A": _mk_result("A", "ok")} + overlay_meta = {"run_id": "overlay", "scope_hash": "scope_other", "ended_at": "2024-01-01T00:00:00Z"} + + _, breakdown = _only_failed_selection( + baseline, + overlay, + fail_on="bad", + require_assert=False, + artifacts_dir=tmp_path, + tag="t1", + scope_hash="scope_current", + anti_flake_passes=1, + strict_scope_history=True, + overlay_run_meta=overlay_meta, + overlay_run_path=tmp_path, + explain_selection=True, + ) + + explain_lines = cast(list[str], breakdown.get("explain", []) or []) + assert any("ignored due to strict scope mismatch" in line for line in explain_lines) + + def test_anti_flake_requires_two_passes_without_double_count(tmp_path: Path) -> None: artifacts_dir = tmp_path case_id = "x1" diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index 0888dfe..eafa3fc 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import cast + from examples.demo_qa.runner import Case, RunResult, _match_expected, diff_runs, summarize @@ -9,7 +11,7 @@ def test_match_expected_unchecked_when_no_expectations() -> None: def test_match_expected_coerces_non_string_expected_values() -> None: - case = Case(id="c1", question="What is foo?", expected=42) + case = Case(id="c1", question="What is foo?", expected=cast(str, 42)) mismatch = _match_expected(case, "43") assert mismatch is not None From b91be9863eca28990a475b32f1b262a2b108e388 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 14:52:28 +0300 Subject: [PATCH 84/92] ruff fixes --- examples/demo_qa/runs/case_history.py | 1 - tests/test_demo_qa_runner.py | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/runs/case_history.py b/examples/demo_qa/runs/case_history.py index 55ac7fb..783589f 100644 --- a/examples/demo_qa/runs/case_history.py +++ b/examples/demo_qa/runs/case_history.py @@ -9,7 +9,6 @@ from ..runner import RunResult from .layout import _load_run_meta - logger = logging.getLogger(__name__) diff --git a/tests/test_demo_qa_runner.py b/tests/test_demo_qa_runner.py index eafa3fc..67fc66f 100644 --- a/tests/test_demo_qa_runner.py +++ b/tests/test_demo_qa_runner.py @@ -2,7 +2,13 @@ from typing import cast -from examples.demo_qa.runner import Case, RunResult, _match_expected, diff_runs, summarize +from examples.demo_qa.runner import ( + Case, + RunResult, + _match_expected, + 
diff_runs, + summarize, +) def test_match_expected_unchecked_when_no_expectations() -> None: From 9da728f403a336f7691900ce5fd467576c7c57d4 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 16:04:07 +0300 Subject: [PATCH 85/92] Handle only-missed overlay scope checks (#94) * Handle only-missed overlay scope checks * Respect scope/tag when applying only-missed overlay * Surface tag match flag when selection tag present --- examples/demo_qa/batch.py | 47 +++++++++++++++++++++++++++++-- tests/test_demo_qa_batch.py | 56 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index f92d9a2..62ca4e9 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -306,13 +306,49 @@ def _only_missed_selection( selected_case_ids: Iterable[str], baseline_results: Mapping[str, RunResult] | None, overlay_results: Mapping[str, RunResult] | None, + *, + overlay_scope_hash: str | None = None, + selection_scope_hash: str | None = None, + overlay_tag: str | None = None, + selection_tag: str | None = None, + overlay_disabled_reason: str | None = None, + overlay_ignored_reason: str | None = None, ) -> tuple[set[str], dict[str, object]]: selected = set(selected_case_ids) baseline_ids = set(baseline_results.keys()) if baseline_results else set() - overlay_executed = set(overlay_results.keys()) if overlay_results else set() + overlay_scope_matches_current: bool | None = None + overlay_tag_matches_current: bool | None = None + overlay_results_for_calc: Mapping[str, RunResult] | None = None + ignored_reason = overlay_ignored_reason or overlay_disabled_reason + if overlay_results is not None and overlay_disabled_reason is None: + overlay_scope_matches_current = ( + overlay_scope_hash == selection_scope_hash if overlay_scope_hash is not None and selection_scope_hash is not None else None + ) + overlay_tag_matches_current = ( + overlay_tag == selection_tag if overlay_tag is not None and selection_tag is not None else None + ) + overlay_results_for_calc = overlay_results + if overlay_scope_matches_current is False: + overlay_results_for_calc = None + ignored_reason = "scope_mismatch" + elif overlay_tag_matches_current is False: + overlay_results_for_calc = None + ignored_reason = "tag_mismatch" + overlay_executed = set(overlay_results_for_calc.keys()) if overlay_results_for_calc else set() missed_base = selected - baseline_ids missed_final = missed_base - overlay_executed - breakdown = {"missed_base": missed_base, "overlay_executed": overlay_executed} + breakdown: dict[str, object] = { + "missed_base": missed_base, + "overlay_executed": overlay_executed, + "overlay_scope_hash": overlay_scope_hash, + "overlay_scope_matches_current": overlay_scope_matches_current, + } + if overlay_tag is not None: + breakdown["overlay_tag"] = overlay_tag + if selection_tag is not None: + breakdown["overlay_tag_matches_current"] = overlay_tag_matches_current + if ignored_reason: + breakdown["overlay_ignored_reason"] = ignored_reason return missed_final, breakdown @@ -784,11 +820,18 @@ def handle_batch(args) -> int: if args.only_missed and missed_baseline_results is None: print("No baseline found for --only-missed.", file=sys.stderr) return 2 + overlay_scope_hash = cast(Optional[str], overlay_run_meta.get("scope_hash") if isinstance(overlay_run_meta, Mapping) else None) + overlay_tag = cast(Optional[str], overlay_run_meta.get("tag") if 
isinstance(overlay_run_meta, Mapping) else None) selected_case_ids = _planned_pool_from_meta(missed_effective_meta, missed_baseline_path, suite_case_ids) missed_ids, missed_breakdown = _only_missed_selection( selected_case_ids, missed_baseline_results, overlay_results if not args.no_overlay else None, + overlay_scope_hash=overlay_scope_hash, + selection_scope_hash=scope_id, + overlay_tag=overlay_tag, + selection_tag=args.tag, + overlay_disabled_reason="no_overlay" if args.no_overlay else None, ) target_ids = missed_ids if args.only_failed and failed_selection_ids is not None: diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 832d3f1..0d625d9 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -147,6 +147,62 @@ def test_only_missed_selection_uses_overlay_executed() -> None: assert breakdown["overlay_executed"] == {"c"} +def test_only_missed_ignores_overlay_when_scope_mismatches() -> None: + baseline = {"A": _mk_result("A", "ok")} + overlay = {"B": _mk_result("B", "ok")} + + missed, breakdown = _only_missed_selection( + ["A", "B", "C"], + baseline, + overlay, + overlay_scope_hash="overlay_scope", + selection_scope_hash="current_scope", + ) + + assert missed == {"B", "C"} + assert breakdown["missed_base"] == {"B", "C"} + assert breakdown["overlay_executed"] == set() + assert breakdown["overlay_scope_hash"] == "overlay_scope" + assert breakdown["overlay_scope_matches_current"] is False + assert breakdown["overlay_ignored_reason"] == "scope_mismatch" + + +def test_only_missed_applies_overlay_when_scope_matches() -> None: + baseline = {"A": _mk_result("A", "ok")} + overlay = {"B": _mk_result("B", "ok")} + + missed, breakdown = _only_missed_selection( + ["A", "B", "C"], + baseline, + overlay, + overlay_scope_hash="scope_current", + selection_scope_hash="scope_current", + ) + + assert missed == {"C"} + assert breakdown["missed_base"] == {"B", "C"} + assert breakdown["overlay_executed"] == {"B"} + assert breakdown["overlay_scope_hash"] == "scope_current" + assert breakdown["overlay_scope_matches_current"] is True + assert "overlay_ignored_reason" not in breakdown + + +def test_only_missed_exposes_tag_match_flag_even_when_overlay_tag_missing() -> None: + baseline = {"A": _mk_result("A", "ok")} + overlay = {"B": _mk_result("B", "ok")} + + _, breakdown = _only_missed_selection( + ["A", "B"], + baseline, + overlay, + selection_tag="current-tag", + ) + + assert breakdown["overlay_executed"] == {"B"} + assert "overlay_tag_matches_current" in breakdown + assert breakdown["overlay_tag_matches_current"] is None + + def test_only_failed_strict_scope_ignores_overlay_pass(tmp_path: Path) -> None: baseline = {"A": _mk_result("A", "failed")} overlay = {"A": _mk_result("A", "ok")} From 33816036fb0de0c3d6ede9e3339cbd1d89c6efd0 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 16:16:31 +0300 Subject: [PATCH 86/92] ruff fixes --- examples/demo_qa/chat_repl.py | 9 ++++++++- examples/demo_qa/data_gen.py | 7 ++++++- examples/demo_qa/llm/openai_adapter.py | 2 +- examples/demo_qa/runs/effective.py | 2 +- examples/demo_qa/schema_io.py | 10 +++++++--- examples/demo_qa/settings.py | 3 ++- examples/retail_orders/demo_agent.py | 2 +- examples/retail_orders/demo_agent_sql.py | 2 +- examples/retail_orders/schema.py | 1 - 9 files changed, 27 insertions(+), 11 deletions(-) diff --git a/examples/demo_qa/chat_repl.py b/examples/demo_qa/chat_repl.py index 2ebd14c..5d3feb6 100644 --- a/examples/demo_qa/chat_repl.py +++ b/examples/demo_qa/chat_repl.py @@ -8,7 +8,14 @@ from 
typing import Optional, Sequence from .provider_factory import build_provider -from .runner import Case, EventLogger, RunArtifacts, build_agent, run_one, save_artifacts +from .runner import ( + Case, + EventLogger, + RunArtifacts, + build_agent, + run_one, + save_artifacts, +) def _load_json(path: Path) -> object | None: diff --git a/examples/demo_qa/data_gen.py b/examples/demo_qa/data_gen.py index 4732ee1..3faf0db 100644 --- a/examples/demo_qa/data_gen.py +++ b/examples/demo_qa/data_gen.py @@ -9,7 +9,12 @@ import pandas as pd -from fetchgraph.relational.schema import ColumnConfig, EntityConfig, RelationConfig, SchemaConfig +from fetchgraph.relational.schema import ( + ColumnConfig, + EntityConfig, + RelationConfig, + SchemaConfig, +) @dataclass diff --git a/examples/demo_qa/llm/openai_adapter.py b/examples/demo_qa/llm/openai_adapter.py index 810c752..e298a1f 100644 --- a/examples/demo_qa/llm/openai_adapter.py +++ b/examples/demo_qa/llm/openai_adapter.py @@ -1,7 +1,7 @@ from __future__ import annotations -import os import logging +import os from typing import Any, Dict, Tuple from urllib.parse import urlparse diff --git a/examples/demo_qa/runs/effective.py b/examples/demo_qa/runs/effective.py index ca2457d..444f086 100644 --- a/examples/demo_qa/runs/effective.py +++ b/examples/demo_qa/runs/effective.py @@ -8,8 +8,8 @@ from ..runner import RunResult, bad_statuses, load_results, summarize from ..utils import dump_json from .coverage import _missed_case_ids -from .layout import _effective_paths from .io import write_results +from .layout import _effective_paths def _load_effective_results(artifacts_dir: Path, tag: str) -> tuple[dict[str, RunResult], Optional[dict], Path]: diff --git a/examples/demo_qa/schema_io.py b/examples/demo_qa/schema_io.py index 3525ad3..febcd70 100644 --- a/examples/demo_qa/schema_io.py +++ b/examples/demo_qa/schema_io.py @@ -1,11 +1,15 @@ from __future__ import annotations +import json from pathlib import Path from typing import Any, Dict -import json - -from fetchgraph.relational.schema import ColumnConfig, EntityConfig, RelationConfig, SchemaConfig +from fetchgraph.relational.schema import ( + ColumnConfig, + EntityConfig, + RelationConfig, + SchemaConfig, +) def _entity_from_dict(data: Dict[str, Any]) -> EntityConfig: diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index be3f03a..3d2e34e 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -8,8 +8,9 @@ from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator try: - from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings.sources import TomlConfigSettingsSource + + from pydantic_settings import BaseSettings, SettingsConfigDict except ImportError as exc: # pragma: no cover - make missing dependency explicit raise ImportError( "pydantic-settings is required for demo_qa configuration. " diff --git a/examples/retail_orders/demo_agent.py b/examples/retail_orders/demo_agent.py index 5ed9b29..20b96a0 100644 --- a/examples/retail_orders/demo_agent.py +++ b/examples/retail_orders/demo_agent.py @@ -8,7 +8,7 @@ from fetchgraph.core import TaskProfile, create_generic_agent from fetchgraph.relational.schema import SchemaConfig # только для типа, не обязательно -from .schema import build_retail_provider, RETAIL_SCHEMA +from .schema import RETAIL_SCHEMA, build_retail_provider # Простейшая заглушка LLM, чтобы пример запускался без внешних зависимостей. 
diff --git a/examples/retail_orders/demo_agent_sql.py b/examples/retail_orders/demo_agent_sql.py index 5fdeab1..1014933 100644 --- a/examples/retail_orders/demo_agent_sql.py +++ b/examples/retail_orders/demo_agent_sql.py @@ -1,7 +1,7 @@ from __future__ import annotations -from pathlib import Path import sqlite3 +from pathlib import Path import pandas as pd diff --git a/examples/retail_orders/schema.py b/examples/retail_orders/schema.py index b61297b..46d434f 100644 --- a/examples/retail_orders/schema.py +++ b/examples/retail_orders/schema.py @@ -11,7 +11,6 @@ build_pandas_provider_from_schema, ) - RETAIL_SCHEMA = SchemaConfig( name="retail_orders", label="Интернет-магазин: клиенты, заказы и товары", From b71662f440f341912cf7016a4b0d1d267c3ebe7a Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:31:36 +0300 Subject: [PATCH 87/92] Capture resolved demo_qa config metadata --- examples/demo_qa/batch.py | 12 +-- examples/demo_qa/settings.py | 4 +- tests/test_demo_qa_batch.py | 116 ++++++++++++++++++++++++- tests/test_demo_qa_settings.py | 9 +- tests/test_demo_qa_settings_sources.py | 3 +- 5 files changed, 131 insertions(+), 13 deletions(-) diff --git a/examples/demo_qa/batch.py b/examples/demo_qa/batch.py index 2877485..79e1ec1 100644 --- a/examples/demo_qa/batch.py +++ b/examples/demo_qa/batch.py @@ -429,7 +429,7 @@ def _resolve_run_path(path: Path | None, artifacts_dir: Path) -> Optional[Path]: def handle_chat(args) -> int: try: - settings = load_settings(config_path=args.config, data_dir=args.data) + settings, _ = load_settings(config_path=args.config, data_dir=args.data) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 @@ -605,10 +605,10 @@ def handle_batch(args) -> int: data_dir = Path(args.data) schema_path = Path(args.schema) cases_path = Path(args.cases) - config_path = Path(args.config) if args.config else None + cli_config_path = Path(args.config) if args.config else None try: - settings = load_settings(config_path=config_path, data_dir=data_dir) + settings, resolved_config_path = load_settings(config_path=cli_config_path, data_dir=data_dir) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 @@ -1047,7 +1047,7 @@ def handle_batch(args) -> int: except Exception as exc: print(f"Failed to update effective results for tag {args.tag!r}: {exc}", file=sys.stderr) - config_hash = _hash_file(config_path) if config_path else None + config_hash = _hash_file(resolved_config_path) if resolved_config_path else None schema_hash = _hash_file(schema_path) data_fingerprint = _fingerprint_dir(data_dir, verbose=args.fingerprint_verbose) git_sha = _git_sha() @@ -1060,7 +1060,7 @@ def handle_batch(args) -> int: "inputs": { "cases_path": str(cases_path), "cases_hash": cases_hash, - "config_path": str(config_path) if config_path else None, + "config_path": str(resolved_config_path) if resolved_config_path else None, "config_hash": config_hash, "schema_path": str(schema_path), "schema_hash": schema_hash, @@ -1230,7 +1230,7 @@ def handle_batch(args) -> int: def handle_case_run(args) -> int: try: - settings = load_settings(config_path=args.config, data_dir=args.data) + settings, _ = load_settings(config_path=args.config, data_dir=args.data) except Exception as exc: print(f"Configuration error: {exc}", file=sys.stderr) return 2 diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index 3d2e34e..e471728 100644 --- 
a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -118,7 +118,7 @@ def load_settings( config_path: Path | None = None, data_dir: Path | None = None, overrides: Dict[str, Any] | None = None, -) -> DemoQASettings: +) -> tuple[DemoQASettings, Path | None]: resolved = resolve_config_path(config_path, data_dir) DemoQASettings._toml_path = resolved try: @@ -127,7 +127,7 @@ def load_settings( DemoQASettings._toml_path = None raise DemoQASettings._toml_path = None - return settings + return settings, resolved __all__ = ["DemoQASettings", "LLMSettings", "resolve_config_path", "load_settings"] diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 0d625d9..c46f589 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -5,10 +5,13 @@ import os import time from pathlib import Path +from types import SimpleNamespace from typing import cast import pytest +import examples.demo_qa.batch as batch +from examples.demo_qa.cli import build_parser from examples.demo_qa.batch import ( _consecutive_passes, _fingerprint_dir, @@ -21,7 +24,7 @@ render_markdown, write_results, ) -from examples.demo_qa.runner import DiffReport, RunResult, diff_runs +from examples.demo_qa.runner import DiffReport, RunResult, RunTimings, diff_runs from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers @@ -436,3 +439,114 @@ def test_format_healed_explain_includes_key_lines() -> None: lines = _format_healed_explain(healed, healed_details, anti_flake_passes=2, limit=2) assert any("Healed because last 2 results are PASS for case a" in line for line in lines) assert any("run_id=r2" in line and "status=ok" in line for line in lines) + + +def _stubbed_run_one(case, runner, artifacts_root, *, plan_only=False, event_logger=None): + run_dir = artifacts_root / f"{case.id}_stub" + run_dir.mkdir(parents=True, exist_ok=True) + return RunResult( + id=case.id, + question=case.question, + status="ok", + checked=case.has_asserts, + reason=None, + details=None, + artifacts_dir=str(run_dir), + duration_ms=1, + tags=list(case.tags), + answer="ok", + error=None, + plan_path=str(run_dir / "plan.json"), + timings=RunTimings(), + expected_check=None, + ) + + +def _prepare_batch_inputs(tmp_path: Path) -> tuple[Path, Path, Path, Path]: + data_dir = tmp_path / "data" + data_dir.mkdir(exist_ok=True) + schema_path = tmp_path / "schema.json" + schema_path.write_text("{}", encoding="utf-8") + cases_path = tmp_path / "cases.jsonl" + cases_path.write_text('[{"id":"c1","question":"Q?"}]', encoding="utf-8") + artifacts_dir = tmp_path / "artifacts" + return data_dir, schema_path, cases_path, artifacts_dir + + +def _run_batch_and_meta( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + *, + cli_config: Path | None = None, + env_api_key: str | None = None, +) -> dict: + data_dir, schema_path, cases_path, artifacts_dir = _prepare_batch_inputs(tmp_path) + + monkeypatch.setattr( + batch, + "build_provider", + lambda data_dir, schema_path, enable_semantic=False, embedding_model=None: (SimpleNamespace(name="dummy"), None), + ) + monkeypatch.setattr(batch, "build_llm", lambda settings: SimpleNamespace()) + monkeypatch.setattr(batch, "build_agent", lambda llm, provider: SimpleNamespace()) + monkeypatch.setattr(batch, "run_one", _stubbed_run_one) + monkeypatch.setattr(batch, "configure_logging", lambda **kwargs: None) + + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + if env_api_key is not None: + 
monkeypatch.setenv("OPENAI_API_KEY", env_api_key) + + args_list = [ + "batch", + "--data", + str(data_dir), + "--schema", + str(schema_path), + "--cases", + str(cases_path), + "--artifacts-dir", + str(artifacts_dir), + "--events", + "off", + "--quiet", + ] + if cli_config is not None: + args_list.extend(["--config", str(cli_config)]) + args = build_parser().parse_args(args_list) + exit_code = batch.handle_batch(args) + assert exit_code == 0 + + run_meta_path = next((artifacts_dir / "runs").rglob("run_meta.json")) + return json.loads(run_meta_path.read_text(encoding="utf-8")) + + +def test_default_config_is_discovered_and_hashed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + data_dir, _, _, _ = _prepare_batch_inputs(tmp_path) + default_config = data_dir / "demo_qa.toml" + default_config.write_text('[llm]\napi_key="sk-default"\n', encoding="utf-8") + + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key=None) + + assert run_meta["inputs"]["config_path"] == str(default_config) + assert run_meta["inputs"]["config_hash"] == batch._hash_file(default_config) + + +def test_explicit_config_path_wins(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + data_dir, _, _, _ = _prepare_batch_inputs(tmp_path) + # Default config exists but should be ignored when CLI is provided. + default_config = data_dir / "demo_qa.toml" + default_config.write_text('[llm]\napi_key="sk-default"\n', encoding="utf-8") + explicit_config = tmp_path / "custom.toml" + explicit_config.write_text('[llm]\napi_key="sk-explicit"\n', encoding="utf-8") + + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, cli_config=explicit_config, env_api_key=None) + + assert run_meta["inputs"]["config_path"] == str(explicit_config) + assert run_meta["inputs"]["config_hash"] == batch._hash_file(explicit_config) + + +def test_no_config_available_sets_none_in_meta(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key="sk-env") + + assert run_meta["inputs"]["config_path"] is None + assert run_meta["inputs"]["config_hash"] is None diff --git a/tests/test_demo_qa_settings.py b/tests/test_demo_qa_settings.py index 16a1f30..c7ced7b 100644 --- a/tests/test_demo_qa_settings.py +++ b/tests/test_demo_qa_settings.py @@ -29,7 +29,8 @@ def test_env_overrides_toml(tmp_path, monkeypatch): monkeypatch.setenv("DEMO_QA_LLM__API_KEY", "sk-from-env") monkeypatch.setenv("DEMO_QA_LLM__PLAN_MODEL", "env-plan") - settings = load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path assert settings.llm.api_key == "sk-from-env" assert settings.llm.base_url == "http://localhost:1234/v1" assert settings.llm.plan_model == "env-plan" @@ -61,7 +62,8 @@ def test_openai_key_from_global_env(tmp_path, monkeypatch): ) monkeypatch.setenv("OPENAI_API_KEY", "sk-global") - settings = load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path assert settings.llm.api_key == "sk-global" @@ -96,7 +98,8 @@ def _store_and_return(kwargs): monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=FakeOpenAI)) - settings = load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path llm = build_llm(settings) result = llm("hello", sender="generic_plan") diff --git a/tests/test_demo_qa_settings_sources.py b/tests/test_demo_qa_settings_sources.py index 
f828329..c8ec38d 100644 --- a/tests/test_demo_qa_settings_sources.py +++ b/tests/test_demo_qa_settings_sources.py @@ -25,8 +25,9 @@ def test_source_priorities(tmp_path, monkeypatch): monkeypatch.setenv("DEMO_QA_LLM__API_KEY", "sk-env") monkeypatch.setenv("DEMO_QA_LLM__PLAN_MODEL", "env-plan") - settings = load_settings(config_path=config_path, overrides={"llm": {"plan_model": "override-plan"}}) + settings, resolved = load_settings(config_path=config_path, overrides={"llm": {"plan_model": "override-plan"}}) + assert resolved == config_path assert settings.llm.api_key == "sk-env" assert settings.llm.plan_model == "override-plan" assert settings.llm.synth_model == "toml-synth" From 066228a874e79dabc4e7b8483eca81dbf1eadf3d Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:40:20 +0300 Subject: [PATCH 88/92] Fix demo QA batch test config inputs --- tests/test_demo_qa_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index c46f589..031d7b1 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -468,7 +468,7 @@ def _prepare_batch_inputs(tmp_path: Path) -> tuple[Path, Path, Path, Path]: schema_path = tmp_path / "schema.json" schema_path.write_text("{}", encoding="utf-8") cases_path = tmp_path / "cases.jsonl" - cases_path.write_text('[{"id":"c1","question":"Q?"}]', encoding="utf-8") + cases_path.write_text('{"id":"c1","question":"Q?"}\n', encoding="utf-8") artifacts_dir = tmp_path / "artifacts" return data_dir, schema_path, cases_path, artifacts_dir From 98879f0f22299c4a8be989019a3d394cbb885ab7 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:44:01 +0300 Subject: [PATCH 89/92] Write demo QA batch test case in jsonl format From b6796b8ed007b58e3a5d7026c86b5596cdf341a7 Mon Sep 17 00:00:00 2001 From: AlexanderOnischenko <74920855+AlexanderOnischenko@users.noreply.github.com> Date: Sat, 3 Jan 2026 17:51:11 +0300 Subject: [PATCH 90/92] Expect packaged default demo QA config --- .gitignore | 1 + examples/demo_qa/demo_qa.toml | 9 +++++++++ tests/test_demo_qa_batch.py | 10 +++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 examples/demo_qa/demo_qa.toml diff --git a/.gitignore b/.gitignore index 2ca3c0a..4a87019 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,6 @@ build/ .pytest_cache/ examples/demo_qa/demo_qa.toml **/demo_qa.toml +!examples/demo_qa/demo_qa.toml .env.demo_qa _demo_data/*/.runs/* diff --git a/examples/demo_qa/demo_qa.toml b/examples/demo_qa/demo_qa.toml new file mode 100644 index 0000000..7dd49e9 --- /dev/null +++ b/examples/demo_qa/demo_qa.toml @@ -0,0 +1,9 @@ +[llm] +api_key = "unused" +base_url = "http://localhost:8000/v1" +plan_model = "gpt-4o-mini" +synth_model = "gpt-4o-mini" +plan_temperature = 0.0 +synth_temperature = 0.2 +timeout_s = 900 +retries = 2 diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 031d7b1..692cd85 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -545,8 +545,12 @@ def test_explicit_config_path_wins(tmp_path: Path, monkeypatch: pytest.MonkeyPat assert run_meta["inputs"]["config_hash"] == batch._hash_file(explicit_config) -def test_no_config_available_sets_none_in_meta(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_packaged_default_config_used_when_no_cli_or_data_dir(tmp_path: Path, 
monkeypatch: pytest.MonkeyPatch) -> None: run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key="sk-env") - assert run_meta["inputs"]["config_path"] is None - assert run_meta["inputs"]["config_hash"] is None + config_path = run_meta["inputs"]["config_path"] + assert config_path is not None + + expected_default = Path(batch.__file__).resolve().parent / "demo_qa.toml" + assert Path(config_path) == expected_default + assert run_meta["inputs"]["config_hash"] == batch._hash_file(expected_default) From f5ad53920543563131fcbadc38cd991455484e15 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 19:30:53 +0300 Subject: [PATCH 91/92] Make api_key optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 - README_demo_qa.md | 4 +- examples/demo_qa/demo_qa.toml | 9 --- examples/demo_qa/demo_qa.toml.example | 5 +- examples/demo_qa/llm/openai_adapter.py | 18 ++--- examples/demo_qa/settings.py | 15 +--- tests/test_demo_qa_batch.py | 4 +- tests/test_demo_qa_settings.py | 96 ++++++++++++++++++++------ 8 files changed, 94 insertions(+), 59 deletions(-) delete mode 100644 examples/demo_qa/demo_qa.toml diff --git a/.gitignore b/.gitignore index 4a87019..c874fb3 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,6 @@ __pycache__/ dist/ build/ .pytest_cache/ -examples/demo_qa/demo_qa.toml **/demo_qa.toml -!examples/demo_qa/demo_qa.toml .env.demo_qa _demo_data/*/.runs/* diff --git a/README_demo_qa.md b/README_demo_qa.md index 8a8cc07..3b453c6 100644 --- a/README_demo_qa.md +++ b/README_demo_qa.md @@ -17,6 +17,7 @@ python -m examples.demo_qa.cli gen --out demo_data --rows 1000 --seed 42 ### Файл demo_qa.toml См. шаблон `examples/demo_qa/demo_qa.toml.example`. Автопоиск: `--config`, затем `/demo_qa.toml`, затем `examples/demo_qa/demo_qa.toml`. +`llm.api_key` можно опустить: при инициализации LLM используется `OPENAI_API_KEY`, а при его отсутствии — строка `"unused"`. ### .env.demo_qa Пример: @@ -30,6 +31,7 @@ DEMO_QA_LLM__BASE_URL=http://localhost:8000/v1 export DEMO_QA_LLM__API_KEY=sk-... export DEMO_QA_LLM__BASE_URL=http://localhost:8000/v1 ``` +Если не задавать `DEMO_QA_LLM__API_KEY` и не выставлять `OPENAI_API_KEY`, LLM-клиент подставит `"unused"` и не упадёт. ### Зависимости демо * Требуется Python 3.11+ (используется стандартный `tomllib`). @@ -44,7 +46,7 @@ pip install -r examples/demo_qa/requirements.txt ### OpenAI / совместимый прокси 1. Скопируйте `examples/demo_qa/demo_qa.toml.example` в удобное место и укажите - `llm.api_key` (можно `env:OPENAI_API_KEY` или любое значение, если прокси не проверяет ключ), + при необходимости `llm.api_key` (можно `env:OPENAI_API_KEY`; если не указать, возьмётся `OPENAI_API_KEY` или `"unused"`), `base_url` (формат `http://host:port/v1`), модели и температуры. 2.
Запустите чат с указанием конфига: ```bash diff --git a/examples/demo_qa/demo_qa.toml b/examples/demo_qa/demo_qa.toml deleted file mode 100644 index 7dd49e9..0000000 --- a/examples/demo_qa/demo_qa.toml +++ /dev/null @@ -1,9 +0,0 @@ -[llm] -api_key = "unused" -base_url = "http://localhost:8000/v1" -plan_model = "gpt-4o-mini" -synth_model = "gpt-4o-mini" -plan_temperature = 0.0 -synth_temperature = 0.2 -timeout_s = 900 -retries = 2 diff --git a/examples/demo_qa/demo_qa.toml.example b/examples/demo_qa/demo_qa.toml.example index 7dd49e9..4b12c2c 100644 --- a/examples/demo_qa/demo_qa.toml.example +++ b/examples/demo_qa/demo_qa.toml.example @@ -1,8 +1,7 @@ [llm] -api_key = "unused" base_url = "http://localhost:8000/v1" -plan_model = "gpt-4o-mini" -synth_model = "gpt-4o-mini" +plan_model = "default" +synth_model = "default" plan_temperature = 0.0 synth_temperature = 0.2 timeout_s = 900 diff --git a/examples/demo_qa/llm/openai_adapter.py b/examples/demo_qa/llm/openai_adapter.py index e298a1f..80bc7af 100644 --- a/examples/demo_qa/llm/openai_adapter.py +++ b/examples/demo_qa/llm/openai_adapter.py @@ -46,15 +46,15 @@ def __init__( self.logger.info("OpenAILLM using endpoint %s", endpoint) def _resolve_api_key(self, api_key: str | None) -> str: - if api_key is None: - raise RuntimeError("OpenAI provider selected but llm.api_key is missing.") - if api_key.startswith("env:"): - env_var = api_key.split(":", 1)[1] - value = os.getenv(env_var) - if not value: - raise RuntimeError(f"Environment variable {env_var} referenced in config but not set.") - return value - return api_key + if api_key: + if api_key.startswith("env:"): + env_var = api_key.split(":", 1)[1] + value = os.getenv(env_var) + return value or "unused" + return api_key + + env_key = os.getenv("OPENAI_API_KEY") + return env_key or "unused" def _validate_base_url(self, base_url: str | None) -> str | None: if base_url in (None, ""): diff --git a/examples/demo_qa/settings.py b/examples/demo_qa/settings.py index e471728..546cfb4 100644 --- a/examples/demo_qa/settings.py +++ b/examples/demo_qa/settings.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os from pathlib import Path from typing import Any, ClassVar, Dict from urllib.parse import urlparse @@ -17,6 +16,7 @@ "Install demo extras via `pip install -e .[demo]` or `pip install -r examples/demo_qa/requirements.txt`." ) from exc + class LLMSettings(BaseModel): base_url: str | None = Field(default=None) api_key: str | None = Field(default=None) @@ -87,16 +87,6 @@ def settings_customise_sources( sources.append(file_secret_settings) return tuple(sources) - @model_validator(mode="after") - def require_api_key(self) -> "DemoQASettings": - if not self.llm.api_key: - env_key = os.getenv("OPENAI_API_KEY") - if env_key: - self.llm.api_key = env_key - if not self.llm.api_key: - raise ValueError("llm.api_key is required. 
Provide it in config or set OPENAI_API_KEY.") - return self - def resolve_config_path(config: Path | None, data_dir: Path | None) -> Path | None: if config is not None: @@ -107,7 +97,8 @@ def resolve_config_path(config: Path | None, data_dir: Path | None) -> Path | No candidate = data_dir / "demo_qa.toml" if candidate.exists(): return candidate - default = Path(__file__).resolve().parent / "demo_qa.toml" + root = Path(__file__).resolve().parent + default = root / "demo_qa.toml" if default.exists(): return default return None diff --git a/tests/test_demo_qa_batch.py b/tests/test_demo_qa_batch.py index 692cd85..fa163a8 100644 --- a/tests/test_demo_qa_batch.py +++ b/tests/test_demo_qa_batch.py @@ -11,7 +11,6 @@ import pytest import examples.demo_qa.batch as batch -from examples.demo_qa.cli import build_parser from examples.demo_qa.batch import ( _consecutive_passes, _fingerprint_dir, @@ -24,6 +23,7 @@ render_markdown, write_results, ) +from examples.demo_qa.cli import build_parser from examples.demo_qa.runner import DiffReport, RunResult, RunTimings, diff_runs from examples.demo_qa.runs.coverage import _missed_case_ids from examples.demo_qa.runs.layout import _latest_markers, _update_latest_markers @@ -546,7 +546,7 @@ def test_explicit_config_path_wins(tmp_path: Path, monkeypatch: pytest.MonkeyPat def test_packaged_default_config_used_when_no_cli_or_data_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key="sk-env") + run_meta = _run_batch_and_meta(tmp_path, monkeypatch, env_api_key=None) config_path = run_meta["inputs"]["config_path"] assert config_path is not None diff --git a/tests/test_demo_qa_settings.py b/tests/test_demo_qa_settings.py index c7ced7b..bd0f9f3 100644 --- a/tests/test_demo_qa_settings.py +++ b/tests/test_demo_qa_settings.py @@ -4,8 +4,6 @@ from pathlib import Path from types import SimpleNamespace -import pytest - from examples.demo_qa.llm.factory import build_llm from examples.demo_qa.llm.openai_adapter import OpenAILLM from examples.demo_qa.settings import load_settings @@ -15,6 +13,20 @@ def write_toml(path: Path, content: str) -> None: path.write_text(content, encoding="utf-8") +def _install_fake_openai(monkeypatch, created: dict): + def _store_and_return(kwargs): + created["chat_kwargs"] = kwargs + return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content="ok"))]) + + class FakeOpenAI: + def __init__(self, api_key=None, base_url=None, **kwargs): + created["api_key"] = api_key + created["base_url"] = base_url + self.chat = SimpleNamespace(completions=SimpleNamespace(create=lambda **kwargs: _store_and_return(kwargs))) + + monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=FakeOpenAI)) + + def test_env_overrides_toml(tmp_path, monkeypatch): config_path = tmp_path / "demo_qa.toml" write_toml( @@ -36,7 +48,9 @@ def test_env_overrides_toml(tmp_path, monkeypatch): assert settings.llm.plan_model == "env-plan" -def test_openai_requires_api_key(tmp_path): + + +def test_allow_missing_api_key_when_disabled(tmp_path): config_path = tmp_path / "demo_qa.toml" write_toml( config_path, @@ -47,8 +61,9 @@ def test_openai_requires_api_key(tmp_path): """, ) - with pytest.raises(ValueError): - load_settings(config_path=config_path) + settings, resolved = load_settings(config_path=config_path) + assert resolved == config_path + assert settings.llm.api_key is None def test_openai_key_from_global_env(tmp_path, monkeypatch): @@ -61,10 +76,15 @@ def 
test_openai_key_from_global_env(tmp_path, monkeypatch): """, ) monkeypatch.setenv("OPENAI_API_KEY", "sk-global") + created = {} + _install_fake_openai(monkeypatch, created) settings, resolved = load_settings(config_path=config_path) assert resolved == config_path - assert settings.llm.api_key == "sk-global" + llm = build_llm(settings) + + llm("hello", sender="generic_plan") + assert created["api_key"] == "sk-global" def test_base_url_passed_to_openai_client(tmp_path, monkeypatch): @@ -82,21 +102,7 @@ def test_base_url_passed_to_openai_client(tmp_path, monkeypatch): created = {} - class FakeOpenAI: - def __init__(self, api_key=None, base_url=None, **kwargs): - created["api_key"] = api_key - created["base_url"] = base_url - self.chat = SimpleNamespace( - completions=SimpleNamespace( - create=lambda **kwargs: _store_and_return(kwargs) - ) - ) - - def _store_and_return(kwargs): - created["chat_kwargs"] = kwargs - return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content="ok"))]) - - monkeypatch.setitem(sys.modules, "openai", SimpleNamespace(OpenAI=FakeOpenAI)) + _install_fake_openai(monkeypatch, created) settings, resolved = load_settings(config_path=config_path) assert resolved == config_path @@ -193,3 +199,51 @@ def with_options(self, **kwargs): "messages": [{"role": "user", "content": "question"}], "temperature": 0.2, } + + +def test_missing_api_key_uses_unused(monkeypatch): + created: dict = {} + _install_fake_openai(monkeypatch, created) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + llm = OpenAILLM( + api_key=None, + base_url=None, + plan_model="demo-plan", + synth_model="demo-synth", + ) + llm("hello", sender="generic_plan") + + assert created["api_key"] == "unused" + + +def test_env_reference_uses_openai_api_key(monkeypatch): + created: dict = {} + _install_fake_openai(monkeypatch, created) + monkeypatch.setenv("OPENAI_API_KEY", "sk-env") + + llm = OpenAILLM( + api_key="env:OPENAI_API_KEY", + base_url=None, + plan_model="demo-plan", + synth_model="demo-synth", + ) + llm("hello", sender="generic_plan") + + assert created["api_key"] == "sk-env" + + +def test_env_reference_defaults_to_unused_when_missing(monkeypatch): + created: dict = {} + _install_fake_openai(monkeypatch, created) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + llm = OpenAILLM( + api_key="env:OPENAI_API_KEY", + base_url=None, + plan_model="demo-plan", + synth_model="demo-synth", + ) + llm("hello", sender="generic_plan") + + assert created["api_key"] == "unused" From a93bd7c7d31dbaf27bc8b89f4ea5e822a95b7a04 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 3 Jan 2026 19:47:52 +0300 Subject: [PATCH 92/92] demo_qa.toml --- examples/demo_qa/demo_qa.toml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/demo_qa/demo_qa.toml diff --git a/examples/demo_qa/demo_qa.toml b/examples/demo_qa/demo_qa.toml new file mode 100644 index 0000000..a1d9ede --- /dev/null +++ b/examples/demo_qa/demo_qa.toml @@ -0,0 +1,8 @@ +[llm] +base_url = "http://localhost:8000/v1" +plan_model = "default" +synth_model = "default" +plan_temperature = 0.0 +synth_temperature = 0.2 +timeout_s = 900 +retries = 2 \ No newline at end of file
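
The change that makes `load_settings` return a `(settings, resolved_config_path)` tuple is easiest to see at a call site. A minimal sketch, assuming the demo extras are installed and the script runs from the repository root; `demo_data` is only a placeholder directory name:

```python
from pathlib import Path

from examples.demo_qa.settings import load_settings

# The second tuple element is the demo_qa.toml that was actually used (or None);
# this is the path the batch code can record and hash into run_meta.json.
settings, config_path = load_settings(config_path=None, data_dir=Path("demo_data"))

print("resolved config:", config_path)
print("plan model:", settings.llm.plan_model)
```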
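
The settings tests rely on the `DEMO_QA_` prefix and the `__` nested delimiter for environment overrides. A standalone sketch of that naming convention with pydantic-settings; `DemoSettingsSketch` and its fields are illustrative stand-ins, not the real `DemoQASettings`, which also layers in the TOML source:

```python
import os

from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class LLMSketch(BaseModel):
    api_key: str | None = None
    plan_model: str = "default"


class DemoSettingsSketch(BaseSettings):
    # DEMO_QA_ prefix plus "__" between nesting levels gives variable names like
    # DEMO_QA_LLM__PLAN_MODEL, matching the ones set in the tests above.
    model_config = SettingsConfigDict(env_prefix="DEMO_QA_", env_nested_delimiter="__")
    llm: LLMSketch = Field(default_factory=LLMSketch)


os.environ["DEMO_QA_LLM__PLAN_MODEL"] = "env-plan"
assert DemoSettingsSketch().llm.plan_model == "env-plan"
```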
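
PATCH 88/92 and 89/92 exist because the test fixture originally wrote a JSON array into `cases.jsonl`; the file is JSON Lines, one object per line. A small sketch of writing and reading such a file; the case ids and questions here are made up:

```python
import json
from pathlib import Path

cases = [
    {"id": "c1", "question": "Q?"},
    {"id": "c2", "question": "Another question?"},
]

path = Path("cases.jsonl")
# One JSON object per line, no surrounding array and no trailing commas.
path.write_text("".join(json.dumps(c, ensure_ascii=False) + "\n" for c in cases), encoding="utf-8")

# Reading mirrors what a JSONL loader has to do: parse line by line, skipping blanks.
loaded = [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()]
assert [c["id"] for c in loaded] == ["c1", "c2"]
```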
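
PATCH 90/92 and the final PATCH 92/92 hinge on the `demo_qa.toml` lookup order in `resolve_config_path`: an explicit `--config` path, then `demo_qa.toml` inside the data directory, then the packaged `examples/demo_qa/demo_qa.toml`. A standalone sketch of that order; the `package_dir` parameter replaces the real function's use of `Path(__file__)`:

```python
from pathlib import Path


def resolve_config_sketch(config: Path | None, data_dir: Path | None, package_dir: Path) -> Path | None:
    if config is not None:
        # An explicit --config always wins, even when a default config also exists.
        return config
    if data_dir is not None:
        candidate = data_dir / "demo_qa.toml"
        if candidate.exists():
            return candidate
    # Fall back to the config shipped next to the demo package, if any.
    default = package_dir / "demo_qa.toml"
    return default if default.exists() else None
```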
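
`run_meta.json` stores both `config_path` and `config_hash`, the latter produced by `batch._hash_file`. The patches do not show the hashing algorithm, so the SHA-256 implementation below is only an assumption about how such a helper could look:

```python
import hashlib
from pathlib import Path


def hash_file_sketch(path: Path) -> str:
    # Assumed behaviour: digest over the raw file bytes, streamed in chunks.
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        for chunk in iter(lambda: fh.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


# e.g. comparing a recorded hash against a freshly computed one:
# assert run_meta["inputs"]["config_hash"] == hash_file_sketch(Path(run_meta["inputs"]["config_path"]))
```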
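
PATCH 91/92 changes `_resolve_api_key` so that a missing key no longer raises. The fallback chain, extracted into a standalone function for illustration (the real code lives inside `OpenAILLM`):

```python
import os


def resolve_api_key_sketch(api_key: str | None) -> str:
    if api_key:
        if api_key.startswith("env:"):
            # "env:SOME_VAR" defers to that environment variable; an unset
            # variable now degrades to "unused" instead of raising.
            env_var = api_key.split(":", 1)[1]
            return os.getenv(env_var) or "unused"
        return api_key
    # No key in the config: try OPENAI_API_KEY, then fall back to the
    # "unused" placeholder that key-agnostic local proxies accept.
    return os.getenv("OPENAI_API_KEY") or "unused"
```

This is the behaviour the tests `test_missing_api_key_uses_unused`, `test_env_reference_uses_openai_api_key` and `test_env_reference_defaults_to_unused_when_missing` pin down.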