🤖 ci: update terminal-bench to Opus 4.5 and GPT 5.2 (#1156)

ammar-agent · web-flow · commit 9f4c41e140f4 · 2025-12-14T14:13:08.000-06:00
Update nightly benchmark models:
- `anthropic:claude-sonnet-4-5` → `anthropic:claude-opus-4-5`
- `openai:gpt-5.1-codex` → `openai:gpt-5.2`

### Recent Trends for the Previous Models (last 5 days)

| Date | Claude Sonnet 4.5 | GPT-5.1-codex |
|------|-------------------|---------------|
| Dec 14 | **42.5%** | **31.25%** |
| Dec 13 | 37.5% | 30.0% |
| Dec 12 | 36.25% | 28.75% |
| Dec 11 | 36.25% | 28.75% |
| Dec 10 | 35.0% | 26.25% |

---
_Generated with `mux` • Model: `anthropic:claude-opus-4-5` • Thinking: `high`_
diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
@@ -23,7 +23,7 @@ jobs:
         id: set-models
         run: |
           if [ "${{ inputs.models }}" = "all" ] || [ -z "${{ inputs.models }}" ]; then
-            echo 'models=["anthropic:claude-sonnet-4-5","openai:gpt-5.1-codex"]' >> $GITHUB_OUTPUT
+            echo 'models=["anthropic:claude-opus-4-5","openai:gpt-5.2"]' >> $GITHUB_OUTPUT
           else
             # Convert comma-separated to JSON array
             models="${{ inputs.models }}"
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
@@ -4,7 +4,7 @@ on:
   workflow_call:
     inputs:
       model_name:
-        description: "Model to use (e.g., anthropic:claude-sonnet-4-5)"
+        description: "Model to use (e.g., anthropic:claude-opus-4-5)"
         required: false
         type: string
       thinking_level:
@@ -61,7 +61,7 @@ on:
         required: false
         type: string
       model_name:
-        description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5.1-codex)"
+        description: "Model to use (e.g., anthropic:claude-opus-4-5, openai:gpt-5.2)"
         required: false
         type: string
       thinking_level: