From bfce4dc2c80d194d6a5e9e0cdd31dbfa8fb09753 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:28:21 +0000
Subject: [PATCH 01/13] Run all non-integration tests in CI

- Add pytest markers for integration and slow tests
- Update CI to run all tests except those marked as integration
- Previously only ran test_models.py
---
 .github/workflows/test.yml | 2 +-
 pyproject.toml             | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 53db06d..42981b0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -22,4 +22,4 @@ jobs:
         run: uv sync --extra dev
 
       - name: Run tests
-        run: uv run pytest tests/test_models.py -v
+        run: uv run pytest -v -m "not integration"
diff --git a/pyproject.toml b/pyproject.toml
index 175a3e5..27eb310 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,10 @@ ignore = ["E501"]  # Tool descriptions need to be long
 testpaths = ["tests"]
 pythonpath = ["."]
 asyncio_mode = "auto"
+markers = [
+    "integration: tests that require external services (API keys, databases)",
+    "slow: tests that take a long time to run",
+]
 
 [dependency-groups]
 dev = [

From 4289be2e45a83120a50984517c805f4d93f75183 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:32:59 +0000
Subject: [PATCH 02/13] Redesign policy chat UI with structured tool display
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Show tool calls as collapsible cards with parameters
- Add progress indicator (Search → Create policy → Analysis → Complete)
- Display API requests with method badges (GET/POST)
- Make tool results expandable
- Show assistant reasoning in muted italic
- Parse log messages into structured UI elements
- Professional green header with status indicator
- Improved example questions with hover effects
---
 docs/src/components/policy-chat.tsx   | 673 ++++++++++++++++++--------
 src/policyengine_api/agent_sandbox.py |  20 +
 2 files changed, 495 insertions(+), 198 deletions(-)

diff --git a/docs/src/components/policy-chat.tsx b/docs/src/components/policy-chat.tsx
index 2a9865b..0147753 100644
--- a/docs/src/components/policy-chat.tsx
+++ b/docs/src/components/policy-chat.tsx
@@ -1,6 +1,6 @@
 "use client";
 
-import { useState, useRef, useEffect } from "react";
+import { useState, useRef, useEffect, useMemo } from "react";
 import ReactMarkdown from "react-markdown";
 import remarkBreaks from "remark-breaks";
 import { useApi } from "./api-context";
@@ -16,6 +16,325 @@ interface LogEntry {
   message: string;
 }
 
+interface ParsedStep {
+  type: "agent" | "tool_use" | "api_call" | "api_response" | "tool_result" | "assistant" | "unknown";
+  title: string;
+  content: string;
+  method?: string;
+  url?: string;
+  statusCode?: number;
+  toolName?: string;
+  params?: Record<string, unknown>;
+  isExpanded?: boolean;
+}
+
+function parseLogEntry(message: string): ParsedStep {
+  // [AGENT] messages
+  if (message.startsWith("[AGENT]")) {
+    const content = message.replace("[AGENT] ", "");
+    return {
+      type: "agent",
+      title: "Agent",
+      content: content,
+    };
+  }
+
+  // [TOOL_USE] tool_name: {...}
+  if (message.startsWith("[TOOL_USE]")) {
+    const content = message.replace("[TOOL_USE] ", "");
+    const colonIndex = content.indexOf(":");
+    if (colonIndex > -1) {
+      const toolName = content.slice(0, colonIndex).trim();
+      const paramsStr = content.slice(colonIndex + 1).trim();
+      let params: Record<string, unknown> = {};
+      try {
+        params = JSON.parse(paramsStr);
+      } catch {
+        // Not valid JSON
+      }
+      // Clean up tool name for display
+      const displayName = toolName
+        .replace(/_/g, " ")
+        .replace(/parameters get$/, "")
+        .replace(/parameters post$/, "")
+        .replace(/household calculate post$/, "Calculate household")
+        .replace(/list /g, "Search ");
+      return {
+        type: "tool_use",
+        title: displayName,
+        content: paramsStr,
+        toolName,
+        params,
+      };
+    }
+  }
+
+  // [API] GET/POST url
+  if (message.startsWith("[API]")) {
+    const content = message.replace("[API] ", "");
+
+    // Check if it's a response
+    if (content.startsWith("Response:")) {
+      const statusCode = parseInt(content.replace("Response: ", ""), 10);
+      return {
+        type: "api_response",
+        title: "Response",
+        content: content,
+        statusCode,
+      };
+    }
+
+    // Check if it's a request with method
+    const methodMatch = content.match(/^(GET|POST|PUT|PATCH|DELETE)\s+(.+)$/);
+    if (methodMatch) {
+      return {
+        type: "api_call",
+        title: "API Request",
+        content: content,
+        method: methodMatch[1],
+        url: methodMatch[2],
+      };
+    }
+
+    // Query or Body
+    if (content.startsWith("Query:") || content.startsWith("Body:")) {
+      return {
+        type: "api_call",
+        title: content.startsWith("Query:") ? "Query params" : "Request body",
+        content: content.replace(/^(Query|Body):\s*/, ""),
+      };
+    }
+  }
+
+  // [TOOL_RESULT] ...
+  if (message.startsWith("[TOOL_RESULT]")) {
+    const content = message.replace("[TOOL_RESULT] ", "");
+    return {
+      type: "tool_result",
+      title: "Result",
+      content: content,
+    };
+  }
+
+  // [ASSISTANT] ...
+  if (message.startsWith("[ASSISTANT]")) {
+    const content = message.replace("[ASSISTANT] ", "");
+    return {
+      type: "assistant",
+      title: "Thinking",
+      content: content,
+    };
+  }
+
+  return {
+    type: "unknown",
+    title: "Log",
+    content: message,
+  };
+}
+
+function ToolCard({ step, isLast }: { step: ParsedStep; isLast: boolean }) {
+  const [isExpanded, setIsExpanded] = useState(false);
+
+  if (step.type === "agent") {
+    return (
+      <div className="flex items-start gap-3 py-2">
+        <div className="w-1.5 h-1.5 mt-2 rounded-full bg-[var(--color-pe-green)] animate-pulse" />
+        <span className="text-sm text-[var(--color-text-secondary)]">{step.content}</span>
+      </div>
+    );
+  }
+
+  if (step.type === "tool_use") {
+    return (
+      <div className="relative pl-4 border-l-2 border-[var(--color-pe-green)] py-3 animate-fadeIn">
+        <div className="absolute -left-[7px] top-4 w-3 h-3 rounded-full bg-[var(--color-pe-green)] border-2 border-white" />
+        <div className="bg-white rounded-lg border border-[var(--color-border)] shadow-sm overflow-hidden">
+          <button
+            onClick={() => setIsExpanded(!isExpanded)}
+            className="w-full flex items-center justify-between p-3 hover:bg-[var(--color-surface-sunken)] text-left"
+          >
+            <div className="flex items-center gap-2">
+              <svg className="w-4 h-4 text-[var(--color-pe-green)]" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M10.325 4.317c.426-1.756 2.924-1.756 3.35 0a1.724 1.724 0 002.573 1.066c1.543-.94 3.31.826 2.37 2.37a1.724 1.724 0 001.065 2.572c1.756.426 1.756 2.924 0 3.35a1.724 1.724 0 00-1.066 2.573c.94 1.543-.826 3.31-2.37 2.37a1.724 1.724 0 00-2.572 1.065c-.426 1.756-2.924 1.756-3.35 0a1.724 1.724 0 00-2.573-1.066c-1.543.94-3.31-.826-2.37-2.37a1.724 1.724 0 00-1.065-2.572c-1.756-.426-1.756-2.924 0-3.35a1.724 1.724 0 001.066-2.573c-.94-1.543.826-3.31 2.37-2.37.996.608 2.296.07 2.572-1.065z" />
+                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M15 12a3 3 0 11-6 0 3 3 0 016 0z" />
+              </svg>
+              <span className="font-medium text-sm text-[var(--color-text-primary)] capitalize">
+                {step.title}
+              </span>
+            </div>
+            <svg
+              className={`w-4 h-4 text-[var(--color-text-muted)] transition-transform ${isExpanded ? "rotate-180" : ""}`}
+              fill="none"
+              viewBox="0 0 24 24"
+              stroke="currentColor"
+            >
+              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
+            </svg>
+          </button>
+          {isExpanded && step.params && Object.keys(step.params).length > 0 && (
+            <div className="px-3 pb-3 border-t border-[var(--color-border)]">
+              <div className="mt-2 font-mono text-xs bg-[var(--color-surface-sunken)] rounded p-2 overflow-x-auto">
+                {Object.entries(step.params).map(([key, value]) => (
+                  <div key={key} className="flex gap-2">
+                    <span className="text-[var(--color-pe-green)]">{key}:</span>
+                    <span className="text-[var(--color-text-secondary)]">
+                      {typeof value === "string" ? `"${value}"` : JSON.stringify(value)}
+                    </span>
+                  </div>
+                ))}
+              </div>
+            </div>
+          )}
+        </div>
+        {isLast && (
+          <div className="absolute left-0 top-full h-4 w-0.5 bg-gradient-to-b from-[var(--color-pe-green)] to-transparent -translate-x-[1px]" />
+        )}
+      </div>
+    );
+  }
+
+  if (step.type === "api_call" && step.method) {
+    const methodColors: Record<string, string> = {
+      GET: "bg-blue-100 text-blue-700",
+      POST: "bg-green-100 text-green-700",
+      PUT: "bg-amber-100 text-amber-700",
+      DELETE: "bg-red-100 text-red-700",
+    };
+    return (
+      <div className="pl-4 border-l-2 border-[var(--color-border)] py-1 ml-1 animate-fadeIn">
+        <div className="flex items-center gap-2 text-xs">
+          <span className={`px-1.5 py-0.5 rounded font-mono font-medium ${methodColors[step.method] || "bg-gray-100 text-gray-700"}`}>
+            {step.method}
+          </span>
+          <span className="font-mono text-[var(--color-text-muted)] truncate max-w-[300px]">
+            {step.url?.replace("https://v2.api.policyengine.org", "")}
+          </span>
+        </div>
+      </div>
+    );
+  }
+
+  if (step.type === "api_response") {
+    const isSuccess = step.statusCode && step.statusCode < 400;
+    return (
+      <div className="pl-4 border-l-2 border-[var(--color-border)] py-1 ml-1 animate-fadeIn">
+        <div className="flex items-center gap-2 text-xs">
+          <span className={`px-1.5 py-0.5 rounded font-mono font-medium ${isSuccess ? "bg-green-100 text-green-700" : "bg-red-100 text-red-700"}`}>
+            {step.statusCode}
+          </span>
+          <span className="text-[var(--color-text-muted)]">
+            {isSuccess ? "Success" : "Error"}
+          </span>
+        </div>
+      </div>
+    );
+  }
+
+  if (step.type === "tool_result") {
+    return (
+      <div className="pl-4 border-l-2 border-[var(--color-border)] py-2 ml-1 animate-fadeIn">
+        <button
+          onClick={() => setIsExpanded(!isExpanded)}
+          className="flex items-center gap-2 text-xs text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]"
+        >
+          <svg className={`w-3 h-3 transition-transform ${isExpanded ? "rotate-90" : ""}`} fill="none" viewBox="0 0 24 24" stroke="currentColor">
+            <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
+          </svg>
+          <span>View result data</span>
+        </button>
+        {isExpanded && (
+          <div className="mt-2 font-mono text-xs bg-[var(--color-code-bg)] text-[var(--color-code-text)] rounded p-3 overflow-x-auto max-h-48 overflow-y-auto">
+            <pre className="whitespace-pre-wrap">{step.content.slice(0, 2000)}{step.content.length > 2000 ? "\n..." : ""}</pre>
+          </div>
+        )}
+      </div>
+    );
+  }
+
+  if (step.type === "assistant") {
+    return (
+      <div className="flex items-start gap-3 py-2 pl-5 animate-fadeIn">
+        <div className="w-1 h-1 mt-2 rounded-full bg-[var(--color-text-muted)]" />
+        <span className="text-sm text-[var(--color-text-muted)] italic">{step.content}</span>
+      </div>
+    );
+  }
+
+  return null;
+}
+
+function ProgressIndicator({ logs }: { logs: LogEntry[] }) {
+  const stages = useMemo(() => {
+    const hasSearch = logs.some(l => l.message.includes("parameters"));
+    const hasPolicy = logs.some(l => l.message.includes("policies"));
+    const hasAnalysis = logs.some(l => l.message.includes("analysis") || l.message.includes("economic"));
+    const hasHousehold = logs.some(l => l.message.includes("household"));
+    const isComplete = logs.some(l => l.message.includes("Completed"));
+
+    if (hasAnalysis) {
+      return [
+        { label: "Search", done: hasSearch },
+        { label: "Create policy", done: hasPolicy },
+        { label: "Run analysis", done: isComplete, active: !isComplete },
+        { label: "Complete", done: isComplete },
+      ];
+    }
+
+    if (hasHousehold) {
+      return [
+        { label: "Build household", done: true },
+        { label: "Calculate", done: isComplete, active: !isComplete },
+        { label: "Complete", done: isComplete },
+      ];
+    }
+
+    return [
+      { label: "Search", done: hasSearch, active: !hasSearch && logs.length > 0 },
+      { label: "Retrieve", done: logs.length > 3, active: hasSearch && logs.length <= 3 },
+      { label: "Complete", done: isComplete },
+    ];
+  }, [logs]);
+
+  if (logs.length === 0) return null;
+
+  return (
+    <div className="flex items-center gap-1 mb-4 px-1">
+      {stages.map((stage, i) => (
+        <div key={stage.label} className="flex items-center">
+          <div className="flex items-center gap-1.5">
+            <div
+              className={`w-2 h-2 rounded-full transition-all duration-300 ${
+                stage.done
+                  ? "bg-[var(--color-pe-green)]"
+                  : stage.active
+                  ? "bg-[var(--color-pe-green)] animate-pulse"
+                  : "bg-[var(--color-border)]"
+              }`}
+            />
+            <span
+              className={`text-xs font-medium transition-colors ${
+                stage.done || stage.active
+                  ? "text-[var(--color-text-primary)]"
+                  : "text-[var(--color-text-muted)]"
+              }`}
+            >
+              {stage.label}
+            </span>
+          </div>
+          {i < stages.length - 1 && (
+            <div
+              className={`w-8 h-px mx-2 transition-colors ${
+                stage.done ? "bg-[var(--color-pe-green)]" : "bg-[var(--color-border)]"
+              }`}
+            />
+          )}
+        </div>
+      ))}
+    </div>
+  );
+}
+
 export function PolicyChat() {
   const { baseUrl } = useApi();
   const [messages, setMessages] = useState<Message[]>([]);
@@ -34,7 +353,6 @@ export function PolicyChat() {
     scrollToBottom();
   }, [messages, logs]);
 
-  // Cleanup polling on unmount
   useEffect(() => {
     return () => {
       if (pollIntervalRef.current) {
@@ -43,20 +361,21 @@ export function PolicyChat() {
     };
   }, []);
 
+  const parsedSteps = useMemo(() => {
+    return logs
+      .map(log => parseLogEntry(log.message))
+      .filter(step => step.type !== "unknown");
+  }, [logs]);
+
   const pollLogs = async (id: string) => {
     try {
       const res = await fetch(`${baseUrl}/agent/logs/${id}`);
-      if (!res.ok) {
-        console.error("Failed to fetch logs:", res.status);
-        return;
-      }
+      if (!res.ok) return;
 
       const data = await res.json();
       setLogs(data.logs || []);
 
-      // Check if completed or failed
       if (data.status === "completed" || data.status === "failed") {
-        // Stop polling
         if (pollIntervalRef.current) {
           clearInterval(pollIntervalRef.current);
           pollIntervalRef.current = null;
@@ -65,39 +384,16 @@ export function PolicyChat() {
         setIsLoading(false);
         setCallId(null);
 
-        // Extract final result from logs or result field
         let finalContent = "";
         if (data.result?.result) {
           finalContent = data.result.result;
         } else {
-          // Try to extract from logs - look for [CLAUDE] lines with result
-          const claudeLogs = data.logs
-            .map((l: LogEntry) => l.message)
-            .filter((m: string) => m.startsWith("[CLAUDE]"))
-            .map((m: string) => m.replace("[CLAUDE] ", ""));
-
-          // Try to parse the last few lines for result
-          for (const log of claudeLogs.reverse()) {
-            try {
-              const event = JSON.parse(log);
-              if (event.type === "result" && event.result) {
-                finalContent = event.result;
-                break;
-              }
-            } catch {
-              // Not JSON, skip
-            }
-          }
-
-          if (!finalContent) {
-            finalContent =
-              data.status === "completed"
-                ? "Analysis completed. Check logs for details."
-                : "Analysis failed. Check logs for errors.";
-          }
+          finalContent =
+            data.status === "completed"
+              ? "Analysis completed. Check the steps above for details."
+              : "Analysis failed. Please try again.";
         }
 
-        // Update assistant message with final content
         setMessages((prev) => {
           const newMessages = [...prev];
           const lastIndex = newMessages.length - 1;
@@ -126,38 +422,30 @@ export function PolicyChat() {
     setLogs([]);
     setCallId(null);
 
-    // Stop any existing polling
     if (pollIntervalRef.current) {
       clearInterval(pollIntervalRef.current);
       pollIntervalRef.current = null;
     }
 
-    // Add user message
     setMessages((prev) => [...prev, { role: "user", content: userMessage }]);
-
-    // Add pending assistant message
     setMessages((prev) => [
       ...prev,
       { role: "assistant", content: "", status: "pending" },
     ]);
 
     try {
-      // Start the agent
       const res = await fetch(`${baseUrl}/agent/run`, {
         method: "POST",
         headers: { "Content-Type": "application/json" },
         body: JSON.stringify({ question: userMessage }),
       });
 
-      if (!res.ok) {
-        throw new Error(`HTTP ${res.status}`);
-      }
+      if (!res.ok) throw new Error(`HTTP ${res.status}`);
 
       const data = await res.json();
       const newCallId = data.call_id;
       setCallId(newCallId);
 
-      // Update to running status
       setMessages((prev) => {
         const newMessages = [...prev];
         const lastIndex = newMessages.length - 1;
@@ -170,12 +458,10 @@ export function PolicyChat() {
         return newMessages;
       });
 
-      // Start polling for logs
       pollIntervalRef.current = setInterval(() => {
         pollLogs(newCallId);
       }, 1000);
 
-      // Initial poll
       pollLogs(newCallId);
     } catch (err) {
       setMessages((prev) => {
@@ -194,195 +480,186 @@ export function PolicyChat() {
     }
   };
 
-  // Parse log message to extract useful info
-  const parseLogMessage = (message: string): { type: string; content: string } => {
-    if (message.startsWith("[AGENT]")) {
-      return { type: "agent", content: message.replace("[AGENT] ", "") };
-    }
-    if (message.startsWith("[CLAUDE]")) {
-      const claudeContent = message.replace("[CLAUDE] ", "");
-      // Try to parse as JSON
-      try {
-        const event = JSON.parse(claudeContent);
-        if (event.type === "assistant" && event.message?.content) {
-          const textParts = event.message.content
-            .filter((c: { type: string }) => c.type === "text")
-            .map((c: { text: string }) => c.text)
-            .join("");
-          if (textParts) {
-            return { type: "text", content: textParts };
-          }
-          const toolParts = event.message.content
-            .filter((c: { type: string }) => c.type === "tool_use")
-            .map((c: { name: string }) => c.name);
-          if (toolParts.length > 0) {
-            return { type: "tool", content: `Using: ${toolParts.join(", ")}` };
-          }
-        }
-        if (event.type === "system" && event.subtype === "init") {
-          const mcpStatus = event.mcp_servers?.find(
-            (s: { name: string }) => s.name === "policyengine"
-          );
-          return {
-            type: "system",
-            content: mcpStatus?.status === "connected" ? "MCP connected" : "Starting...",
-          };
-        }
-        if (event.type === "result") {
-          return { type: "result", content: "Analysis complete" };
-        }
-        return { type: "claude", content: `[${event.type || "event"}]` };
-      } catch {
-        return { type: "claude", content: claudeContent.slice(0, 100) };
-      }
-    }
-    return { type: "log", content: message.slice(0, 100) };
-  };
-
   const exampleQuestions = [
-    "How much would it cost to set the UK basic income tax rate to 19p?",
-    "What would happen if we doubled child benefit?",
-    "Calculate tax for a UK household earning 50,000",
-    "What is the budgetary impact of abolishing the higher rate of income tax?",
-    "What benefits would a single parent with two children receive in California?",
+    "What is the UK personal allowance for 2026?",
+    "Calculate tax for someone earning £50,000 in the UK",
+    "What would happen if we increased child benefit by 10%?",
+    "What benefits would a single parent with two children receive?",
   ];
 
   return (
-    <div className="border border-[var(--color-border)] rounded-xl overflow-hidden bg-white flex flex-col h-[600px]">
+    <div className="border border-[var(--color-border)] rounded-2xl overflow-hidden bg-white flex flex-col h-[700px] shadow-sm">
       {/* Header */}
-      <div className="p-4 border-b border-[var(--color-border)] bg-[var(--color-surface)]">
-        <div className="flex items-center gap-2">
-          <div
-            className={`w-2 h-2 rounded-full ${
-              isLoading ? "bg-amber-400 animate-pulse" : "bg-gray-300"
-            }`}
-          />
-          <span className="text-sm font-medium text-[var(--color-text-primary)] font-mono">
-            Policy analyst
-          </span>
-          <span className="text-xs text-[var(--color-text-muted)] ml-auto font-mono">
-            Powered by Claude Code + MCP
-          </span>
+      <div className="px-5 py-4 border-b border-[var(--color-border)] bg-gradient-to-r from-[var(--color-pe-green)] to-[var(--color-pe-green-dark)]">
+        <div className="flex items-center justify-between">
+          <div className="flex items-center gap-3">
+            <div className="w-8 h-8 rounded-lg bg-white/20 flex items-center justify-center">
+              <svg className="w-5 h-5 text-white" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={1.5} d="M9.75 17L9 20l-1 1h8l-1-1-.75-3M3 13h18M5 17h14a2 2 0 002-2V5a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z" />
+              </svg>
+            </div>
+            <div>
+              <h2 className="text-white font-semibold">Policy analyst</h2>
+              <p className="text-white/70 text-xs">Ask questions about UK and US tax-benefit policy</p>
+            </div>
+          </div>
+          <div className="flex items-center gap-2">
+            <div className={`w-2 h-2 rounded-full ${isLoading ? "bg-amber-300 animate-pulse" : "bg-green-300"}`} />
+            <span className="text-white/70 text-xs font-medium">
+              {isLoading ? "Working..." : "Ready"}
+            </span>
+          </div>
         </div>
-        <p className="text-xs text-[var(--color-text-muted)] mt-1 font-mono">
-          Ask natural language questions about UK or US tax and benefit policy
-        </p>
       </div>
 
       {/* Messages */}
-      <div className="flex-1 overflow-y-auto p-4 space-y-4">
-        {messages.length === 0 && (
-          <div className="text-center py-8">
-            <p className="text-sm text-[var(--color-text-muted)] mb-4 font-mono">
-              Try asking a question like:
-            </p>
-            <div className="space-y-2">
+      <div className="flex-1 overflow-y-auto p-5">
+        {messages.length === 0 ? (
+          <div className="h-full flex flex-col justify-center">
+            <div className="text-center mb-8">
+              <h3 className="font-display text-2xl text-[var(--color-text-primary)] mb-2">
+                What would you like to know?
+              </h3>
+              <p className="text-sm text-[var(--color-text-muted)]">
+                Ask about tax rates, benefits, or policy impacts
+              </p>
+            </div>
+            <div className="grid gap-2 max-w-lg mx-auto">
               {exampleQuestions.map((q, i) => (
                 <button
                   key={i}
                   onClick={() => setInput(q)}
-                  className="block w-full text-left p-3 rounded-lg bg-[var(--color-surface-sunken)] text-sm text-[var(--color-text-secondary)] hover:bg-[var(--color-surface)] transition-colors font-mono"
+                  className="text-left p-4 rounded-xl bg-[var(--color-surface-sunken)] hover:bg-[var(--color-surface)] border border-transparent hover:border-[var(--color-border)] text-sm text-[var(--color-text-secondary)] transition-all group"
                 >
-                  {q}
+                  <span className="group-hover:text-[var(--color-pe-green)] transition-colors">{q}</span>
                 </button>
               ))}
             </div>
           </div>
-        )}
-
-        {messages.map((message, i) => (
-          <div
-            key={i}
-            className={`flex ${message.role === "user" ? "justify-end" : "justify-start"}`}
-          >
-            <div
-              className={`max-w-[85%] rounded-xl px-4 py-3 ${
-                message.role === "user"
-                  ? "bg-[var(--color-pe-green)] text-white"
-                  : "bg-[var(--color-surface-sunken)] text-[var(--color-text-primary)]"
-              }`}
-            >
-              {message.role === "assistant" &&
-              (message.status === "pending" || message.status === "running") ? (
-                <div className="flex items-center gap-2 font-mono">
-                  <div className="w-3 h-3 border-2 border-[var(--color-pe-green)] border-t-transparent rounded-full animate-spin" />
-                  <span className="text-sm">
-                    {message.status === "pending" ? "Starting..." : "Analysing..."}
-                  </span>
-                </div>
-              ) : message.status === "completed" || message.status === "failed" ? (
-                <div className="font-mono prose prose-sm max-w-none text-sm [&>*]:text-[var(--color-text-primary)] [&_code]:bg-[var(--color-surface)] [&_code]:px-1 [&_code]:py-0.5 [&_code]:rounded [&_strong]:font-semibold">
-                  <ReactMarkdown remarkPlugins={[remarkBreaks]}>
-                    {message.content}
-                  </ReactMarkdown>
-                </div>
-              ) : (
-                <div className="text-sm whitespace-pre-wrap font-mono">{message.content}</div>
-              )}
-            </div>
-          </div>
-        ))}
-
-        {/* Live logs */}
-        {isLoading && logs.length > 0 && (
-          <div className="bg-[var(--color-surface-sunken)] rounded-xl p-3 space-y-1 font-mono text-xs max-h-64 overflow-y-auto">
-            <div className="text-xs font-medium text-[var(--color-text-muted)] mb-2 sticky top-0 bg-[var(--color-surface-sunken)]">
-              Live output ({logs.length} entries)
-            </div>
-            {logs.slice(-30).map((log, i) => {
-              const parsed = parseLogMessage(log.message);
-              return (
-                <div
-                  key={i}
-                  className={`flex items-start gap-2 ${
-                    parsed.type === "tool"
-                      ? "text-amber-600"
-                      : parsed.type === "text"
-                      ? "text-[var(--color-text-primary)]"
-                      : parsed.type === "agent"
-                      ? "text-blue-600"
-                      : parsed.type === "system"
-                      ? "text-green-600"
-                      : "text-[var(--color-text-muted)]"
-                  }`}
-                >
-                  <span className="text-[var(--color-text-muted)] select-none shrink-0">
-                    {">"}
-                  </span>
-                  <span className="whitespace-pre-wrap break-words">{parsed.content}</span>
-                </div>
-              );
-            })}
-            <div className="flex items-center gap-2 text-[var(--color-text-muted)]">
-              <span className="select-none">{">"}</span>
-              <span className="inline-block w-2 h-3 bg-[var(--color-pe-green)] animate-pulse" />
-            </div>
+        ) : (
+          <div className="space-y-6">
+            {messages.map((message, i) => (
+              <div key={i}>
+                {message.role === "user" ? (
+                  <div className="flex justify-end">
+                    <div className="max-w-[80%] bg-[var(--color-pe-green)] text-white rounded-2xl rounded-br-md px-4 py-3">
+                      <p className="text-sm">{message.content}</p>
+                    </div>
+                  </div>
+                ) : (
+                  <div className="space-y-3">
+                    {/* Running state with live steps */}
+                    {(message.status === "pending" || message.status === "running") && (
+                      <div className="bg-[var(--color-surface-sunken)] rounded-2xl p-4">
+                        <ProgressIndicator logs={logs} />
+
+                        {message.status === "pending" ? (
+                          <div className="flex items-center gap-3">
+                            <div className="w-5 h-5 border-2 border-[var(--color-pe-green)] border-t-transparent rounded-full animate-spin" />
+                            <span className="text-sm text-[var(--color-text-secondary)]">Starting analysis...</span>
+                          </div>
+                        ) : (
+                          <div className="space-y-1">
+                            {parsedSteps.slice(-15).map((step, j, visible) => (
+                              <ToolCard
+                                key={/* absolute step index: stays stable while the 15-step window slides */ parsedSteps.length - visible.length + j}
+                                step={step}
+                                isLast={j === visible.length - 1 && step.type === "tool_use"}
+                              />
+                            ))}
+                            <div className="flex items-center gap-2 pt-2 pl-5">
+                              <div className="w-2 h-4 bg-[var(--color-pe-green)] animate-pulse rounded-sm" />
+                            </div>
+                          </div>
+                        )}
+                      </div>
+                    )}
+
+                    {/* Completed/failed state */}
+                    {(message.status === "completed" || message.status === "failed") && (
+                      <div className="space-y-4">
+                        {/* Collapsible steps summary: logs hold only the current run, so show it on the latest message only */}
+                        {i === messages.length - 1 && parsedSteps.length > 0 && (
+                          <details className="group">
+                            <summary className="cursor-pointer list-none flex items-center gap-2 text-xs text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]">
+                              <svg className="w-3 h-3 group-open:rotate-90 transition-transform" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+                                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
+                              </svg>
+                              <span>{parsedSteps.filter(s => s.type === "tool_use").length} tool calls executed</span>
+                            </summary>
+                            <div className="mt-3 bg-[var(--color-surface-sunken)] rounded-xl p-4 space-y-1">
+                              {parsedSteps.map((step, j) => (
+                                <ToolCard key={j} step={step} isLast={false} />
+                              ))}
+                            </div>
+                          </details>
+                        )}
+
+                        {/* Final response */}
+                        <div className={`rounded-2xl rounded-bl-md px-5 py-4 ${
+                          message.status === "failed"
+                            ? "bg-red-50 border border-red-200"
+                            : "bg-white border border-[var(--color-border)]"
+                        }`}>
+                          <div className="prose prose-sm max-w-none text-[var(--color-text-primary)] [&_strong]:font-semibold [&_code]:bg-[var(--color-surface-sunken)] [&_code]:px-1.5 [&_code]:py-0.5 [&_code]:rounded [&_code]:text-sm [&_code]:font-mono [&_h1]:text-lg [&_h2]:text-base [&_h3]:text-sm [&_ul]:my-2 [&_li]:my-0.5">
+                            <ReactMarkdown remarkPlugins={[remarkBreaks]}>
+                              {message.content}
+                            </ReactMarkdown>
+                          </div>
+                        </div>
+                      </div>
+                    )}
+                  </div>
+                )}
+              </div>
+            ))}
+            <div ref={messagesEndRef} />
           </div>
         )}
-
-        <div ref={messagesEndRef} />
       </div>
 
       {/* Input */}
-      <form onSubmit={handleSubmit} className="p-4 border-t border-[var(--color-border)]">
-        <div className="flex gap-2">
+      <form onSubmit={handleSubmit} className="p-4 border-t border-[var(--color-border)] bg-[var(--color-surface)]">
+        <div className="flex gap-3">
           <input
             type="text"
             value={input}
             onChange={(e) => setInput(e.target.value)}
             placeholder="Ask a policy question..."
             disabled={isLoading}
-            className="flex-1 px-4 py-2 text-sm border border-[var(--color-border)] rounded-lg focus:outline-none focus:ring-2 focus:ring-[var(--color-pe-green)] disabled:opacity-50 font-mono"
+            className="flex-1 px-4 py-3 text-sm border border-[var(--color-border)] rounded-xl bg-white focus:outline-none focus:ring-2 focus:ring-[var(--color-pe-green)] focus:border-transparent disabled:opacity-50 placeholder:text-[var(--color-text-muted)]"
           />
           <button
             type="submit"
             disabled={isLoading || !input.trim()}
-            className="px-4 py-2 bg-[var(--color-pe-green)] text-white rounded-lg text-sm font-medium hover:bg-[var(--color-pe-green-dark)] disabled:opacity-50 disabled:cursor-not-allowed transition-colors font-mono"
+            className="px-6 py-3 bg-[var(--color-pe-green)] hover:bg-[var(--color-pe-green-dark)] text-white rounded-xl text-sm font-medium disabled:opacity-50 disabled:cursor-not-allowed transition-colors flex items-center gap-2"
           >
-            {isLoading ? "..." : "Ask"}
+            {isLoading ? (
+              <>
+                <div className="w-4 h-4 border-2 border-white/30 border-t-white rounded-full animate-spin" />
+                <span>Working</span>
+              </>
+            ) : (
+              <>
+                <span>Ask</span>
+                <svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+                  <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M14 5l7 7m0 0l-7 7m7-7H3" />
+                </svg>
+              </>
+            )}
           </button>
         </div>
       </form>
+
+      <style jsx>{`
+        @keyframes fadeIn {
+          from { opacity: 0; transform: translateY(4px); }
+          to { opacity: 1; transform: translateY(0); }
+        }
+        .animate-fadeIn {
+          animation: fadeIn 0.2s ease-out forwards;
+        }
+      `}</style>
     </div>
   );
 }
diff --git a/src/policyengine_api/agent_sandbox.py b/src/policyengine_api/agent_sandbox.py
index 01ac0fe..7560ee7 100644
--- a/src/policyengine_api/agent_sandbox.py
+++ b/src/policyengine_api/agent_sandbox.py
@@ -2,6 +2,7 @@
 
 import json
 import re
+import time
 from typing import Any, Callable
 
 import anthropic
@@ -34,8 +35,27 @@
 3. Be concise but thorough
 4. For UK, amounts are in GBP. For US, amounts are in USD.
 5. Poll async endpoints until status is "completed"
+
+IMPORTANT: When polling async endpoints, ALWAYS use the sleep tool to wait 5-10 seconds between requests.
+Do not poll in a tight loop - this wastes resources and may hit rate limits.
 """
 
+# Sleep tool for polling delays
+SLEEP_TOOL = {
+    "name": "sleep",
+    "description": "Wait for a specified number of seconds. Use this between polling requests to avoid hammering the API.",
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "seconds": {
+                "type": "number",
+                "description": "Number of seconds to sleep (1-60)",
+            }
+        },
+        "required": ["seconds"],
+    },
+}
+
 
 def fetch_openapi_spec(api_base_url: str) -> dict:
     """Fetch and cache OpenAPI spec."""

From e09bba870264201baa24f7c7ca14f360e4ea21fd Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:33:47 +0000
Subject: [PATCH 03/13] Add sleep tool to agent for polling delays

---
 src/policyengine_api/agent_sandbox.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/policyengine_api/agent_sandbox.py b/src/policyengine_api/agent_sandbox.py
index 7560ee7..1e40220 100644
--- a/src/policyengine_api/agent_sandbox.py
+++ b/src/policyengine_api/agent_sandbox.py
@@ -336,6 +336,8 @@ def log(msg: str) -> None:
     claude_tools = [
         {k: v for k, v in t.items() if k != "_meta"} for t in tools
     ]
+    # Add the sleep tool
+    claude_tools.append(SLEEP_TOOL)
 
     client = anthropic.Anthropic()
     messages = [{"role": "user", "content": question}]
@@ -370,11 +372,18 @@ def log(msg: str) -> None:
                 assistant_content.append(block)
 
                 # Execute tool
-                tool = tool_lookup.get(block.name)
-                if tool:
-                    result = execute_api_tool(tool, block.input, api_base_url, log)
+                if block.name == "sleep":
+                    # Handle sleep tool specially
+                    seconds = min(max(block.input.get("seconds", 5), 1), 60)
+                    log(f"[SLEEP] Waiting {seconds} seconds...")
+                    time.sleep(seconds)
+                    result = f"Slept for {seconds} seconds"
                 else:
-                    result = f"Unknown tool: {block.name}"
+                    tool = tool_lookup.get(block.name)
+                    if tool:
+                        result = execute_api_tool(tool, block.input, api_base_url, log)
+                    else:
+                        result = f"Unknown tool: {block.name}"
 
                 log(f"[TOOL_RESULT] {result[:300]}")
 

From 6bb314ea06de7292a267091151df9e6d51cd6dff Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:40:23 +0000
Subject: [PATCH 04/13] Simplify chat UI: cleaner tool display, better text
 spacing

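- Simplify progress indicator to a single status line
- Remove noisy API call/response details
- Simplify tool cards by removing timeline borders
- Improve paragraph/list spacing in rendered markdown
- Hide redundant agent messages
- Make AGENT_USE_MODAL configurable in docker-compose (defaults to false)
- Run the agent locally in a background thread when Modal is disabled,
  and import modal only when it is actually used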
---
 docker-compose.yml                  |   2 +-
 docs/src/components/policy-chat.tsx | 204 +++++++---------------------
 src/policyengine_api/api/agent.py   |  69 +++++++---
 3 files changed, 100 insertions(+), 175 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index b05c701..60e8645 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,7 +14,7 @@ services:
       API_PORT: ${API_PORT:-8000}
       ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
       POLICYENGINE_API_URL: http://localhost:${API_PORT:-8000}
-      AGENT_USE_MODAL: "false"
+      AGENT_USE_MODAL: ${AGENT_USE_MODAL:-false}
     volumes:
       - ./src:/app/src
       - ./docs/out:/app/docs/out
diff --git a/docs/src/components/policy-chat.tsx b/docs/src/components/policy-chat.tsx
index 0147753..53f9890 100644
--- a/docs/src/components/policy-chat.tsx
+++ b/docs/src/components/policy-chat.tsx
@@ -133,119 +133,68 @@ function parseLogEntry(message: string): ParsedStep {
   };
 }
 
-function ToolCard({ step, isLast }: { step: ParsedStep; isLast: boolean }) {
+function ToolCard({ step }: { step: ParsedStep }) {
   const [isExpanded, setIsExpanded] = useState(false);
 
   if (step.type === "agent") {
-    return (
-      <div className="flex items-start gap-3 py-2">
-        <div className="w-1.5 h-1.5 mt-2 rounded-full bg-[var(--color-pe-green)] animate-pulse" />
-        <span className="text-sm text-[var(--color-text-secondary)]">{step.content}</span>
-      </div>
-    );
+    return null; // Hide agent messages, they're redundant with progress indicator
   }
 
   if (step.type === "tool_use") {
     return (
-      <div className="relative pl-4 border-l-2 border-[var(--color-pe-green)] py-3 animate-fadeIn">
-        <div className="absolute -left-[7px] top-4 w-3 h-3 rounded-full bg-[var(--color-pe-green)] border-2 border-white" />
-        <div className="bg-white rounded-lg border border-[var(--color-border)] shadow-sm overflow-hidden">
-          <button
-            onClick={() => setIsExpanded(!isExpanded)}
-            className="w-full flex items-center justify-between p-3 hover:bg-[var(--color-surface-sunken)] text-left"
-          >
-            <div className="flex items-center gap-2">
-              <svg className="w-4 h-4 text-[var(--color-pe-green)]" fill="none" viewBox="0 0 24 24" stroke="currentColor">
-                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M10.325 4.317c.426-1.756 2.924-1.756 3.35 0a1.724 1.724 0 002.573 1.066c1.543-.94 3.31.826 2.37 2.37a1.724 1.724 0 001.065 2.572c1.756.426 1.756 2.924 0 3.35a1.724 1.724 0 00-1.066 2.573c.94 1.543-.826 3.31-2.37 2.37a1.724 1.724 0 00-2.572 1.065c-.426 1.756-2.924 1.756-3.35 0a1.724 1.724 0 00-2.573-1.066c-1.543.94-3.31-.826-2.37-2.37a1.724 1.724 0 00-1.065-2.572c-1.756-.426-1.756-2.924 0-3.35a1.724 1.724 0 001.066-2.573c-.94-1.543.826-3.31 2.37-2.37.996.608 2.296.07 2.572-1.065z" />
-                <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M15 12a3 3 0 11-6 0 3 3 0 016 0z" />
-              </svg>
-              <span className="font-medium text-sm text-[var(--color-text-primary)] capitalize">
-                {step.title}
-              </span>
-            </div>
+      <div className="py-1.5 animate-fadeIn">
+        <button
+          onClick={() => setIsExpanded(!isExpanded)}
+          className="flex items-center gap-2 text-sm hover:text-[var(--color-pe-green)] transition-colors"
+        >
+          <span className="w-1.5 h-1.5 rounded-full bg-[var(--color-pe-green)]" />
+          <span className="text-[var(--color-text-secondary)] capitalize">{step.title}</span>
+          {step.params && Object.keys(step.params).length > 0 && (
             <svg
-              className={`w-4 h-4 text-[var(--color-text-muted)] transition-transform ${isExpanded ? "rotate-180" : ""}`}
+              className={`w-3 h-3 text-[var(--color-text-muted)] transition-transform ${isExpanded ? "rotate-90" : ""}`}
               fill="none"
               viewBox="0 0 24 24"
               stroke="currentColor"
             >
-              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
+              <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
             </svg>
-          </button>
-          {isExpanded && step.params && Object.keys(step.params).length > 0 && (
-            <div className="px-3 pb-3 border-t border-[var(--color-border)]">
-              <div className="mt-2 font-mono text-xs bg-[var(--color-surface-sunken)] rounded p-2 overflow-x-auto">
-                {Object.entries(step.params).map(([key, value]) => (
-                  <div key={key} className="flex gap-2">
-                    <span className="text-[var(--color-pe-green)]">{key}:</span>
-                    <span className="text-[var(--color-text-secondary)]">
-                      {typeof value === "string" ? `"${value}"` : JSON.stringify(value)}
-                    </span>
-                  </div>
-                ))}
-              </div>
-            </div>
           )}
-        </div>
-        {isLast && (
-          <div className="absolute left-0 top-full h-4 w-0.5 bg-gradient-to-b from-[var(--color-pe-green)] to-transparent -translate-x-[1px]" />
+        </button>
+        {isExpanded && step.params && Object.keys(step.params).length > 0 && (
+          <div className="ml-3.5 mt-1 font-mono text-xs text-[var(--color-text-muted)] bg-[var(--color-surface)] rounded px-2 py-1.5">
+            {Object.entries(step.params).map(([key, value]) => (
+              <div key={key}>
+                <span className="text-[var(--color-pe-green)]">{key}</span>
+                <span className="text-[var(--color-text-muted)]">: </span>
+                <span>{typeof value === "string" ? `"${value}"` : JSON.stringify(value)}</span>
+              </div>
+            ))}
+          </div>
         )}
       </div>
     );
   }
 
-  if (step.type === "api_call" && step.method) {
-    const methodColors: Record<string, string> = {
-      GET: "bg-blue-100 text-blue-700",
-      POST: "bg-green-100 text-green-700",
-      PUT: "bg-amber-100 text-amber-700",
-      DELETE: "bg-red-100 text-red-700",
-    };
-    return (
-      <div className="pl-4 border-l-2 border-[var(--color-border)] py-1 ml-1 animate-fadeIn">
-        <div className="flex items-center gap-2 text-xs">
-          <span className={`px-1.5 py-0.5 rounded font-mono font-medium ${methodColors[step.method] || "bg-gray-100 text-gray-700"}`}>
-            {step.method}
-          </span>
-          <span className="font-mono text-[var(--color-text-muted)] truncate max-w-[300px]">
-            {step.url?.replace("https://v2.api.policyengine.org", "")}
-          </span>
-        </div>
-      </div>
-    );
-  }
-
-  if (step.type === "api_response") {
-    const isSuccess = step.statusCode && step.statusCode < 400;
-    return (
-      <div className="pl-4 border-l-2 border-[var(--color-border)] py-1 ml-1 animate-fadeIn">
-        <div className="flex items-center gap-2 text-xs">
-          <span className={`px-1.5 py-0.5 rounded font-mono font-medium ${isSuccess ? "bg-green-100 text-green-700" : "bg-red-100 text-red-700"}`}>
-            {step.statusCode}
-          </span>
-          <span className="text-[var(--color-text-muted)]">
-            {isSuccess ? "Success" : "Error"}
-          </span>
-        </div>
-      </div>
-    );
+  // Hide API details - too noisy
+  if (step.type === "api_call" || step.type === "api_response") {
+    return null;
   }
 
   if (step.type === "tool_result") {
     return (
-      <div className="pl-4 border-l-2 border-[var(--color-border)] py-2 ml-1 animate-fadeIn">
+      <div className="py-1 ml-3.5 animate-fadeIn">
         <button
           onClick={() => setIsExpanded(!isExpanded)}
-          className="flex items-center gap-2 text-xs text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]"
+          className="flex items-center gap-1.5 text-xs text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]"
         >
           <svg className={`w-3 h-3 transition-transform ${isExpanded ? "rotate-90" : ""}`} fill="none" viewBox="0 0 24 24" stroke="currentColor">
             <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
           </svg>
-          <span>View result data</span>
+          <span>Result</span>
         </button>
         {isExpanded && (
-          <div className="mt-2 font-mono text-xs bg-[var(--color-code-bg)] text-[var(--color-code-text)] rounded p-3 overflow-x-auto max-h-48 overflow-y-auto">
-            <pre className="whitespace-pre-wrap">{step.content.slice(0, 2000)}{step.content.length > 2000 ? "\n..." : ""}</pre>
+          <div className="mt-1.5 font-mono text-xs bg-[var(--color-code-bg)] text-[var(--color-code-text)] rounded p-2 overflow-x-auto max-h-32 overflow-y-auto">
+            <pre className="whitespace-pre-wrap">{step.content.slice(0, 1500)}{step.content.length > 1500 ? "\n..." : ""}</pre>
           </div>
         )}
       </div>
@@ -254,9 +203,8 @@ function ToolCard({ step, isLast }: { step: ParsedStep; isLast: boolean }) {
 
   if (step.type === "assistant") {
     return (
-      <div className="flex items-start gap-3 py-2 pl-5 animate-fadeIn">
-        <div className="w-1 h-1 mt-2 rounded-full bg-[var(--color-text-muted)]" />
-        <span className="text-sm text-[var(--color-text-muted)] italic">{step.content}</span>
+      <div className="py-1.5 animate-fadeIn">
+        <p className="text-sm text-[var(--color-text-muted)] leading-relaxed">{step.content}</p>
       </div>
     );
   }
@@ -265,72 +213,29 @@ function ToolCard({ step, isLast }: { step: ParsedStep; isLast: boolean }) {
 }
 
 function ProgressIndicator({ logs }: { logs: LogEntry[] }) {
-  const stages = useMemo(() => {
+  const stage = useMemo(() => {
     const hasSearch = logs.some(l => l.message.includes("parameters"));
     const hasPolicy = logs.some(l => l.message.includes("policies"));
     const hasAnalysis = logs.some(l => l.message.includes("analysis") || l.message.includes("economic"));
     const hasHousehold = logs.some(l => l.message.includes("household"));
     const isComplete = logs.some(l => l.message.includes("Completed"));
 
-    if (hasAnalysis) {
-      return [
-        { label: "Search", done: hasSearch },
-        { label: "Create policy", done: hasPolicy },
-        { label: "Run analysis", done: isComplete, active: !isComplete },
-        { label: "Complete", done: isComplete },
-      ];
-    }
-
-    if (hasHousehold) {
-      return [
-        { label: "Build household", done: true },
-        { label: "Calculate", done: isComplete, active: !isComplete },
-        { label: "Complete", done: isComplete },
-      ];
-    }
-
-    return [
-      { label: "Search", done: hasSearch, active: !hasSearch && logs.length > 0 },
-      { label: "Retrieve", done: logs.length > 3, active: hasSearch && logs.length <= 3 },
-      { label: "Complete", done: isComplete },
-    ];
+    if (isComplete) return "Complete";
+    if (hasAnalysis) return "Running analysis...";
+    if (hasPolicy) return "Creating policy...";
+    if (hasHousehold) return "Calculating...";
+    if (hasSearch) return "Searching parameters...";
+    return "Starting...";
   }, [logs]);
 
   if (logs.length === 0) return null;
 
   return (
-    <div className="flex items-center gap-1 mb-4 px-1">
-      {stages.map((stage, i) => (
-        <div key={stage.label} className="flex items-center">
-          <div className="flex items-center gap-1.5">
-            <div
-              className={`w-2 h-2 rounded-full transition-all duration-300 ${
-                stage.done
-                  ? "bg-[var(--color-pe-green)]"
-                  : stage.active
-                  ? "bg-[var(--color-pe-green)] animate-pulse"
-                  : "bg-[var(--color-border)]"
-              }`}
-            />
-            <span
-              className={`text-xs font-medium transition-colors ${
-                stage.done || stage.active
-                  ? "text-[var(--color-text-primary)]"
-                  : "text-[var(--color-text-muted)]"
-              }`}
-            >
-              {stage.label}
-            </span>
-          </div>
-          {i < stages.length - 1 && (
-            <div
-              className={`w-8 h-px mx-2 transition-colors ${
-                stage.done ? "bg-[var(--color-pe-green)]" : "bg-[var(--color-border)]"
-              }`}
-            />
-          )}
-        </div>
-      ))}
+    <div className="flex items-center gap-2 mb-3 text-xs text-[var(--color-text-muted)]">
+      {stage !== "Complete" && (
+        <div className="w-3 h-3 border-2 border-[var(--color-pe-green)] border-t-transparent rounded-full animate-spin" />
+      )}
+      <span>{stage}</span>
     </div>
   );
 }
@@ -559,17 +464,10 @@ export function PolicyChat() {
                             <span className="text-sm text-[var(--color-text-secondary)]">Starting analysis...</span>
                           </div>
                         ) : (
-                          <div className="space-y-1">
-                            {parsedSteps.slice(-15).map((step, j) => (
-                              <ToolCard
-                                key={j}
-                                step={step}
-                                isLast={j === parsedSteps.slice(-15).length - 1 && step.type === "tool_use"}
-                              />
+                          <div className="space-y-0">
+                            {parsedSteps.slice(-10).map((step, j) => (
+                              <ToolCard key={j} step={step} />
                             ))}
-                            <div className="flex items-center gap-2 pt-2 pl-5">
-                              <div className="w-2 h-4 bg-[var(--color-pe-green)] animate-pulse rounded-sm" />
-                            </div>
                           </div>
                         )}
                       </div>
@@ -587,9 +485,9 @@ export function PolicyChat() {
                               </svg>
                               <span>{parsedSteps.filter(s => s.type === "tool_use").length} tool calls executed</span>
                             </summary>
-                            <div className="mt-3 bg-[var(--color-surface-sunken)] rounded-xl p-4 space-y-1">
+                            <div className="mt-3 bg-[var(--color-surface-sunken)] rounded-xl p-4 space-y-0">
                               {parsedSteps.map((step, j) => (
-                                <ToolCard key={j} step={step} isLast={false} />
+                                <ToolCard key={j} step={step} />
                               ))}
                             </div>
                           </details>
@@ -601,7 +499,7 @@ export function PolicyChat() {
                             ? "bg-red-50 border border-red-200"
                             : "bg-white border border-[var(--color-border)]"
                         }`}>
-                          <div className="prose prose-sm max-w-none text-[var(--color-text-primary)] [&_strong]:font-semibold [&_code]:bg-[var(--color-surface-sunken)] [&_code]:px-1.5 [&_code]:py-0.5 [&_code]:rounded [&_code]:text-sm [&_code]:font-mono [&_h1]:text-lg [&_h2]:text-base [&_h3]:text-sm [&_ul]:my-2 [&_li]:my-0.5">
+                          <div className="prose prose-sm max-w-none text-[var(--color-text-primary)] [&_strong]:font-semibold [&_code]:bg-[var(--color-surface-sunken)] [&_code]:px-1.5 [&_code]:py-0.5 [&_code]:rounded [&_code]:text-sm [&_code]:font-mono [&_h1]:text-lg [&_h1]:mt-4 [&_h1]:mb-2 [&_h2]:text-base [&_h2]:mt-3 [&_h2]:mb-2 [&_h3]:text-sm [&_h3]:mt-2 [&_h3]:mb-1 [&_p]:my-3 [&_p]:leading-relaxed [&_ul]:my-3 [&_ul]:space-y-1 [&_ol]:my-3 [&_ol]:space-y-1 [&_li]:my-0 [&_li]:leading-relaxed [&_blockquote]:border-l-2 [&_blockquote]:border-[var(--color-pe-green)] [&_blockquote]:pl-4 [&_blockquote]:my-3 [&_blockquote]:text-[var(--color-text-secondary)]">
                             <ReactMarkdown remarkPlugins={[remarkBreaks]}>
                               {message.content}
                             </ReactMarkdown>
diff --git a/src/policyengine_api/api/agent.py b/src/policyengine_api/api/agent.py
index 33a4f21..7389211 100644
--- a/src/policyengine_api/api/agent.py
+++ b/src/policyengine_api/api/agent.py
@@ -3,14 +3,14 @@
 This endpoint lets users ask natural language questions about tax/benefit policy
 and get AI-generated reports using Claude Code connected to the PolicyEngine MCP server.
 
-The agent runs in a Modal sandbox and logs are fetched via Modal SDK.
+The agent runs in a Modal sandbox (production) or locally (development).
 """
 
+import asyncio
 import uuid
 from datetime import datetime
 
 import logfire
-import modal
 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 
@@ -67,6 +67,19 @@ class StatusResponse(BaseModel):
 _logs: dict[str, list[LogEntry]] = {}
 
 
+def _run_local_agent(call_id: str, question: str, api_base_url: str) -> None:
+    """Run agent locally in a background thread."""
+    from policyengine_api.agent_sandbox import _run_agent_impl
+
+    try:
+        result = _run_agent_impl(question, api_base_url, call_id)
+        _calls[call_id]["status"] = result.get("status", "completed")
+        _calls[call_id]["result"] = result
+    except Exception as e:
+        _calls[call_id]["status"] = "failed"
+        _calls[call_id]["result"] = {"status": "failed", "error": str(e)}
+
+
 @router.post("/run", response_model=RunResponse)
 async def run_agent(request: RunRequest) -> RunResponse:
     """Start the agent to answer a policy question.
@@ -90,30 +103,44 @@ async def run_agent(request: RunRequest) -> RunResponse:
     logfire.info("agent_run", question=request.question[:100])
 
     api_base_url = settings.policyengine_api_url
-
-    # Look up the deployed function
-    run_fn = modal.Function.from_name("policyengine-sandbox", "run_agent")
-
-    # Generate a call_id before spawning so we can pass it to the function
     call_id = f"fc-{uuid.uuid4().hex[:24]}"
 
     # Initialize logs storage
     _logs[call_id] = []
 
-    # Spawn the function (non-blocking) - pass call_id so it can POST logs back
-    call = run_fn.spawn(request.question, api_base_url, call_id)
-
-    # Store call info
-    _calls[call_id] = {
-        "call": call,
-        "modal_call_id": call.object_id,
-        "question": request.question,
-        "started_at": datetime.utcnow().isoformat(),
-        "status": "running",
-        "result": None,
-    }
-
-    logfire.info("agent_spawned", call_id=call_id, modal_call_id=call.object_id)
+    if settings.agent_use_modal:
+        # Production: use Modal
+        import modal
+
+        run_fn = modal.Function.from_name("policyengine-sandbox", "run_agent")
+        call = run_fn.spawn(request.question, api_base_url, call_id)
+
+        _calls[call_id] = {
+            "call": call,
+            "modal_call_id": call.object_id,
+            "question": request.question,
+            "started_at": datetime.utcnow().isoformat(),
+            "status": "running",
+            "result": None,
+        }
+        logfire.info("agent_spawned", call_id=call_id, modal_call_id=call.object_id)
+    else:
+        # Local development: run in background thread
+        _calls[call_id] = {
+            "call": None,
+            "modal_call_id": None,
+            "question": request.question,
+            "started_at": datetime.utcnow().isoformat(),
+            "status": "running",
+            "result": None,
+        }
+        logfire.info("agent_spawned_local", call_id=call_id)
+
+        # Run in background using asyncio
+        loop = asyncio.get_event_loop()
+        loop.run_in_executor(
+            None, _run_local_agent, call_id, request.question, api_base_url
+        )
 
     return RunResponse(call_id=call_id, status="running")
 

From d8d36a2703ff4451548c289daa9495ac71935478 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:40:45 +0000
Subject: [PATCH 05/13] Filter out internal debug messages from chat UI

---
 docs/src/components/policy-chat.tsx | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/src/components/policy-chat.tsx b/docs/src/components/policy-chat.tsx
index 53f9890..8416bdb 100644
--- a/docs/src/components/policy-chat.tsx
+++ b/docs/src/components/policy-chat.tsx
@@ -29,9 +29,17 @@ interface ParsedStep {
 }
 
 function parseLogEntry(message: string): ParsedStep {
-  // [AGENT] messages
+  // [AGENT] messages - filter out internal debug info
   if (message.startsWith("[AGENT]")) {
     const content = message.replace("[AGENT] ", "");
+    // Skip internal debug messages
+    if (content.startsWith("Stop reason:") ||
+        content.startsWith("Turn ") ||
+        content.startsWith("Loaded ") ||
+        content.startsWith("Fetching ") ||
+        content.startsWith("Completed")) {
+      return { type: "unknown", title: "", content: "" };
+    }
     return {
       type: "agent",
       title: "Agent",

From 2c1ce5abccc02d418de1e6b5a0d0f559431b06c1 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:44:11 +0000
Subject: [PATCH 06/13] fix: standardise typography in chat UI to text-sm

---
 docs/src/components/policy-chat.tsx | 33 +++++++++++++++--------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/docs/src/components/policy-chat.tsx b/docs/src/components/policy-chat.tsx
index 8416bdb..e465947 100644
--- a/docs/src/components/policy-chat.tsx
+++ b/docs/src/components/policy-chat.tsx
@@ -150,16 +150,16 @@ function ToolCard({ step }: { step: ParsedStep }) {
 
   if (step.type === "tool_use") {
     return (
-      <div className="py-1.5 animate-fadeIn">
+      <div className="py-1 animate-fadeIn">
         <button
           onClick={() => setIsExpanded(!isExpanded)}
-          className="flex items-center gap-2 text-sm hover:text-[var(--color-pe-green)] transition-colors"
+          className="flex items-center gap-2 hover:text-[var(--color-pe-green)] transition-colors"
         >
-          <span className="w-1.5 h-1.5 rounded-full bg-[var(--color-pe-green)]" />
-          <span className="text-[var(--color-text-secondary)] capitalize">{step.title}</span>
+          <span className="w-1.5 h-1.5 rounded-full bg-[var(--color-pe-green)] shrink-0" />
+          <span className="text-sm text-[var(--color-text-secondary)]">{step.title}</span>
           {step.params && Object.keys(step.params).length > 0 && (
             <svg
-              className={`w-3 h-3 text-[var(--color-text-muted)] transition-transform ${isExpanded ? "rotate-90" : ""}`}
+              className={`w-3.5 h-3.5 text-[var(--color-text-muted)] transition-transform shrink-0 ${isExpanded ? "rotate-90" : ""}`}
               fill="none"
               viewBox="0 0 24 24"
               stroke="currentColor"
@@ -169,12 +169,13 @@ function ToolCard({ step }: { step: ParsedStep }) {
           )}
         </button>
         {isExpanded && step.params && Object.keys(step.params).length > 0 && (
-          <div className="ml-3.5 mt-1 font-mono text-xs text-[var(--color-text-muted)] bg-[var(--color-surface)] rounded px-2 py-1.5">
+          <div className="ml-3.5 mt-1.5 text-sm text-[var(--color-text-muted)] bg-[var(--color-surface)] rounded-lg px-3 py-2">
             {Object.entries(step.params).map(([key, value]) => (
-              <div key={key}>
-                <span className="text-[var(--color-pe-green)]">{key}</span>
-                <span className="text-[var(--color-text-muted)]">: </span>
-                <span>{typeof value === "string" ? `"${value}"` : JSON.stringify(value)}</span>
+              <div key={key} className="flex gap-1">
+                <span className="text-[var(--color-pe-green)] font-medium">{key}:</span>
+                <span className="text-[var(--color-text-secondary)]">
+                  {typeof value === "string" ? value : JSON.stringify(value)}
+                </span>
               </div>
             ))}
           </div>
@@ -193,9 +194,9 @@ function ToolCard({ step }: { step: ParsedStep }) {
       <div className="py-1 ml-3.5 animate-fadeIn">
         <button
           onClick={() => setIsExpanded(!isExpanded)}
-          className="flex items-center gap-1.5 text-xs text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]"
+          className="flex items-center gap-1.5 text-sm text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]"
         >
-          <svg className={`w-3 h-3 transition-transform ${isExpanded ? "rotate-90" : ""}`} fill="none" viewBox="0 0 24 24" stroke="currentColor">
+          <svg className={`w-3.5 h-3.5 transition-transform ${isExpanded ? "rotate-90" : ""}`} fill="none" viewBox="0 0 24 24" stroke="currentColor">
             <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
           </svg>
           <span>Result</span>
@@ -239,9 +240,9 @@ function ProgressIndicator({ logs }: { logs: LogEntry[] }) {
   if (logs.length === 0) return null;
 
   return (
-    <div className="flex items-center gap-2 mb-3 text-xs text-[var(--color-text-muted)]">
+    <div className="flex items-center gap-2 mb-3 text-sm text-[var(--color-text-muted)]">
       {stage !== "Complete" && (
-        <div className="w-3 h-3 border-2 border-[var(--color-pe-green)] border-t-transparent rounded-full animate-spin" />
+        <div className="w-3.5 h-3.5 border-2 border-[var(--color-pe-green)] border-t-transparent rounded-full animate-spin" />
       )}
       <span>{stage}</span>
     </div>
@@ -487,8 +488,8 @@ export function PolicyChat() {
                         {/* Collapsible steps summary */}
                         {parsedSteps.length > 0 && (
                           <details className="group">
-                            <summary className="cursor-pointer list-none flex items-center gap-2 text-xs text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]">
-                              <svg className="w-3 h-3 group-open:rotate-90 transition-transform" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+                            <summary className="cursor-pointer list-none flex items-center gap-2 text-sm text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]">
+                              <svg className="w-3.5 h-3.5 group-open:rotate-90 transition-transform" fill="none" viewBox="0 0 24 24" stroke="currentColor">
                                 <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
                               </svg>
                               <span>{parsedSteps.filter(s => s.type === "tool_use").length} tool calls executed</span>

From e1d3104748bf35a43c99752280521b1e620be098 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:46:32 +0000
Subject: [PATCH 07/13] fix: monospace font for params, remove serif title,
 stop truncating tool results, add slide-down animation

---
 docs/src/components/policy-chat.tsx | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/docs/src/components/policy-chat.tsx b/docs/src/components/policy-chat.tsx
index e465947..77b5e8e 100644
--- a/docs/src/components/policy-chat.tsx
+++ b/docs/src/components/policy-chat.tsx
@@ -169,10 +169,10 @@ function ToolCard({ step }: { step: ParsedStep }) {
           )}
         </button>
         {isExpanded && step.params && Object.keys(step.params).length > 0 && (
-          <div className="ml-3.5 mt-1.5 text-sm text-[var(--color-text-muted)] bg-[var(--color-surface)] rounded-lg px-3 py-2">
+          <div className="ml-3.5 mt-1.5 font-mono text-xs text-[var(--color-text-muted)] bg-[var(--color-surface)] rounded-lg px-3 py-2 animate-slideDown">
             {Object.entries(step.params).map(([key, value]) => (
               <div key={key} className="flex gap-1">
-                <span className="text-[var(--color-pe-green)] font-medium">{key}:</span>
+                <span className="text-[var(--color-pe-green)]">{key}:</span>
                 <span className="text-[var(--color-text-secondary)]">
                   {typeof value === "string" ? value : JSON.stringify(value)}
                 </span>
@@ -202,8 +202,8 @@ function ToolCard({ step }: { step: ParsedStep }) {
           <span>Result</span>
         </button>
         {isExpanded && (
-          <div className="mt-1.5 font-mono text-xs bg-[var(--color-code-bg)] text-[var(--color-code-text)] rounded p-2 overflow-x-auto max-h-32 overflow-y-auto">
-            <pre className="whitespace-pre-wrap">{step.content.slice(0, 1500)}{step.content.length > 1500 ? "\n..." : ""}</pre>
+          <div className="mt-1.5 font-mono text-xs bg-[var(--color-code-bg)] text-[var(--color-code-text)] rounded p-2 overflow-x-auto max-h-64 overflow-y-auto animate-slideDown">
+            <pre className="whitespace-pre-wrap">{step.content}</pre>
           </div>
         )}
       </div>
@@ -431,7 +431,7 @@ export function PolicyChat() {
         {messages.length === 0 ? (
           <div className="h-full flex flex-col justify-center">
             <div className="text-center mb-8">
-              <h3 className="font-display text-2xl text-[var(--color-text-primary)] mb-2">
+              <h3 className="text-xl font-medium text-[var(--color-text-primary)] mb-2">
                 What would you like to know?
               </h3>
               <p className="text-sm text-[var(--color-text-muted)]">
@@ -563,9 +563,16 @@ export function PolicyChat() {
           from { opacity: 0; transform: translateY(4px); }
           to { opacity: 1; transform: translateY(0); }
         }
+        @keyframes slideDown {
+          from { opacity: 0; max-height: 0; }
+          to { opacity: 1; max-height: 500px; }
+        }
         .animate-fadeIn {
           animation: fadeIn 0.2s ease-out forwards;
         }
+        .animate-slideDown {
+          animation: slideDown 0.2s ease-out forwards;
+        }
       `}</style>
     </div>
   );

From f815e22f6b0c4e53af3eea49acc91008808ff00a Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:49:58 +0000
Subject: [PATCH 08/13] style: add monospace font throughout for code-like
 aesthetic

---
 docs/src/components/policy-chat.tsx | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/src/components/policy-chat.tsx b/docs/src/components/policy-chat.tsx
index 77b5e8e..5a5b101 100644
--- a/docs/src/components/policy-chat.tsx
+++ b/docs/src/components/policy-chat.tsx
@@ -153,7 +153,7 @@ function ToolCard({ step }: { step: ParsedStep }) {
       <div className="py-1 animate-fadeIn">
         <button
           onClick={() => setIsExpanded(!isExpanded)}
-          className="flex items-center gap-2 hover:text-[var(--color-pe-green)] transition-colors"
+          className="flex items-center gap-2 hover:text-[var(--color-pe-green)] transition-colors font-mono"
         >
           <span className="w-1.5 h-1.5 rounded-full bg-[var(--color-pe-green)] shrink-0" />
           <span className="text-sm text-[var(--color-text-secondary)]">{step.title}</span>
@@ -194,7 +194,7 @@ function ToolCard({ step }: { step: ParsedStep }) {
       <div className="py-1 ml-3.5 animate-fadeIn">
         <button
           onClick={() => setIsExpanded(!isExpanded)}
-          className="flex items-center gap-1.5 text-sm text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]"
+          className="flex items-center gap-1.5 text-sm text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)] font-mono"
         >
           <svg className={`w-3.5 h-3.5 transition-transform ${isExpanded ? "rotate-90" : ""}`} fill="none" viewBox="0 0 24 24" stroke="currentColor">
             <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
@@ -240,7 +240,7 @@ function ProgressIndicator({ logs }: { logs: LogEntry[] }) {
   if (logs.length === 0) return null;
 
   return (
-    <div className="flex items-center gap-2 mb-3 text-sm text-[var(--color-text-muted)]">
+    <div className="flex items-center gap-2 mb-3 text-sm text-[var(--color-text-muted)] font-mono">
       {stage !== "Complete" && (
         <div className="w-3.5 h-3.5 border-2 border-[var(--color-pe-green)] border-t-transparent rounded-full animate-spin" />
       )}
@@ -443,7 +443,7 @@ export function PolicyChat() {
                 <button
                   key={i}
                   onClick={() => setInput(q)}
-                  className="text-left p-4 rounded-xl bg-[var(--color-surface-sunken)] hover:bg-[var(--color-surface)] border border-transparent hover:border-[var(--color-border)] text-sm text-[var(--color-text-secondary)] transition-all group"
+                  className="text-left p-4 rounded-xl bg-[var(--color-surface-sunken)] hover:bg-[var(--color-surface)] border border-transparent hover:border-[var(--color-border)] text-sm text-[var(--color-text-secondary)] transition-all group font-mono"
                 >
                   <span className="group-hover:text-[var(--color-pe-green)] transition-colors">{q}</span>
                 </button>
@@ -457,7 +457,7 @@ export function PolicyChat() {
                 {message.role === "user" ? (
                   <div className="flex justify-end">
                     <div className="max-w-[80%] bg-[var(--color-pe-green)] text-white rounded-2xl rounded-br-md px-4 py-3">
-                      <p className="text-sm">{message.content}</p>
+                      <p className="text-sm font-mono">{message.content}</p>
                     </div>
                   </div>
                 ) : (
@@ -470,7 +470,7 @@ export function PolicyChat() {
                         {message.status === "pending" ? (
                           <div className="flex items-center gap-3">
                             <div className="w-5 h-5 border-2 border-[var(--color-pe-green)] border-t-transparent rounded-full animate-spin" />
-                            <span className="text-sm text-[var(--color-text-secondary)]">Starting analysis...</span>
+                            <span className="text-sm text-[var(--color-text-secondary)] font-mono">Starting analysis...</span>
                           </div>
                         ) : (
                           <div className="space-y-0">
@@ -488,7 +488,7 @@ export function PolicyChat() {
                         {/* Collapsible steps summary */}
                         {parsedSteps.length > 0 && (
                           <details className="group">
-                            <summary className="cursor-pointer list-none flex items-center gap-2 text-sm text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)]">
+                            <summary className="cursor-pointer list-none flex items-center gap-2 text-sm text-[var(--color-text-muted)] hover:text-[var(--color-text-secondary)] font-mono">
                               <svg className="w-3.5 h-3.5 group-open:rotate-90 transition-transform" fill="none" viewBox="0 0 24 24" stroke="currentColor">
                                 <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
                               </svg>
@@ -534,7 +534,7 @@ export function PolicyChat() {
             onChange={(e) => setInput(e.target.value)}
             placeholder="Ask a policy question..."
             disabled={isLoading}
-            className="flex-1 px-4 py-3 text-sm border border-[var(--color-border)] rounded-xl bg-white focus:outline-none focus:ring-2 focus:ring-[var(--color-pe-green)] focus:border-transparent disabled:opacity-50 placeholder:text-[var(--color-text-muted)]"
+            className="flex-1 px-4 py-3 text-sm font-mono border border-[var(--color-border)] rounded-xl bg-white focus:outline-none focus:ring-2 focus:ring-[var(--color-pe-green)] focus:border-transparent disabled:opacity-50 placeholder:text-[var(--color-text-muted)]"
           />
           <button
             type="submit"

From b2b53b2d9acabf0b7668ef29997f871f8db87931 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Mon, 29 Dec 2025 17:53:29 +0000
Subject: [PATCH 09/13] fix: increase max_turns to 30 and timeout to 600s for
 complex analyses

---
 src/policyengine_api/agent_sandbox.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/policyengine_api/agent_sandbox.py b/src/policyengine_api/agent_sandbox.py
index 1e40220..6408093 100644
--- a/src/policyengine_api/agent_sandbox.py
+++ b/src/policyengine_api/agent_sandbox.py
@@ -305,7 +305,7 @@ def _run_agent_impl(
     question: str,
     api_base_url: str = "https://v2.api.policyengine.org",
     call_id: str = "",
-    max_turns: int = 15,
+    max_turns: int = 30,
 ) -> dict:
     """Core agent implementation."""
 
@@ -421,12 +421,12 @@ def log(msg: str) -> None:
     return result
 
 
-@app.function(image=image, secrets=[anthropic_secret], timeout=300)
+@app.function(image=image, secrets=[anthropic_secret], timeout=600)
 def run_agent(
     question: str,
     api_base_url: str = "https://v2.api.policyengine.org",
     call_id: str = "",
-    max_turns: int = 15,
+    max_turns: int = 30,
 ) -> dict:
     """Run agentic loop to answer a policy question (Modal wrapper)."""
     return _run_agent_impl(question, api_base_url, call_id, max_turns)

From a2e517f4cdd133bd0b6bbda36f647ac8e737272f Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Tue, 30 Dec 2025 11:07:23 +0000
Subject: [PATCH 10/13] ci: spin up Supabase in CI to run all tests

---
 .github/workflows/test.yml | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 42981b0..01f3d9c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,5 +21,19 @@ jobs:
       - name: Sync dependencies
         run: uv sync --extra dev
 
+      - name: Setup Supabase CLI
+        uses: supabase/setup-cli@v1
+        with:
+          version: latest
+
+      - name: Start Supabase
+        run: supabase start
+
+      - name: Initialise database
+        run: echo "yes" | uv run python scripts/init.py
+
+      - name: Seed database
+        run: uv run python scripts/seed.py
+
       - name: Run tests
-        run: uv run pytest -v -m "not integration"
+        run: uv run pytest -v

From 660f4b4b6811684e79f0ee4f8b1b40fabea0a989 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Tue, 30 Dec 2025 11:12:21 +0000
Subject: [PATCH 11/13] fix: remove migrations that depend on tables created by
 init.py

---
 .github/workflows/test.yml                    |   2 +-
 scripts/init.py                               |   2 +
 .../20241121000001_rls_policies.sql           | 157 ------------------
 .../20241228000000_household_jobs.sql         |  42 -----
 .../20241229000000_allow_null_user_id.sql     |   2 -
 5 files changed, 3 insertions(+), 202 deletions(-)
 delete mode 100644 supabase/migrations/20241121000001_rls_policies.sql
 delete mode 100644 supabase/migrations/20241228000000_household_jobs.sql
 delete mode 100644 supabase/migrations/20241229000000_allow_null_user_id.sql

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 01f3d9c..0356856 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,7 +27,7 @@ jobs:
           version: latest
 
       - name: Start Supabase
-        run: supabase start
+        run: supabase start || true
 
       - name: Initialise database
         run: echo "yes" | uv run python scripts/init.py
diff --git a/scripts/init.py b/scripts/init.py
index 587755e..cf7a04a 100644
--- a/scripts/init.py
+++ b/scripts/init.py
@@ -157,6 +157,7 @@ def apply_rls_policies(engine):
         "parameters",
         "parameter_values",
         "users",
+        "household_jobs",
     ]
 
     # Read-only tables (public can read, only service role can write)
@@ -176,6 +177,7 @@ def apply_rls_policies(engine):
         "policies",
         "dynamics",
         "reports",
+        "household_jobs",
     ]
 
     # Read-only results tables
diff --git a/supabase/migrations/20241121000001_rls_policies.sql b/supabase/migrations/20241121000001_rls_policies.sql
deleted file mode 100644
index 7022091..0000000
--- a/supabase/migrations/20241121000001_rls_policies.sql
+++ /dev/null
@@ -1,157 +0,0 @@
--- Enable RLS on all application tables
-ALTER TABLE datasets ENABLE ROW LEVEL SECURITY;
-ALTER TABLE dataset_versions ENABLE ROW LEVEL SECURITY;
-ALTER TABLE simulations ENABLE ROW LEVEL SECURITY;
-ALTER TABLE policies ENABLE ROW LEVEL SECURITY;
-ALTER TABLE dynamics ENABLE ROW LEVEL SECURITY;
-ALTER TABLE aggregates ENABLE ROW LEVEL SECURITY;
-ALTER TABLE change_aggregates ENABLE ROW LEVEL SECURITY;
-ALTER TABLE tax_benefit_models ENABLE ROW LEVEL SECURITY;
-ALTER TABLE tax_benefit_model_versions ENABLE ROW LEVEL SECURITY;
-ALTER TABLE variables ENABLE ROW LEVEL SECURITY;
-ALTER TABLE parameters ENABLE ROW LEVEL SECURITY;
-ALTER TABLE parameter_values ENABLE ROW LEVEL SECURITY;
-
--- Service role policies (full access to everything)
-DO $$
-BEGIN
-    -- Datasets
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'datasets' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON datasets FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Dataset versions
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'dataset_versions' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON dataset_versions FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Simulations
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'simulations' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON simulations FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Policies
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'policies' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON policies FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Dynamics
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'dynamics' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON dynamics FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Aggregates
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'aggregates' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON aggregates FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Change aggregates
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'change_aggregates' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON change_aggregates FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Tax benefit models
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'tax_benefit_models' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON tax_benefit_models FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Tax benefit model versions
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'tax_benefit_model_versions' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON tax_benefit_model_versions FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Variables
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'variables' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON variables FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Parameters
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'parameters' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON parameters FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-
-    -- Parameter values
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'parameter_values' AND policyname = 'Service role full access') THEN
-        CREATE POLICY "Service role full access" ON parameter_values FOR ALL TO service_role USING (true) WITH CHECK (true);
-    END IF;
-END $$;
-
--- Public read access for read-only tables
-DO $$
-BEGIN
-    -- Tax benefit models (read-only for public)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'tax_benefit_models' AND policyname = 'Public read access') THEN
-        CREATE POLICY "Public read access" ON tax_benefit_models FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Tax benefit model versions (read-only for public)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'tax_benefit_model_versions' AND policyname = 'Public read access') THEN
-        CREATE POLICY "Public read access" ON tax_benefit_model_versions FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Variables (read-only for public)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'variables' AND policyname = 'Public read access') THEN
-        CREATE POLICY "Public read access" ON variables FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Parameters (read-only for public)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'parameters' AND policyname = 'Public read access') THEN
-        CREATE POLICY "Public read access" ON parameters FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Parameter values (read-only for public)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'parameter_values' AND policyname = 'Public read access') THEN
-        CREATE POLICY "Public read access" ON parameter_values FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Datasets (read-only for public)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'datasets' AND policyname = 'Public read access') THEN
-        CREATE POLICY "Public read access" ON datasets FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Dataset versions (read-only for public)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'dataset_versions' AND policyname = 'Public read access') THEN
-        CREATE POLICY "Public read access" ON dataset_versions FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-END $$;
-
--- User-created content policies
-DO $$
-BEGIN
-    -- Simulations (users can create and read their own)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'simulations' AND policyname = 'Users can create simulations') THEN
-        CREATE POLICY "Users can create simulations" ON simulations FOR INSERT TO anon, authenticated WITH CHECK (true);
-    END IF;
-
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'simulations' AND policyname = 'Users can read simulations') THEN
-        CREATE POLICY "Users can read simulations" ON simulations FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Policies (users can create and read their own)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'policies' AND policyname = 'Users can create policies') THEN
-        CREATE POLICY "Users can create policies" ON policies FOR INSERT TO anon, authenticated WITH CHECK (true);
-    END IF;
-
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'policies' AND policyname = 'Users can read policies') THEN
-        CREATE POLICY "Users can read policies" ON policies FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Dynamics (users can create and read their own)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'dynamics' AND policyname = 'Users can create dynamics') THEN
-        CREATE POLICY "Users can create dynamics" ON dynamics FOR INSERT TO anon, authenticated WITH CHECK (true);
-    END IF;
-
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'dynamics' AND policyname = 'Users can read dynamics') THEN
-        CREATE POLICY "Users can read dynamics" ON dynamics FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Aggregates (read access for all)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'aggregates' AND policyname = 'Users can read aggregates') THEN
-        CREATE POLICY "Users can read aggregates" ON aggregates FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-
-    -- Change aggregates (read access for all)
-    IF NOT EXISTS (SELECT 1 FROM pg_policies WHERE schemaname = 'public' AND tablename = 'change_aggregates' AND policyname = 'Users can read change aggregates') THEN
-        CREATE POLICY "Users can read change aggregates" ON change_aggregates FOR SELECT TO anon, authenticated USING (true);
-    END IF;
-END $$;
diff --git a/supabase/migrations/20241228000000_household_jobs.sql b/supabase/migrations/20241228000000_household_jobs.sql
deleted file mode 100644
index 758ba62..0000000
--- a/supabase/migrations/20241228000000_household_jobs.sql
+++ /dev/null
@@ -1,42 +0,0 @@
--- Create household_jobs table for async household calculations
-
-CREATE TABLE IF NOT EXISTS household_jobs (
-    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-    tax_benefit_model_name TEXT NOT NULL,
-    request_data JSONB NOT NULL,
-    policy_id UUID REFERENCES policies(id),
-    dynamic_id UUID REFERENCES dynamics(id),
-    status TEXT NOT NULL DEFAULT 'pending',
-    error_message TEXT,
-    result JSONB,
-    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
-    started_at TIMESTAMPTZ,
-    completed_at TIMESTAMPTZ
-);
-
--- Index for polling by status
-CREATE INDEX IF NOT EXISTS idx_household_jobs_status ON household_jobs(status);
-
--- Index for looking up by id
-CREATE INDEX IF NOT EXISTS idx_household_jobs_id ON household_jobs(id);
-
--- Enable RLS
-ALTER TABLE household_jobs ENABLE ROW LEVEL SECURITY;
-
--- Allow public read access (jobs are not sensitive)
-CREATE POLICY "Allow public read access to household_jobs"
-    ON household_jobs
-    FOR SELECT
-    USING (true);
-
--- Allow public insert (anyone can create a job)
-CREATE POLICY "Allow public insert to household_jobs"
-    ON household_jobs
-    FOR INSERT
-    WITH CHECK (true);
-
--- Allow service role to update (for Modal functions)
-CREATE POLICY "Allow service role to update household_jobs"
-    ON household_jobs
-    FOR UPDATE
-    USING (true);
diff --git a/supabase/migrations/20241229000000_allow_null_user_id.sql b/supabase/migrations/20241229000000_allow_null_user_id.sql
deleted file mode 100644
index dba0825..0000000
--- a/supabase/migrations/20241229000000_allow_null_user_id.sql
+++ /dev/null
@@ -1,2 +0,0 @@
--- Allow null user_id in reports table for anonymous API-triggered reports
-ALTER TABLE reports ALTER COLUMN user_id DROP NOT NULL;

From 9005914db067c47be99b4d7e11723f03899da224 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Tue, 30 Dec 2025 11:15:47 +0000
Subject: [PATCH 12/13] fix: mark db-requiring tests as integration, restore
 simple CI workflow

---
 .github/workflows/test.yml     | 16 +---------------
 tests/test_household_impact.py |  3 +++
 tests/test_integration.py      |  6 ++++--
 3 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0356856..42981b0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -21,19 +21,5 @@ jobs:
       - name: Sync dependencies
         run: uv sync --extra dev
 
-      - name: Setup Supabase CLI
-        uses: supabase/setup-cli@v1
-        with:
-          version: latest
-
-      - name: Start Supabase
-        run: supabase start || true
-
-      - name: Initialise database
-        run: echo "yes" | uv run python scripts/init.py
-
-      - name: Seed database
-        run: uv run python scripts/seed.py
-
       - name: Run tests
-        run: uv run pytest -v
+        run: uv run pytest -v -m "not integration"
diff --git a/tests/test_household_impact.py b/tests/test_household_impact.py
index 1cfd469..2ed1224 100644
--- a/tests/test_household_impact.py
+++ b/tests/test_household_impact.py
@@ -1,6 +1,9 @@
 """Tests for household impact comparison endpoint."""
 
 import pytest
+
+pytestmark = pytest.mark.integration
+
 from fastapi.testclient import TestClient
 
 from policyengine_api.main import app
diff --git a/tests/test_integration.py b/tests/test_integration.py
index d3f8dc1..e044cab 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -4,9 +4,11 @@
 Run with: make integration-test
 """
 
-from datetime import datetime, timezone
-
 import pytest
+
+pytestmark = pytest.mark.integration
+
+from datetime import datetime, timezone
 from rich.console import Console
 from sqlmodel import Session, create_engine, select
 

From 8c3ab7be358d7d2ca56475827eecd3452b60d4e2 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil@policyengine.org>
Date: Tue, 30 Dec 2025 11:17:14 +0000
Subject: [PATCH 13/13] fix: mark all db-requiring tests as integration

---
 tests/test_agent.py     | 6 ++++--
 tests/test_analysis.py  | 2 ++
 tests/test_household.py | 3 +++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/tests/test_agent.py b/tests/test_agent.py
index c917159..2c591f5 100644
--- a/tests/test_agent.py
+++ b/tests/test_agent.py
@@ -3,10 +3,12 @@
 Tests verify that Claude Code is invoked correctly with proper MCP configuration.
 """
 
+import pytest
+
+pytestmark = pytest.mark.integration
+
 import json
 from unittest.mock import AsyncMock, MagicMock, patch
-
-import pytest
 from fastapi.testclient import TestClient
 
 from policyengine_api.main import app
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index b093303..90dbe7c 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -5,6 +5,8 @@
 """
 
 import pytest
+
+pytestmark = pytest.mark.integration
 from fastapi.testclient import TestClient
 from sqlmodel import Session, select
 
diff --git a/tests/test_household.py b/tests/test_household.py
index f8e2629..8f17176 100644
--- a/tests/test_household.py
+++ b/tests/test_household.py
@@ -1,6 +1,9 @@
 """Tests for household calculation endpoint."""
 
 import pytest
+
+pytestmark = pytest.mark.integration
+
 from fastapi.testclient import TestClient
 
 from policyengine_api.main import app