JasonHonKL
diff --git a/ROADMAP.md
Lines changed: 43 additions & 1 deletion b/ROADMAP.md
Lines changed: 43 additions & 1 deletion
diff --git a/ai-agent/pardus-browser/src/agent/Agent.ts
Lines changed: 109 additions & 41 deletions b/ai-agent/pardus-browser/src/agent/Agent.ts
Lines changed: 109 additions & 41 deletions
@@ -35,7 +35,49 @@ Core engine, CLI, and all major subsystems are stable. Summary of shipped featur
 
 ## In Progress
 
-_(Currently empty)_
+### Tauri Desktop App — Mission Control for AI Agents
+
+**Priority: Urgent** — The desktop app is the primary interface for users to manage, monitor, and assist AI browsing agents.
+
+**Phase 1 — Semantic Tree Viewer + CAPTCHA Handoff (current)**
+- [ ] Semantic tree viewer panel — render ARIA role tree with interactive nodes in Tauri dashboard
+- [ ] Per-instance controls — URL bar, navigate, agent status (idle/running/waiting-challenge)
+- [ ] CAPTCHA handoff — when an agent hits a challenge, pop up an OS webview (WKWebView/WebKitGTK/WebView2) for the user to solve it, then sync the resulting cookies back to the headless browser via CDP `Network.setCookie`
+- [ ] Cookie bridge — `tokio-tungstenite` WebSocket client to inject cookies into headless CDP server
+- [ ] Agent action log — real-time log of agent actions (navigate, click, type, wait) streamed from CDP events
+- [ ] Cross-platform — dashboard is pure HTML/CSS (no OS webview dependency for primary view); CAPTCHA popup uses OS webview only when needed
+
+**Phase 2 — Multi-Agent Dashboard**
+- [ ] Multiple concurrent agent instances — spawn/manage N agents in one window
+- [ ] Agent status grid — see all agents at a glance with status indicators (running, idle, stuck, CAPTCHA)
+- [ ] Live agent action streaming — watch each agent's actions in real-time via CDP event bus
+- [ ] Take-over button — pause agent, let user manually interact, then resume agent
+- [ ] Agent conversation panel — show the LLM conversation alongside browser actions
+
+**Phase 3 — Rendered View (Optional)**
+- [ ] Rendered page tab — OS webview shows actual page pixels (WKWebView on macOS, WebKitGTK on Linux, WebView2 on Windows)
+- [ ] Split view — semantic tree on left, rendered pixels on right
+- [ ] Screenshot capture — use pardus-core screenshot feature (chromiumoxide) for pixel-perfect captures
+
+**Architecture:**
+```
+┌─ Mission Control ──────────────────────────────────────┐
+│ ┌─ Agents ─────┐  ┌─ Semantic Tree ──────────────────┐ │
+│ │ ● Agent 1    │  │ [Document]                        │ │
+│ │   Shopping   │  │  ├── [Nav] "Menu"                 │ │
+│ │   Running    │  │  ├── [Main]                       │ │
+│ │              │  │  │   ├── [H1] "Welcome"           │ │
+│ │ ● Agent 2    │  │  │   ├── [TextBox #3] "Email"    │ │
+│ │   Research   │  │  │   └── [Button #4] "Submit"    │ │
+│ │   ⚠ CAPTCHA  │  │  └── [Footer]                     │ │
+│ └──────────────┘  └───────────────────────────────────┘ │
+│ ┌─ Action Log ────────────────────────────────────────┐ │
+│ │ 12:03:01 Navigate → shop.example.com                │ │
+│ │ 12:03:02 Click [#5] "Add to Cart"                   │ │
+│ │ 12:03:03 ⚠ CAPTCHA detected — Cloudflare           │ │
+│ └─────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────┘
+```
 
 ---
 
 
@@ -1,4 +1,4 @@
-import { LLMClient, LLMConfig, Message, getSystemPrompt } from '../llm/index.js';
+import { LLMClient, LLMConfig, Message, getSystemPrompt, compactMessages, truncateToolResult, ContextConfig } from '../llm/index.js';
 import { ToolExecutor } from '../tools/executor.js';
 import { BrowserManager } from '../core/index.js';
 import { BrowserToolName } from '../tools/definitions.js';
@@ -13,13 +13,15 @@ interface AgentOptions {
   maxRounds?: number;
   /** Tool execution configuration */
   toolConfig?: {
-    /** Enable parallel execution where safe (default: false) */
+    /** Enable parallel execution where safe (default: true) */
     parallel?: boolean;
     /** Continue on tool failure (default: true) */
     continueOnError?: boolean;
     /** Default retry configuration for all tools */
     defaultRetryConfig?: ToolExecutionConfig;
   };
+  /** Context window management configuration */
+  contextConfig?: Partial<ContextConfig>;
 }
 
 /**
@@ -36,17 +38,25 @@ export class Agent {
   private browserManager: BrowserManager;
   private isRunning = false;
   private toolConfig: AgentOptions['toolConfig'];
+  private contextConfig: ContextConfig;
 
   constructor(browserManager: BrowserManager, options: AgentOptions) {
     this.browserManager = browserManager;
     this.llm = new LLMClient(options.llmConfig);
     this.toolExecutor = new ToolExecutor(browserManager);
     this.maxRounds = options.maxRounds ?? 50;
     this.toolConfig = {
-      parallel: false,
+      parallel: true,
       continueOnError: true,
       ...options.toolConfig,
     };
+    this.contextConfig = {
+      maxTokens: 100_000,
+      keepRecentMessages: 10,
+      maxToolResultChars: 6000,
+      charsPerToken: 4,
+      ...options.contextConfig,
+    };
 
     // Initialize with system prompt
     this.messages.push({
@@ -128,22 +138,21 @@ export class Agent {
           return errorMessage;
         }
 
-        // Add all tool results to conversation
+        // Add all tool results to conversation — toolCallId flows from the LLM response
         for (const result of toolResults) {
-          // Find the original tool call ID
-          const toolCall = response.toolCalls.find(t => 
-            t.name === result.name && 
-            JSON.stringify(t.arguments) === JSON.stringify(result.args)
-          );
-          
+          const content = result.success
+            ? truncateToolResult(result.content || '', this.contextConfig.maxToolResultChars)
+            : `Error: ${result.error || 'Unknown error'}\n\nPartial result: ${result.content || 'none'}`;
+
           this.messages.push({
             role: 'tool',
-            tool_call_id: toolCall?.id || 'unknown',
-            content: result.success 
-              ? (result.content || '')
-              : `Error: ${result.error || 'Unknown error'}\n\nPartial result: ${result.content || 'none'}`,
+            tool_call_id: result.toolCallId || 'unknown',
+            content,
           });
         }
+
+        // Compact conversation history if approaching context limit
+        this.messages = compactMessages(this.messages, this.contextConfig);
       }
 
       if (rounds >= this.maxRounds) {
@@ -169,7 +178,7 @@ export class Agent {
     if (!this.toolConfig?.parallel) {
       // Sequential execution
       const results: ToolExecutionResult[] = [];
-      
+
       for (const call of toolCalls) {
         console.log(`[Tool] ${call.name}: ${JSON.stringify(call.arguments)}`);
 
@@ -180,6 +189,7 @@ export class Agent {
         );
 
         results.push({
+          toolCallId: call.id,
           name: call.name,
           args: call.arguments,
           success: result.success,
@@ -196,12 +206,12 @@ export class Agent {
           console.log(`[Tool Error] ${result.error}`);
         }
       }
-      
+
       return results;
     } else {
       // Parallel execution with grouping
-      // Convert to format expected by executeTools
       const tools = toolCalls.map(call => ({
+        toolCallId: call.id,
         name: call.name as BrowserToolName,
         args: call.arguments,
         config: this.toolConfig?.defaultRetryConfig,
@@ -220,42 +230,100 @@ export class Agent {
           console.log(`[Tool Error] ${result.error}`);
         }
       }
-      
+
       return parallelResult.results;
     }
   }
 
   /**
-   * Stream a response for interactive CLI
-   * 
-   * Note: Tool calls still happen after the stream completes
+   * Stream a response for interactive CLI with full tool call support.
+   *
+   * Each round awaits llm.streamChat(), then yields that round's text chunks;
+   * tool calls are executed after the round's text and their results fed back
+   * (same loop as chat(), but text is yielded to the caller round by round).
    */
   async *streamChat(userMessage: string): AsyncGenerator<string, string, unknown> {
-    // For streaming, we currently don't support mid-stream tool calls
-    // The LLM will respond with text, then we check for tool calls
-    // This is a simplified version - full implementation would parse tool calls from stream
+    if (this.isRunning) {
+      throw new Error('Agent is already processing a message');
+    }
 
-    this.messages.push({
-      role: 'user',
-      content: userMessage,
-    });
+    this.isRunning = true;
 
-    // For simplicity in streaming mode, we don't use tools
-    // Full implementation would parse tool calls from stream
-    const stream = this.llm.streamChat(this.messages);
-    let fullResponse = '';
+    try {
+      this.messages.push({ role: 'user', content: userMessage });
 
-    for await (const chunk of stream) {
-      fullResponse += chunk;
-      yield chunk;
-    }
+      let rounds = 0;
 
-    this.messages.push({
-      role: 'assistant',
-      content: fullResponse,
-    });
+      while (rounds < this.maxRounds) {
+        rounds++;
+
+        const result = await this.llm.streamChat(this.messages);
+
+        // Yield any text chunks
+        for (const chunk of result.textChunks) {
+          yield chunk;
+        }
 
-    return fullResponse;
+        // No tool calls — done
+        if (!result.toolCalls || result.toolCalls.length === 0) {
+          this.messages.push({
+            role: 'assistant',
+            content: result.content ?? '',
+          });
+          return result.content ?? '';
+        }
+
+        // Add assistant message with tool calls
+        this.messages.push({
+          role: 'assistant',
+          content: result.content ?? '',
+          tool_calls: result.toolCalls.map(call => ({
+            id: call.id,
+            type: 'function' as const,
+            function: {
+              name: call.name,
+              arguments: JSON.stringify(call.arguments),
+            },
+          })),
+        });
+
+        // Execute tool calls
+        const toolResults = await this.executeToolCalls(result.toolCalls);
+
+        const hasFailures = toolResults.some(r => !r.success);
+        if (hasFailures && !this.toolConfig?.continueOnError) {
+          const errorMessage = 'Tool execution failed. Aborting conversation.';
+          this.messages.push({ role: 'assistant', content: errorMessage });
+          yield `\n\n${errorMessage}`;
+          return errorMessage;
+        }
+
+        // Add tool results — successful output is truncated to maxToolResultChars; error text is passed through as-is
+        for (const res of toolResults) {
+          const content = res.success
+            ? truncateToolResult(res.content || '', this.contextConfig.maxToolResultChars)
+            : `Error: ${res.error || 'Unknown error'}\n\nPartial result: ${res.content || 'none'}`;
+
+          this.messages.push({
+            role: 'tool',
+            tool_call_id: res.toolCallId || 'unknown',
+            content,
+          });
+        }
+
+        // Compact context
+        this.messages = compactMessages(this.messages, this.contextConfig);
+
+        // The loop continues — the next iteration will stream the LLM's
+        // response to the tool results (which may include more tool calls).
+      }
+
+      const limitMsg = 'Maximum number of tool call rounds reached.';
+      yield `\n\n${limitMsg}`;
+      return limitMsg;
+    } finally {
+      this.isRunning = false;
+    }
   }
 
   /**