Skip to content

Commit 8efc6f8

Browse files
authored
Merge pull request #12 from JasonHonKL/dev/roadmap
tauri appdebug
2 parents f06c4b4 + efeeab2 commit 8efc6f8

32 files changed

Lines changed: 1674 additions & 258 deletions

ROADMAP.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,49 @@ Core engine, CLI, and all major subsystems are stable. Summary of shipped featur
3535

3636
## In Progress
3737

38-
_(Currently empty)_
38+
### Tauri Desktop App — Mission Control for AI Agents
39+
40+
**Priority: Urgent** — The desktop app is the primary interface for users to manage, monitor, and assist AI browsing agents.
41+
42+
**Phase 1 — Semantic Tree Viewer + CAPTCHA Handoff (current)**
43+
- [ ] Semantic tree viewer panel — render ARIA role tree with interactive nodes in Tauri dashboard
44+
- [ ] Per-instance controls — URL bar, navigate, agent status (idle/running/waiting-challenge)
45+
- [ ] CAPTCHA handoff — when the agent hits a challenge, pop up an OS webview (WKWebView/WebKitGTK/WebView2) for the user to solve it, then sync cookies back to the headless browser via CDP `Network.setCookie`
46+
- [ ] Cookie bridge — `tokio-tungstenite` WebSocket client to inject cookies into headless CDP server
47+
- [ ] Agent action log — real-time log of agent actions (navigate, click, type, wait) streamed from CDP events
48+
- [ ] Cross-platform — dashboard is pure HTML/CSS (no OS webview dependency for primary view); CAPTCHA popup uses OS webview only when needed
49+
50+
**Phase 2 — Multi-Agent Dashboard**
51+
- [ ] Multiple concurrent agent instances — spawn/manage N agents in one window
52+
- [ ] Agent status grid — see all agents at a glance with status indicators (running, idle, stuck, CAPTCHA)
53+
- [ ] Live agent action streaming — watch each agent's actions in real-time via CDP event bus
54+
- [ ] Take-over button — pause agent, let user manually interact, then resume agent
55+
- [ ] Agent conversation panel — show the LLM conversation alongside browser actions
56+
57+
**Phase 3 — Rendered View (Optional)**
58+
- [ ] Rendered page tab — OS webview shows actual page pixels (WKWebView on macOS, WebKitGTK on Linux, WebView2 on Windows)
59+
- [ ] Split view — semantic tree on left, rendered pixels on right
60+
- [ ] Screenshot capture — use pardus-core screenshot feature (chromiumoxide) for pixel-perfect captures
61+
62+
**Architecture:**
63+
```
64+
┌─ Mission Control ──────────────────────────────────────┐
65+
│ ┌─ Agents ─────┐ ┌─ Semantic Tree ──────────────────┐ │
66+
│ │ ● Agent 1 │ │ [Document] │ │
67+
│ │ Shopping │ │ ├── [Nav] "Menu" │ │
68+
│ │ Running │ │ ├── [Main] │ │
69+
│ │ │ │ │ ├── [H1] "Welcome" │ │
70+
│ │ ● Agent 2 │ │ │ ├── [TextBox #3] "Email" │ │
71+
│ │ Research │ │ │ └── [Button #4] "Submit" │ │
72+
│ │ ⚠ CAPTCHA │ │ └── [Footer] │ │
73+
│ └──────────────┘ └───────────────────────────────────┘ │
74+
│ ┌─ Action Log ────────────────────────────────────────┐ │
75+
│ │ 12:03:01 Navigate → shop.example.com │ │
76+
│ │ 12:03:02 Click [#5] "Add to Cart" │ │
77+
│ │ 12:03:03 ⚠ CAPTCHA detected — Cloudflare │ │
78+
│ └─────────────────────────────────────────────────────┘ │
79+
└─────────────────────────────────────────────────────────┘
80+
```
3981

4082
---
4183

ai-agent/pardus-browser/src/agent/Agent.ts

Lines changed: 109 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { LLMClient, LLMConfig, Message, getSystemPrompt } from '../llm/index.js';
1+
import { LLMClient, LLMConfig, Message, getSystemPrompt, compactMessages, truncateToolResult, ContextConfig } from '../llm/index.js';
22
import { ToolExecutor } from '../tools/executor.js';
33
import { BrowserManager } from '../core/index.js';
44
import { BrowserToolName } from '../tools/definitions.js';
@@ -13,13 +13,15 @@ interface AgentOptions {
1313
maxRounds?: number;
1414
/** Tool execution configuration */
1515
toolConfig?: {
16-
/** Enable parallel execution where safe (default: false) */
16+
/** Enable parallel execution where safe (default: true) */
1717
parallel?: boolean;
1818
/** Continue on tool failure (default: true) */
1919
continueOnError?: boolean;
2020
/** Default retry configuration for all tools */
2121
defaultRetryConfig?: ToolExecutionConfig;
2222
};
23+
/** Context window management configuration */
24+
contextConfig?: Partial<ContextConfig>;
2325
}
2426

2527
/**
@@ -36,17 +38,25 @@ export class Agent {
3638
private browserManager: BrowserManager;
3739
private isRunning = false;
3840
private toolConfig: AgentOptions['toolConfig'];
41+
private contextConfig: ContextConfig;
3942

4043
constructor(browserManager: BrowserManager, options: AgentOptions) {
4144
this.browserManager = browserManager;
4245
this.llm = new LLMClient(options.llmConfig);
4346
this.toolExecutor = new ToolExecutor(browserManager);
4447
this.maxRounds = options.maxRounds ?? 50;
4548
this.toolConfig = {
46-
parallel: false,
49+
parallel: true,
4750
continueOnError: true,
4851
...options.toolConfig,
4952
};
53+
this.contextConfig = {
54+
maxTokens: 100_000,
55+
keepRecentMessages: 10,
56+
maxToolResultChars: 6000,
57+
charsPerToken: 4,
58+
...options.contextConfig,
59+
};
5060

5161
// Initialize with system prompt
5262
this.messages.push({
@@ -128,22 +138,21 @@ export class Agent {
128138
return errorMessage;
129139
}
130140

131-
// Add all tool results to conversation
141+
// Add all tool results to conversation — toolCallId flows from the LLM response
132142
for (const result of toolResults) {
133-
// Find the original tool call ID
134-
const toolCall = response.toolCalls.find(t =>
135-
t.name === result.name &&
136-
JSON.stringify(t.arguments) === JSON.stringify(result.args)
137-
);
138-
143+
const content = result.success
144+
? truncateToolResult(result.content || '', this.contextConfig.maxToolResultChars)
145+
: `Error: ${result.error || 'Unknown error'}\n\nPartial result: ${result.content || 'none'}`;
146+
139147
this.messages.push({
140148
role: 'tool',
141-
tool_call_id: toolCall?.id || 'unknown',
142-
content: result.success
143-
? (result.content || '')
144-
: `Error: ${result.error || 'Unknown error'}\n\nPartial result: ${result.content || 'none'}`,
149+
tool_call_id: result.toolCallId || 'unknown',
150+
content,
145151
});
146152
}
153+
154+
// Compact conversation history if approaching context limit
155+
this.messages = compactMessages(this.messages, this.contextConfig);
147156
}
148157

149158
if (rounds >= this.maxRounds) {
@@ -169,7 +178,7 @@ export class Agent {
169178
if (!this.toolConfig?.parallel) {
170179
// Sequential execution
171180
const results: ToolExecutionResult[] = [];
172-
181+
173182
for (const call of toolCalls) {
174183
console.log(`[Tool] ${call.name}: ${JSON.stringify(call.arguments)}`);
175184

@@ -180,6 +189,7 @@ export class Agent {
180189
);
181190

182191
results.push({
192+
toolCallId: call.id,
183193
name: call.name,
184194
args: call.arguments,
185195
success: result.success,
@@ -196,12 +206,12 @@ export class Agent {
196206
console.log(`[Tool Error] ${result.error}`);
197207
}
198208
}
199-
209+
200210
return results;
201211
} else {
202212
// Parallel execution with grouping
203-
// Convert to format expected by executeTools
204213
const tools = toolCalls.map(call => ({
214+
toolCallId: call.id,
205215
name: call.name as BrowserToolName,
206216
args: call.arguments,
207217
config: this.toolConfig?.defaultRetryConfig,
@@ -220,42 +230,100 @@ export class Agent {
220230
console.log(`[Tool Error] ${result.error}`);
221231
}
222232
}
223-
233+
224234
return parallelResult.results;
225235
}
226236
}
227237

228238
/**
229-
* Stream a response for interactive CLI
230-
*
231-
* Note: Tool calls still happen after the stream completes
239+
* Stream a response for interactive CLI with full tool call support.
240+
*
241+
* Yields text chunks as they arrive. Tool calls are buffered and
242+
* executed after the stream completes, then the loop continues
243+
* (same as chat() but with streamed text output).
232244
*/
233245
async *streamChat(userMessage: string): AsyncGenerator<string, string, unknown> {
234-
// For streaming, we currently don't support mid-stream tool calls
235-
// The LLM will respond with text, then we check for tool calls
236-
// This is a simplified version - full implementation would parse tool calls from stream
246+
if (this.isRunning) {
247+
throw new Error('Agent is already processing a message');
248+
}
237249

238-
this.messages.push({
239-
role: 'user',
240-
content: userMessage,
241-
});
250+
this.isRunning = true;
242251

243-
// For simplicity in streaming mode, we don't use tools
244-
// Full implementation would parse tool calls from stream
245-
const stream = this.llm.streamChat(this.messages);
246-
let fullResponse = '';
252+
try {
253+
this.messages.push({ role: 'user', content: userMessage });
247254

248-
for await (const chunk of stream) {
249-
fullResponse += chunk;
250-
yield chunk;
251-
}
255+
let rounds = 0;
252256

253-
this.messages.push({
254-
role: 'assistant',
255-
content: fullResponse,
256-
});
257+
while (rounds < this.maxRounds) {
258+
rounds++;
259+
260+
const result = await this.llm.streamChat(this.messages);
261+
262+
// Yield any text chunks
263+
for (const chunk of result.textChunks) {
264+
yield chunk;
265+
}
257266

258-
return fullResponse;
267+
// No tool calls — done
268+
if (!result.toolCalls || result.toolCalls.length === 0) {
269+
this.messages.push({
270+
role: 'assistant',
271+
content: result.content ?? '',
272+
});
273+
return result.content ?? '';
274+
}
275+
276+
// Add assistant message with tool calls
277+
this.messages.push({
278+
role: 'assistant',
279+
content: result.content ?? '',
280+
tool_calls: result.toolCalls.map(call => ({
281+
id: call.id,
282+
type: 'function' as const,
283+
function: {
284+
name: call.name,
285+
arguments: JSON.stringify(call.arguments),
286+
},
287+
})),
288+
});
289+
290+
// Execute tool calls
291+
const toolResults = await this.executeToolCalls(result.toolCalls);
292+
293+
const hasFailures = toolResults.some(r => !r.success);
294+
if (hasFailures && !this.toolConfig?.continueOnError) {
295+
const errorMessage = 'Tool execution failed. Aborting conversation.';
296+
this.messages.push({ role: 'assistant', content: errorMessage });
297+
yield `\n\n${errorMessage}`;
298+
return errorMessage;
299+
}
300+
301+
// Add tool results
302+
for (const res of toolResults) {
303+
const content = res.success
304+
? truncateToolResult(res.content || '', this.contextConfig.maxToolResultChars)
305+
: `Error: ${res.error || 'Unknown error'}\n\nPartial result: ${res.content || 'none'}`;
306+
307+
this.messages.push({
308+
role: 'tool',
309+
tool_call_id: res.toolCallId || 'unknown',
310+
content,
311+
});
312+
}
313+
314+
// Compact context
315+
this.messages = compactMessages(this.messages, this.contextConfig);
316+
317+
// The loop continues — the next iteration will stream the LLM's
318+
// response to the tool results (which may include more tool calls).
319+
}
320+
321+
const limitMsg = 'Maximum number of tool call rounds reached.';
322+
yield `\n\n${limitMsg}`;
323+
return limitMsg;
324+
} finally {
325+
this.isRunning = false;
326+
}
259327
}
260328

261329
/**

0 commit comments

Comments
 (0)