Feat: run browser

This commit is contained in:
Grail Finder
2026-03-14 12:24:11 +03:00
parent 92acfb7ed4
commit f4fcb85570
2 changed files with 285 additions and 4 deletions

212
tools.go
View File

@@ -205,8 +205,7 @@ func updateToolCapabilities() {
if cfg == nil || cfg.CurrentAPI == "" { if cfg == nil || cfg.CurrentAPI == "" {
logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil") logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
registerWindowTools() registerWindowTools()
fnMap["browser_agent"] = runBrowserAgent // fnMap["browser_agent"] = runBrowserAgent
// registerPlaywrightTools()
return return
} }
prevHasVision := modelHasVision prevHasVision := modelHasVision
@@ -220,8 +219,7 @@ func updateToolCapabilities() {
} }
} }
registerWindowTools() registerWindowTools()
fnMap["browser_agent"] = runBrowserAgent // fnMap["browser_agent"] = runBrowserAgent
// registerPlaywrightTools()
} }
// getWebAgentClient returns a singleton AgentClient for web agents. // getWebAgentClient returns a singleton AgentClient for web agents.
@@ -511,6 +509,18 @@ func runCmd(args map[string]string) []byte {
case "todo": case "todo":
// todo create|read|update|delete - route to existing todo handlers // todo create|read|update|delete - route to existing todo handlers
return []byte(handleTodoSubcommand(rest, args)) return []byte(handleTodoSubcommand(rest, args))
case "window", "windows":
// window list - list all windows
return listWindows(args)
case "capture", "screenshot":
// capture <window-name> - capture a window
return captureWindow(args)
case "capture_and_view", "screenshot_and_view":
// capture and view screenshot
return captureWindowAndView(args)
case "browser":
// browser <action> [args...] - Playwright browser automation
return runBrowserCommand(rest, args)
default: default:
// Everything else: shell with pipe/chaining support // Everything else: shell with pipe/chaining support
result := tools.ExecChain(commandStr) result := tools.ExecChain(commandStr)
@@ -518,6 +528,136 @@ func runCmd(args map[string]string) []byte {
} }
} }
// runBrowserCommand routes a "browser <action> [args...]" command to the
// matching Playwright handler.
//
// args holds the words that follow "browser": args[0] is the action and the
// remaining elements are its positional parameters. originalArgs is the
// caller's argument map; it is forwarded unchanged to the handlers that take
// no positional parameters (start, stop, running, dom). Every other handler
// receives a fresh map built from the positional parameters.
//
// The result is whatever the selected handler returns, or a plain-text usage
// message when the action is missing, unknown, or lacks required parameters.
func runBrowserCommand(args []string, originalArgs map[string]string) []byte {
	if len(args) == 0 {
		return []byte(`usage: browser <action> [args...]
Actions:
start - start browser
stop - stop browser
running - check if browser is running
go <url> - navigate to URL
click <selector> - click element
fill <selector> <text> - fill input
text [selector] - extract text
html [selector] - get HTML
dom - get DOM
screenshot [path] - take screenshot
screenshot_and_view - take and view screenshot
wait <selector> - wait for element
drag <from> <to> - drag element`)
	}
	action, rest := args[0], args[1:]
	switch action {
	case "start":
		return pwStart(originalArgs)
	case "stop":
		return pwStop(originalArgs)
	case "running":
		return pwIsRunning(originalArgs)
	case "go", "navigate", "open":
		// browser go <url>
		url := firstBrowserArg(rest)
		if url == "" {
			return []byte("usage: browser go <url>")
		}
		return pwNavigate(map[string]string{"url": url})
	case "click":
		// browser click <selector> [index]
		selector := firstBrowserArg(rest)
		if selector == "" {
			return []byte("usage: browser click <selector> [index]")
		}
		// The index is passed through as given; "0" is used only when it is absent.
		index := "0"
		if len(rest) > 1 {
			index = rest[1]
		}
		return pwClick(map[string]string{"selector": selector, "index": index})
	case "fill":
		// browser fill <selector> <text>
		if len(rest) < 2 {
			return []byte("usage: browser fill <selector> <text>")
		}
		// Everything after the selector is the text, so multi-word input survives.
		return pwFill(map[string]string{"selector": rest[0], "text": strings.Join(rest[1:], " ")})
	case "text":
		// browser text [selector]
		return pwExtractText(map[string]string{"selector": firstBrowserArg(rest)})
	case "html":
		// browser html [selector]
		return pwGetHTML(map[string]string{"selector": firstBrowserArg(rest)})
	case "dom":
		return pwGetDOM(originalArgs)
	case "screenshot":
		// browser screenshot [path]
		return pwScreenshot(map[string]string{"path": firstBrowserArg(rest)})
	case "screenshot_and_view":
		// browser screenshot_and_view [path]
		return pwScreenshotAndView(map[string]string{"path": firstBrowserArg(rest)})
	case "wait":
		// browser wait <selector>
		selector := firstBrowserArg(rest)
		if selector == "" {
			return []byte("usage: browser wait <selector>")
		}
		return pwWaitForSelector(map[string]string{"selector": selector})
	case "drag":
		// browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>
		if len(rest) < 2 {
			return []byte("usage: browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>")
		}
		// A numeric first parameter selects the coordinate form. ParseFloat
		// accepts every string Atoi accepts, so one check covers both
		// integer and fractional coordinates.
		if _, err := strconv.ParseFloat(rest[0], 64); err == nil {
			// Coordinates: browser drag 100 200 300 400
			if len(rest) < 4 {
				return []byte("usage: browser drag <x1> <y1> <x2> <y2>")
			}
			return pwDrag(map[string]string{
				"x1": rest[0], "y1": rest[1],
				"x2": rest[2], "y2": rest[3],
			})
		}
		// Selectors: browser drag #item #container
		// pwDrag works on coordinates, so the selector form goes through a
		// separate handler.
		return pwDragBySelector(map[string]string{
			"fromSelector": rest[0],
			"toSelector":   rest[1],
		})
	default:
		return []byte(fmt.Sprintf("unknown browser action: %s", action))
	}
}

// firstBrowserArg returns the first positional parameter of a browser
// subcommand, or "" when none was supplied.
func firstBrowserArg(rest []string) string {
	if len(rest) == 0 {
		return ""
	}
	return rest[0]
}
// getHelp returns help text for commands // getHelp returns help text for commands
func getHelp(args []string) string { func getHelp(args []string) string {
if len(args) == 0 { if len(args) == 0 {
@@ -567,6 +707,25 @@ func getHelp(args []string) string {
todo update <id> <status> - update todo (pending/in_progress/completed) todo update <id> <status> - update todo (pending/in_progress/completed)
todo delete <id> - delete a todo todo delete <id> - delete a todo
# Window (requires xdotool + maim)
window - list available windows
capture <name> - capture a window screenshot
capture_and_view <name> - capture and view screenshot
# Browser (requires Playwright)
browser start - start browser
browser stop - stop browser
browser running - check if running
browser go <url> - navigate to URL
browser click <sel> - click element
browser fill <sel> <txt> - fill input
browser text [sel] - extract text
browser html [sel] - get HTML
browser screenshot - take screenshot
browser wait <sel> - wait for element
browser drag <x1> <y1> <x2> <y2> - drag by coordinates
browser drag <sel1> <sel2> - drag by selectors (center points)
# System # System
<any shell command> - run shell command directly <any shell command> - run shell command directly
@@ -675,6 +834,51 @@ Use: run "command" to execute.`
run "go test ./..." run "go test ./..."
run "go mod tidy" run "go mod tidy"
run "go get github.com/package"` run "go get github.com/package"`
case "window", "windows":
return `window
List available windows.
Requires: xdotool and maim
Example:
run "window"`
case "capture", "screenshot":
return `capture <window-name-or-id>
Capture a screenshot of a window.
Requires: xdotool and maim
Examples:
run "capture Firefox"
run "capture 0x12345678"
run "capture_and_view Firefox"`
case "capture_and_view":
return `capture_and_view <window-name-or-id>
Capture a window and return for viewing.
Requires: xdotool and maim
Examples:
run "capture_and_view Firefox"`
case "browser":
return `browser <action> [args]
Playwright browser automation.
Requires: Playwright browser server running
Actions:
start - start browser
stop - stop browser
running - check if browser is running
go <url> - navigate to URL
click <selector> - click element (use index for multiple: click #btn 1)
fill <selector> <text> - fill input field
text [selector] - extract text (from element or whole page)
html [selector] - get HTML (from element or whole page)
screenshot [path] - take screenshot
wait <selector> - wait for element to appear
drag <from> <to> - drag element to another element
Examples:
run "browser start"
run "browser go https://example.com"
run "browser click #submit-button"
run "browser fill #search-input hello"
run "browser text"
run "browser screenshot"
run "browser drag 100 200 300 400"
run "browser drag #item1 #container2"`
default: default:
return fmt.Sprintf("No help available for: %s. Use: run \"help\" for all commands.", cmd) return fmt.Sprintf("No help available for: %s. Use: run \"help\" for all commands.", cmd)
} }

View File

@@ -455,6 +455,83 @@ func pwDrag(args map[string]string) []byte {
return []byte(fmt.Sprintf(`{"success": true, "message": "Dragged from (%s,%s) to (%s,%s)"}`, x1, y1, x2, y2)) return []byte(fmt.Sprintf(`{"success": true, "message": "Dragged from (%s,%s) to (%s,%s)"}`, x1, y1, x2, y2))
} }
// pwDragBySelector drags the mouse from the center of one element to the
// center of another.
//
// args must contain non-empty "fromSelector" and "toSelector" entries. Each
// selector is resolved in the page with document.querySelector and the center
// of its bounding rectangle is used as the drag endpoint. The browser must
// already be started.
//
// The return value is a JSON-shaped byte slice: {"success": true, "message": ...}
// on success, or {"error": ...} when an argument is missing, the browser is
// not running, a selector matches nothing, or a mouse operation fails.
func pwDragBySelector(args map[string]string) []byte {
	fromSelector, ok := args["fromSelector"]
	if !ok || fromSelector == "" {
		return []byte(`{"error": "fromSelector not provided"}`)
	}
	toSelector, ok := args["toSelector"]
	if !ok || toSelector == "" {
		return []byte(`{"error": "toSelector not provided"}`)
	}
	if !browserStarted || page == nil {
		return []byte(`{"error": "Browser not started. Call pw_start first."}`)
	}

	// Get center coordinates of both elements using JavaScript
	fromResult, err := page.Evaluate(pwCenterScript(fromSelector))
	if err != nil {
		return []byte(fmt.Sprintf(`{"error": "failed to get from element: %s"}`, err.Error()))
	}
	fromX, fromY, ok := pwCenterFromResult(fromResult)
	if !ok {
		return []byte(fmt.Sprintf(`{"error": "from selector '%s' not found"}`, fromSelector))
	}
	toResult, err := page.Evaluate(pwCenterScript(toSelector))
	if err != nil {
		return []byte(fmt.Sprintf(`{"error": "failed to get to element: %s"}`, err.Error()))
	}
	toX, toY, ok := pwCenterFromResult(toResult)
	if !ok {
		return []byte(fmt.Sprintf(`{"error": "to selector '%s' not found"}`, toSelector))
	}

	// Perform the drag using coordinates
	mouse := page.Mouse()
	if err := mouse.Move(fromX, fromY); err != nil {
		return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
	}
	if err := mouse.Down(); err != nil {
		return []byte(fmt.Sprintf(`{"error": "failed to mouse down: %s"}`, err.Error()))
	}
	if err := mouse.Move(toX, toY); err != nil {
		// Best effort: release the button so a failed drag does not leave it
		// held down for later commands. The move error is the one reported.
		_ = mouse.Up()
		return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
	}
	if err := mouse.Up(); err != nil {
		return []byte(fmt.Sprintf(`{"error": "failed to mouse up: %s"}`, err.Error()))
	}
	msg := fmt.Sprintf("Dragged from %s (%.0f,%.0f) to %s (%.0f,%.0f)", fromSelector, fromX, fromY, toSelector, toX, toY)
	return []byte(fmt.Sprintf(`{"success": true, "message": "%s"}`, msg))
}

// pwCenterScript builds the JavaScript that evaluates to the {x, y} center of
// the first element matching selector, or null when nothing matches.
func pwCenterScript(selector string) string {
	return fmt.Sprintf(`
function getCenter(selector) {
const el = document.querySelector(selector);
if (!el) return null;
const rect = el.getBoundingClientRect();
return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
}
getCenter(%q)
`, selector)
}

// pwCenterFromResult extracts the x/y pair from an evaluated getCenter result.
// ok is false when the result is not an object (e.g. null for a missing
// element) or when either coordinate is absent or not numeric.
func pwCenterFromResult(result interface{}) (x, y float64, ok bool) {
	center, isMap := result.(map[string]interface{})
	if !isMap || center == nil {
		return 0, 0, false
	}
	x, okX := pwCoordToFloat(center["x"])
	y, okY := pwCoordToFloat(center["y"])
	if !okX || !okY {
		return 0, 0, false
	}
	return x, y, true
}

// pwCoordToFloat converts a numeric value returned from page evaluation to
// float64. A bare .(float64) assertion panics on any other dynamic type;
// NOTE(review): the browser binding may return whole-number results as int
// rather than float64 — confirm against the pinned library version.
func pwCoordToFloat(v interface{}) (float64, bool) {
	switch n := v.(type) {
	case float64:
		return n, true
	case float32:
		return float64(n), true
	case int:
		return float64(n), true
	case int32:
		return float64(n), true
	case int64:
		return float64(n), true
	}
	return 0, false
}
func pwClickAt(args map[string]string) []byte { func pwClickAt(args map[string]string) []byte {
x, ok := args["x"] x, ok := args["x"]
if !ok { if !ok {