Feat: run browser
This commit is contained in:
212
tools.go
212
tools.go
@@ -205,8 +205,7 @@ func updateToolCapabilities() {
|
||||
if cfg == nil || cfg.CurrentAPI == "" {
|
||||
logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
|
||||
registerWindowTools()
|
||||
fnMap["browser_agent"] = runBrowserAgent
|
||||
// registerPlaywrightTools()
|
||||
// fnMap["browser_agent"] = runBrowserAgent
|
||||
return
|
||||
}
|
||||
prevHasVision := modelHasVision
|
||||
@@ -220,8 +219,7 @@ func updateToolCapabilities() {
|
||||
}
|
||||
}
|
||||
registerWindowTools()
|
||||
fnMap["browser_agent"] = runBrowserAgent
|
||||
// registerPlaywrightTools()
|
||||
// fnMap["browser_agent"] = runBrowserAgent
|
||||
}
|
||||
|
||||
// getWebAgentClient returns a singleton AgentClient for web agents.
|
||||
@@ -511,6 +509,18 @@ func runCmd(args map[string]string) []byte {
|
||||
case "todo":
|
||||
// todo create|read|update|delete - route to existing todo handlers
|
||||
return []byte(handleTodoSubcommand(rest, args))
|
||||
case "window", "windows":
|
||||
// window list - list all windows
|
||||
return listWindows(args)
|
||||
case "capture", "screenshot":
|
||||
// capture <window-name> - capture a window
|
||||
return captureWindow(args)
|
||||
case "capture_and_view", "screenshot_and_view":
|
||||
// capture and view screenshot
|
||||
return captureWindowAndView(args)
|
||||
case "browser":
|
||||
// browser <action> [args...] - Playwright browser automation
|
||||
return runBrowserCommand(rest, args)
|
||||
default:
|
||||
// Everything else: shell with pipe/chaining support
|
||||
result := tools.ExecChain(commandStr)
|
||||
@@ -518,6 +528,136 @@ func runCmd(args map[string]string) []byte {
|
||||
}
|
||||
}
|
||||
|
||||
// runBrowserCommand routes browser subcommands to Playwright handlers
|
||||
func runBrowserCommand(args []string, originalArgs map[string]string) []byte {
|
||||
if len(args) == 0 {
|
||||
return []byte(`usage: browser <action> [args...]
|
||||
Actions:
|
||||
start - start browser
|
||||
stop - stop browser
|
||||
running - check if browser is running
|
||||
go <url> - navigate to URL
|
||||
click <selector> - click element
|
||||
fill <selector> <text> - fill input
|
||||
text [selector] - extract text
|
||||
html [selector] - get HTML
|
||||
dom - get DOM
|
||||
screenshot [path] - take screenshot
|
||||
screenshot_and_view - take and view screenshot
|
||||
wait <selector> - wait for element
|
||||
drag <from> <to> - drag element`)
|
||||
}
|
||||
|
||||
action := args[0]
|
||||
rest := args[1:]
|
||||
|
||||
switch action {
|
||||
case "start":
|
||||
return pwStart(originalArgs)
|
||||
case "stop":
|
||||
return pwStop(originalArgs)
|
||||
case "running":
|
||||
return pwIsRunning(originalArgs)
|
||||
case "go", "navigate", "open":
|
||||
// browser go <url>
|
||||
url := ""
|
||||
if len(rest) > 0 {
|
||||
url = rest[0]
|
||||
}
|
||||
if url == "" {
|
||||
return []byte("usage: browser go <url>")
|
||||
}
|
||||
return pwNavigate(map[string]string{"url": url})
|
||||
case "click":
|
||||
// browser click <selector> [index]
|
||||
selector := ""
|
||||
index := "0"
|
||||
if len(rest) > 0 {
|
||||
selector = rest[0]
|
||||
}
|
||||
if len(rest) > 1 {
|
||||
index = rest[1]
|
||||
}
|
||||
if selector == "" {
|
||||
return []byte("usage: browser click <selector> [index]")
|
||||
}
|
||||
return pwClick(map[string]string{"selector": selector, "index": index})
|
||||
case "fill":
|
||||
// browser fill <selector> <text>
|
||||
if len(rest) < 2 {
|
||||
return []byte("usage: browser fill <selector> <text>")
|
||||
}
|
||||
return pwFill(map[string]string{"selector": rest[0], "text": strings.Join(rest[1:], " ")})
|
||||
case "text":
|
||||
// browser text [selector]
|
||||
selector := ""
|
||||
if len(rest) > 0 {
|
||||
selector = rest[0]
|
||||
}
|
||||
return pwExtractText(map[string]string{"selector": selector})
|
||||
case "html":
|
||||
// browser html [selector]
|
||||
selector := ""
|
||||
if len(rest) > 0 {
|
||||
selector = rest[0]
|
||||
}
|
||||
return pwGetHTML(map[string]string{"selector": selector})
|
||||
case "dom":
|
||||
return pwGetDOM(originalArgs)
|
||||
case "screenshot":
|
||||
// browser screenshot [path]
|
||||
path := ""
|
||||
if len(rest) > 0 {
|
||||
path = rest[0]
|
||||
}
|
||||
return pwScreenshot(map[string]string{"path": path})
|
||||
case "screenshot_and_view":
|
||||
// browser screenshot_and_view [path]
|
||||
path := ""
|
||||
if len(rest) > 0 {
|
||||
path = rest[0]
|
||||
}
|
||||
return pwScreenshotAndView(map[string]string{"path": path})
|
||||
case "wait":
|
||||
// browser wait <selector>
|
||||
selector := ""
|
||||
if len(rest) > 0 {
|
||||
selector = rest[0]
|
||||
}
|
||||
if selector == "" {
|
||||
return []byte("usage: browser wait <selector>")
|
||||
}
|
||||
return pwWaitForSelector(map[string]string{"selector": selector})
|
||||
case "drag":
|
||||
// browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>
|
||||
if len(rest) < 4 && len(rest) < 2 {
|
||||
return []byte("usage: browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>")
|
||||
}
|
||||
// Check if first arg is a number (coordinates) or selector
|
||||
_, err := strconv.Atoi(rest[0])
|
||||
_, err2 := strconv.ParseFloat(rest[0], 64)
|
||||
if err == nil || err2 == nil {
|
||||
// Coordinates: browser drag 100 200 300 400
|
||||
if len(rest) < 4 {
|
||||
return []byte("usage: browser drag <x1> <y1> <x2> <y2>")
|
||||
}
|
||||
return pwDrag(map[string]string{
|
||||
"x1": rest[0], "y1": rest[1],
|
||||
"x2": rest[2], "y2": rest[3],
|
||||
})
|
||||
}
|
||||
// Selectors: browser drag #item #container
|
||||
// pwDrag needs coordinates, so we need to get element positions first
|
||||
// This requires a different approach - use JavaScript to get centers
|
||||
return pwDragBySelector(map[string]string{
|
||||
"fromSelector": rest[0],
|
||||
"toSelector": rest[1],
|
||||
})
|
||||
default:
|
||||
return []byte(fmt.Sprintf("unknown browser action: %s", action))
|
||||
}
|
||||
}
|
||||
|
||||
// getHelp returns help text for commands
|
||||
func getHelp(args []string) string {
|
||||
if len(args) == 0 {
|
||||
@@ -567,6 +707,25 @@ func getHelp(args []string) string {
|
||||
todo update <id> <status> - update todo (pending/in_progress/completed)
|
||||
todo delete <id> - delete a todo
|
||||
|
||||
# Window (requires xdotool + maim)
|
||||
window - list available windows
|
||||
capture <name> - capture a window screenshot
|
||||
capture_and_view <name> - capture and view screenshot
|
||||
|
||||
# Browser (requires Playwright)
|
||||
browser start - start browser
|
||||
browser stop - stop browser
|
||||
browser running - check if running
|
||||
browser go <url> - navigate to URL
|
||||
browser click <sel> - click element
|
||||
browser fill <sel> <txt> - fill input
|
||||
browser text [sel] - extract text
|
||||
browser html [sel] - get HTML
|
||||
browser screenshot - take screenshot
|
||||
browser wait <sel> - wait for element
|
||||
browser drag <x1> <y1> <x2> <y2> - drag by coordinates
|
||||
browser drag <sel1> <sel2> - drag by selectors (center points)
|
||||
|
||||
# System
|
||||
<any shell command> - run shell command directly
|
||||
|
||||
@@ -675,6 +834,51 @@ Use: run "command" to execute.`
|
||||
run "go test ./..."
|
||||
run "go mod tidy"
|
||||
run "go get github.com/package"`
|
||||
case "window", "windows":
|
||||
return `window
|
||||
List available windows.
|
||||
Requires: xdotool and maim
|
||||
Example:
|
||||
run "window"`
|
||||
case "capture", "screenshot":
|
||||
return `capture <window-name-or-id>
|
||||
Capture a screenshot of a window.
|
||||
Requires: xdotool and maim
|
||||
Examples:
|
||||
run "capture Firefox"
|
||||
run "capture 0x12345678"
|
||||
run "capture_and_view Firefox"`
|
||||
case "capture_and_view":
|
||||
return `capture_and_view <window-name-or-id>
|
||||
Capture a window and return for viewing.
|
||||
Requires: xdotool and maim
|
||||
Examples:
|
||||
run "capture_and_view Firefox"`
|
||||
case "browser":
|
||||
return `browser <action> [args]
|
||||
Playwright browser automation.
|
||||
Requires: Playwright browser server running
|
||||
Actions:
|
||||
start - start browser
|
||||
stop - stop browser
|
||||
running - check if browser is running
|
||||
go <url> - navigate to URL
|
||||
click <selector> - click element (use index for multiple: click #btn 1)
|
||||
fill <selector> <text> - fill input field
|
||||
text [selector] - extract text (from element or whole page)
|
||||
html [selector] - get HTML (from element or whole page)
|
||||
screenshot [path] - take screenshot
|
||||
wait <selector> - wait for element to appear
|
||||
drag <from> <to> - drag element to another element
|
||||
Examples:
|
||||
run "browser start"
|
||||
run "browser go https://example.com"
|
||||
run "browser click #submit-button"
|
||||
run "browser fill #search-input hello"
|
||||
run "browser text"
|
||||
run "browser screenshot"
|
||||
run "browser drag 100 200 300 400"
|
||||
run "browser drag #item1 #container2"`
|
||||
default:
|
||||
return fmt.Sprintf("No help available for: %s. Use: run \"help\" for all commands.", cmd)
|
||||
}
|
||||
|
||||
@@ -455,6 +455,83 @@ func pwDrag(args map[string]string) []byte {
|
||||
return []byte(fmt.Sprintf(`{"success": true, "message": "Dragged from (%s,%s) to (%s,%s)"}`, x1, y1, x2, y2))
|
||||
}
|
||||
|
||||
func pwDragBySelector(args map[string]string) []byte {
|
||||
fromSelector, ok := args["fromSelector"]
|
||||
if !ok || fromSelector == "" {
|
||||
return []byte(`{"error": "fromSelector not provided"}`)
|
||||
}
|
||||
toSelector, ok := args["toSelector"]
|
||||
if !ok || toSelector == "" {
|
||||
return []byte(`{"error": "toSelector not provided"}`)
|
||||
}
|
||||
if !browserStarted || page == nil {
|
||||
return []byte(`{"error": "Browser not started. Call pw_start first."}`)
|
||||
}
|
||||
|
||||
// Get center coordinates of both elements using JavaScript
|
||||
fromJS := fmt.Sprintf(`
|
||||
function getCenter(selector) {
|
||||
const el = document.querySelector(selector);
|
||||
if (!el) return null;
|
||||
const rect = el.getBoundingClientRect();
|
||||
return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
|
||||
}
|
||||
getCenter(%q)
|
||||
`, fromSelector)
|
||||
toJS := fmt.Sprintf(`
|
||||
function getCenter(selector) {
|
||||
const el = document.querySelector(selector);
|
||||
if (!el) return null;
|
||||
const rect = el.getBoundingClientRect();
|
||||
return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
|
||||
}
|
||||
getCenter(%q)
|
||||
`, toSelector)
|
||||
|
||||
fromResult, err := page.Evaluate(fromJS)
|
||||
if err != nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "failed to get from element: %s"}`, err.Error()))
|
||||
}
|
||||
fromMap, ok := fromResult.(map[string]interface{})
|
||||
if !ok || fromMap == nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "from selector '%s' not found"}`, fromSelector))
|
||||
}
|
||||
fromX := fromMap["x"].(float64)
|
||||
fromY := fromMap["y"].(float64)
|
||||
|
||||
toResult, err := page.Evaluate(toJS)
|
||||
if err != nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "failed to get to element: %s"}`, err.Error()))
|
||||
}
|
||||
toMap, ok := toResult.(map[string]interface{})
|
||||
if !ok || toMap == nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "to selector '%s' not found"}`, toSelector))
|
||||
}
|
||||
toX := toMap["x"].(float64)
|
||||
toY := toMap["y"].(float64)
|
||||
|
||||
// Perform the drag using coordinates
|
||||
mouse := page.Mouse()
|
||||
err = mouse.Move(fromX, fromY)
|
||||
if err != nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
|
||||
}
|
||||
err = mouse.Down()
|
||||
if err != nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "failed to mouse down: %s"}`, err.Error()))
|
||||
}
|
||||
err = mouse.Move(toX, toY)
|
||||
if err != nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
|
||||
}
|
||||
err = mouse.Up()
|
||||
if err != nil {
|
||||
return []byte(fmt.Sprintf(`{"error": "failed to mouse up: %s"}`, err.Error()))
|
||||
}
|
||||
msg := fmt.Sprintf("Dragged from %s (%.0f,%.0f) to %s (%.0f,%.0f)", fromSelector, fromX, fromY, toSelector, toX, toY)
|
||||
return []byte(fmt.Sprintf(`{"success": true, "message": "%s"}`, msg))
|
||||
}
|
||||
|
||||
func pwClickAt(args map[string]string) []byte {
|
||||
x, ok := args["x"]
|
||||
if !ok {
|
||||
|
||||
Reference in New Issue
Block a user