Feat: run browser
This commit is contained in:
212
tools.go
212
tools.go
@@ -205,8 +205,7 @@ func updateToolCapabilities() {
|
|||||||
if cfg == nil || cfg.CurrentAPI == "" {
|
if cfg == nil || cfg.CurrentAPI == "" {
|
||||||
logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
|
logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
|
||||||
registerWindowTools()
|
registerWindowTools()
|
||||||
fnMap["browser_agent"] = runBrowserAgent
|
// fnMap["browser_agent"] = runBrowserAgent
|
||||||
// registerPlaywrightTools()
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
prevHasVision := modelHasVision
|
prevHasVision := modelHasVision
|
||||||
@@ -220,8 +219,7 @@ func updateToolCapabilities() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
registerWindowTools()
|
registerWindowTools()
|
||||||
fnMap["browser_agent"] = runBrowserAgent
|
// fnMap["browser_agent"] = runBrowserAgent
|
||||||
// registerPlaywrightTools()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// getWebAgentClient returns a singleton AgentClient for web agents.
|
// getWebAgentClient returns a singleton AgentClient for web agents.
|
||||||
@@ -511,6 +509,18 @@ func runCmd(args map[string]string) []byte {
|
|||||||
case "todo":
|
case "todo":
|
||||||
// todo create|read|update|delete - route to existing todo handlers
|
// todo create|read|update|delete - route to existing todo handlers
|
||||||
return []byte(handleTodoSubcommand(rest, args))
|
return []byte(handleTodoSubcommand(rest, args))
|
||||||
|
case "window", "windows":
|
||||||
|
// window list - list all windows
|
||||||
|
return listWindows(args)
|
||||||
|
case "capture", "screenshot":
|
||||||
|
// capture <window-name> - capture a window
|
||||||
|
return captureWindow(args)
|
||||||
|
case "capture_and_view", "screenshot_and_view":
|
||||||
|
// capture and view screenshot
|
||||||
|
return captureWindowAndView(args)
|
||||||
|
case "browser":
|
||||||
|
// browser <action> [args...] - Playwright browser automation
|
||||||
|
return runBrowserCommand(rest, args)
|
||||||
default:
|
default:
|
||||||
// Everything else: shell with pipe/chaining support
|
// Everything else: shell with pipe/chaining support
|
||||||
result := tools.ExecChain(commandStr)
|
result := tools.ExecChain(commandStr)
|
||||||
@@ -518,6 +528,136 @@ func runCmd(args map[string]string) []byte {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// runBrowserCommand routes browser subcommands to Playwright handlers
|
||||||
|
func runBrowserCommand(args []string, originalArgs map[string]string) []byte {
|
||||||
|
if len(args) == 0 {
|
||||||
|
return []byte(`usage: browser <action> [args...]
|
||||||
|
Actions:
|
||||||
|
start - start browser
|
||||||
|
stop - stop browser
|
||||||
|
running - check if browser is running
|
||||||
|
go <url> - navigate to URL
|
||||||
|
click <selector> - click element
|
||||||
|
fill <selector> <text> - fill input
|
||||||
|
text [selector] - extract text
|
||||||
|
html [selector] - get HTML
|
||||||
|
dom - get DOM
|
||||||
|
screenshot [path] - take screenshot
|
||||||
|
screenshot_and_view - take and view screenshot
|
||||||
|
wait <selector> - wait for element
|
||||||
|
drag <from> <to> - drag element`)
|
||||||
|
}
|
||||||
|
|
||||||
|
action := args[0]
|
||||||
|
rest := args[1:]
|
||||||
|
|
||||||
|
switch action {
|
||||||
|
case "start":
|
||||||
|
return pwStart(originalArgs)
|
||||||
|
case "stop":
|
||||||
|
return pwStop(originalArgs)
|
||||||
|
case "running":
|
||||||
|
return pwIsRunning(originalArgs)
|
||||||
|
case "go", "navigate", "open":
|
||||||
|
// browser go <url>
|
||||||
|
url := ""
|
||||||
|
if len(rest) > 0 {
|
||||||
|
url = rest[0]
|
||||||
|
}
|
||||||
|
if url == "" {
|
||||||
|
return []byte("usage: browser go <url>")
|
||||||
|
}
|
||||||
|
return pwNavigate(map[string]string{"url": url})
|
||||||
|
case "click":
|
||||||
|
// browser click <selector> [index]
|
||||||
|
selector := ""
|
||||||
|
index := "0"
|
||||||
|
if len(rest) > 0 {
|
||||||
|
selector = rest[0]
|
||||||
|
}
|
||||||
|
if len(rest) > 1 {
|
||||||
|
index = rest[1]
|
||||||
|
}
|
||||||
|
if selector == "" {
|
||||||
|
return []byte("usage: browser click <selector> [index]")
|
||||||
|
}
|
||||||
|
return pwClick(map[string]string{"selector": selector, "index": index})
|
||||||
|
case "fill":
|
||||||
|
// browser fill <selector> <text>
|
||||||
|
if len(rest) < 2 {
|
||||||
|
return []byte("usage: browser fill <selector> <text>")
|
||||||
|
}
|
||||||
|
return pwFill(map[string]string{"selector": rest[0], "text": strings.Join(rest[1:], " ")})
|
||||||
|
case "text":
|
||||||
|
// browser text [selector]
|
||||||
|
selector := ""
|
||||||
|
if len(rest) > 0 {
|
||||||
|
selector = rest[0]
|
||||||
|
}
|
||||||
|
return pwExtractText(map[string]string{"selector": selector})
|
||||||
|
case "html":
|
||||||
|
// browser html [selector]
|
||||||
|
selector := ""
|
||||||
|
if len(rest) > 0 {
|
||||||
|
selector = rest[0]
|
||||||
|
}
|
||||||
|
return pwGetHTML(map[string]string{"selector": selector})
|
||||||
|
case "dom":
|
||||||
|
return pwGetDOM(originalArgs)
|
||||||
|
case "screenshot":
|
||||||
|
// browser screenshot [path]
|
||||||
|
path := ""
|
||||||
|
if len(rest) > 0 {
|
||||||
|
path = rest[0]
|
||||||
|
}
|
||||||
|
return pwScreenshot(map[string]string{"path": path})
|
||||||
|
case "screenshot_and_view":
|
||||||
|
// browser screenshot_and_view [path]
|
||||||
|
path := ""
|
||||||
|
if len(rest) > 0 {
|
||||||
|
path = rest[0]
|
||||||
|
}
|
||||||
|
return pwScreenshotAndView(map[string]string{"path": path})
|
||||||
|
case "wait":
|
||||||
|
// browser wait <selector>
|
||||||
|
selector := ""
|
||||||
|
if len(rest) > 0 {
|
||||||
|
selector = rest[0]
|
||||||
|
}
|
||||||
|
if selector == "" {
|
||||||
|
return []byte("usage: browser wait <selector>")
|
||||||
|
}
|
||||||
|
return pwWaitForSelector(map[string]string{"selector": selector})
|
||||||
|
case "drag":
|
||||||
|
// browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>
|
||||||
|
if len(rest) < 4 && len(rest) < 2 {
|
||||||
|
return []byte("usage: browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>")
|
||||||
|
}
|
||||||
|
// Check if first arg is a number (coordinates) or selector
|
||||||
|
_, err := strconv.Atoi(rest[0])
|
||||||
|
_, err2 := strconv.ParseFloat(rest[0], 64)
|
||||||
|
if err == nil || err2 == nil {
|
||||||
|
// Coordinates: browser drag 100 200 300 400
|
||||||
|
if len(rest) < 4 {
|
||||||
|
return []byte("usage: browser drag <x1> <y1> <x2> <y2>")
|
||||||
|
}
|
||||||
|
return pwDrag(map[string]string{
|
||||||
|
"x1": rest[0], "y1": rest[1],
|
||||||
|
"x2": rest[2], "y2": rest[3],
|
||||||
|
})
|
||||||
|
}
|
||||||
|
// Selectors: browser drag #item #container
|
||||||
|
// pwDrag needs coordinates, so we need to get element positions first
|
||||||
|
// This requires a different approach - use JavaScript to get centers
|
||||||
|
return pwDragBySelector(map[string]string{
|
||||||
|
"fromSelector": rest[0],
|
||||||
|
"toSelector": rest[1],
|
||||||
|
})
|
||||||
|
default:
|
||||||
|
return []byte(fmt.Sprintf("unknown browser action: %s", action))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// getHelp returns help text for commands
|
// getHelp returns help text for commands
|
||||||
func getHelp(args []string) string {
|
func getHelp(args []string) string {
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
@@ -567,6 +707,25 @@ func getHelp(args []string) string {
|
|||||||
todo update <id> <status> - update todo (pending/in_progress/completed)
|
todo update <id> <status> - update todo (pending/in_progress/completed)
|
||||||
todo delete <id> - delete a todo
|
todo delete <id> - delete a todo
|
||||||
|
|
||||||
|
# Window (requires xdotool + maim)
|
||||||
|
window - list available windows
|
||||||
|
capture <name> - capture a window screenshot
|
||||||
|
capture_and_view <name> - capture and view screenshot
|
||||||
|
|
||||||
|
# Browser (requires Playwright)
|
||||||
|
browser start - start browser
|
||||||
|
browser stop - stop browser
|
||||||
|
browser running - check if running
|
||||||
|
browser go <url> - navigate to URL
|
||||||
|
browser click <sel> - click element
|
||||||
|
browser fill <sel> <txt> - fill input
|
||||||
|
browser text [sel] - extract text
|
||||||
|
browser html [sel] - get HTML
|
||||||
|
browser screenshot - take screenshot
|
||||||
|
browser wait <sel> - wait for element
|
||||||
|
browser drag <x1> <y1> <x2> <y2> - drag by coordinates
|
||||||
|
browser drag <sel1> <sel2> - drag by selectors (center points)
|
||||||
|
|
||||||
# System
|
# System
|
||||||
<any shell command> - run shell command directly
|
<any shell command> - run shell command directly
|
||||||
|
|
||||||
@@ -675,6 +834,51 @@ Use: run "command" to execute.`
|
|||||||
run "go test ./..."
|
run "go test ./..."
|
||||||
run "go mod tidy"
|
run "go mod tidy"
|
||||||
run "go get github.com/package"`
|
run "go get github.com/package"`
|
||||||
|
case "window", "windows":
|
||||||
|
return `window
|
||||||
|
List available windows.
|
||||||
|
Requires: xdotool and maim
|
||||||
|
Example:
|
||||||
|
run "window"`
|
||||||
|
case "capture", "screenshot":
|
||||||
|
return `capture <window-name-or-id>
|
||||||
|
Capture a screenshot of a window.
|
||||||
|
Requires: xdotool and maim
|
||||||
|
Examples:
|
||||||
|
run "capture Firefox"
|
||||||
|
run "capture 0x12345678"
|
||||||
|
run "capture_and_view Firefox"`
|
||||||
|
case "capture_and_view":
|
||||||
|
return `capture_and_view <window-name-or-id>
|
||||||
|
Capture a window and return for viewing.
|
||||||
|
Requires: xdotool and maim
|
||||||
|
Examples:
|
||||||
|
run "capture_and_view Firefox"`
|
||||||
|
case "browser":
|
||||||
|
return `browser <action> [args]
|
||||||
|
Playwright browser automation.
|
||||||
|
Requires: Playwright browser server running
|
||||||
|
Actions:
|
||||||
|
start - start browser
|
||||||
|
stop - stop browser
|
||||||
|
running - check if browser is running
|
||||||
|
go <url> - navigate to URL
|
||||||
|
click <selector> - click element (use index for multiple: click #btn 1)
|
||||||
|
fill <selector> <text> - fill input field
|
||||||
|
text [selector] - extract text (from element or whole page)
|
||||||
|
html [selector] - get HTML (from element or whole page)
|
||||||
|
screenshot [path] - take screenshot
|
||||||
|
wait <selector> - wait for element to appear
|
||||||
|
drag <from> <to> - drag element to another element
|
||||||
|
Examples:
|
||||||
|
run "browser start"
|
||||||
|
run "browser go https://example.com"
|
||||||
|
run "browser click #submit-button"
|
||||||
|
run "browser fill #search-input hello"
|
||||||
|
run "browser text"
|
||||||
|
run "browser screenshot"
|
||||||
|
run "browser drag 100 200 300 400"
|
||||||
|
run "browser drag #item1 #container2"`
|
||||||
default:
|
default:
|
||||||
return fmt.Sprintf("No help available for: %s. Use: run \"help\" for all commands.", cmd)
|
return fmt.Sprintf("No help available for: %s. Use: run \"help\" for all commands.", cmd)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -455,6 +455,83 @@ func pwDrag(args map[string]string) []byte {
|
|||||||
return []byte(fmt.Sprintf(`{"success": true, "message": "Dragged from (%s,%s) to (%s,%s)"}`, x1, y1, x2, y2))
|
return []byte(fmt.Sprintf(`{"success": true, "message": "Dragged from (%s,%s) to (%s,%s)"}`, x1, y1, x2, y2))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func pwDragBySelector(args map[string]string) []byte {
|
||||||
|
fromSelector, ok := args["fromSelector"]
|
||||||
|
if !ok || fromSelector == "" {
|
||||||
|
return []byte(`{"error": "fromSelector not provided"}`)
|
||||||
|
}
|
||||||
|
toSelector, ok := args["toSelector"]
|
||||||
|
if !ok || toSelector == "" {
|
||||||
|
return []byte(`{"error": "toSelector not provided"}`)
|
||||||
|
}
|
||||||
|
if !browserStarted || page == nil {
|
||||||
|
return []byte(`{"error": "Browser not started. Call pw_start first."}`)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get center coordinates of both elements using JavaScript
|
||||||
|
fromJS := fmt.Sprintf(`
|
||||||
|
function getCenter(selector) {
|
||||||
|
const el = document.querySelector(selector);
|
||||||
|
if (!el) return null;
|
||||||
|
const rect = el.getBoundingClientRect();
|
||||||
|
return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
|
||||||
|
}
|
||||||
|
getCenter(%q)
|
||||||
|
`, fromSelector)
|
||||||
|
toJS := fmt.Sprintf(`
|
||||||
|
function getCenter(selector) {
|
||||||
|
const el = document.querySelector(selector);
|
||||||
|
if (!el) return null;
|
||||||
|
const rect = el.getBoundingClientRect();
|
||||||
|
return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
|
||||||
|
}
|
||||||
|
getCenter(%q)
|
||||||
|
`, toSelector)
|
||||||
|
|
||||||
|
fromResult, err := page.Evaluate(fromJS)
|
||||||
|
if err != nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "failed to get from element: %s"}`, err.Error()))
|
||||||
|
}
|
||||||
|
fromMap, ok := fromResult.(map[string]interface{})
|
||||||
|
if !ok || fromMap == nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "from selector '%s' not found"}`, fromSelector))
|
||||||
|
}
|
||||||
|
fromX := fromMap["x"].(float64)
|
||||||
|
fromY := fromMap["y"].(float64)
|
||||||
|
|
||||||
|
toResult, err := page.Evaluate(toJS)
|
||||||
|
if err != nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "failed to get to element: %s"}`, err.Error()))
|
||||||
|
}
|
||||||
|
toMap, ok := toResult.(map[string]interface{})
|
||||||
|
if !ok || toMap == nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "to selector '%s' not found"}`, toSelector))
|
||||||
|
}
|
||||||
|
toX := toMap["x"].(float64)
|
||||||
|
toY := toMap["y"].(float64)
|
||||||
|
|
||||||
|
// Perform the drag using coordinates
|
||||||
|
mouse := page.Mouse()
|
||||||
|
err = mouse.Move(fromX, fromY)
|
||||||
|
if err != nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
|
||||||
|
}
|
||||||
|
err = mouse.Down()
|
||||||
|
if err != nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "failed to mouse down: %s"}`, err.Error()))
|
||||||
|
}
|
||||||
|
err = mouse.Move(toX, toY)
|
||||||
|
if err != nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
|
||||||
|
}
|
||||||
|
err = mouse.Up()
|
||||||
|
if err != nil {
|
||||||
|
return []byte(fmt.Sprintf(`{"error": "failed to mouse up: %s"}`, err.Error()))
|
||||||
|
}
|
||||||
|
msg := fmt.Sprintf("Dragged from %s (%.0f,%.0f) to %s (%.0f,%.0f)", fromSelector, fromX, fromY, toSelector, toX, toY)
|
||||||
|
return []byte(fmt.Sprintf(`{"success": true, "message": "%s"}`, msg))
|
||||||
|
}
|
||||||
|
|
||||||
func pwClickAt(args map[string]string) []byte {
|
func pwClickAt(args map[string]string) []byte {
|
||||||
x, ok := args["x"]
|
x, ok := args["x"]
|
||||||
if !ok {
|
if !ok {
|
||||||
|
|||||||
Reference in New Issue
Block a user