Enha: pw agent

2026-03-09 08:50:33 +03:00
parent 94769225cf
commit c2c90f6d2b
5 changed files with 536 additions and 10 deletions
--- a/agent/pw_agent.go
+++ b/agent/pw_agent.go
@@ -1,5 +1,11 @@
 package agent
 import (
 	"encoding/json"
 	"gf-lt/models"
 	"strings"
 )
 // PWAgent is an AgenterA-type agent (enclosed, with tool chaining).
 // Its sysprompt explains the tools and how to plan their execution.
 type PWAgent struct {
@@ -7,11 +13,16 @@ type PWAgent struct {
 	sysprompt string
 }
-// NewWebAgentB creates a WebAgentB that uses the given formatting function
+// NewPWAgent creates a PWAgent with the given client and system prompt
 func NewPWAgent(client *AgentClient, sysprompt string) *PWAgent {
 	// Build the agent around its prompt first, then attach the shared client.
 	pw := &PWAgent{sysprompt: sysprompt}
 	pw.AgentClient = client
 	return pw
 }
 // SetTools sets the tools available to the agent.
 // The slice is stored as given, without copying, so the agent and the caller
 // share the same backing array afterwards.
 func (a *PWAgent) SetTools(tools []models.Tool) {
 	a.tools = tools
 }
 func (a *PWAgent) ProcessTask(task string) []byte {
 	req, err := a.FormFirstMsg(a.sysprompt, task)
 	if err != nil {
@@ -25,16 +36,91 @@ func (a *PWAgent) ProcessTask(task string) []byte {
 			a.Log().Error("failed to process the request", "error", err)
 			return []byte("failed to process the request; err: " + err.Error())
 		}
-		toolCall, hasToolCall := findToolCall(resp)
+		execTool, toolCallID, hasToolCall := findToolCall(resp)
 		if !hasToolCall {
 			return resp
 		}
-		// check resp for tool calls
+
-		// make tool call
+		a.setToolCallOnLastMessage(resp, toolCallID)
-		// add tool call resp to body
+
-		// send new request too lmm
+		toolResp := string(execTool())
-		tooResp := toolCall(resp)
+		req, err = a.FormMsgWithToolCallID(toolResp, toolCallID)
-		req, err = a.FormMsg(toolResp)
+		if err != nil {
 			a.Log().Error("failed to form next message", "error", err)
 			return []byte("failed to form next message; err: " + err.Error())
 		}
 	}
 	return nil
 }
 // setToolCallOnLastMessage records the tool call detected in resp on the last
 // message of the chat body, so that message carries the same ID as the tool
 // result message appended afterwards.
 //
 // resp is the raw LLM response and toolCallID is the ID findToolCall returned
 // for it. Nothing is changed when toolCallID is empty or the chat body holds
 // no messages. ToolCallID is always set; ToolCall is only set when a tool
 // name could be determined from resp.
 func (a *PWAgent) setToolCallOnLastMessage(resp []byte, toolCallID string) {
 	if toolCallID == "" {
 		return
 	}
 	lastIdx := len(a.chatBody.Messages) - 1
 	if lastIdx < 0 {
 		return
 	}
 	name, args := toolCallNameAndArgs(resp)
 	a.chatBody.Messages[lastIdx].ToolCallID = toolCallID
 	if name == "" {
 		return
 	}
 	if args == nil {
 		// A nil map marshals to "null"; record an empty argument object instead.
 		args = map[string]string{}
 	}
 	argsJSON, err := json.Marshal(args)
 	if err != nil {
 		// Not expected for a string map; fall back to an empty object.
 		argsJSON = []byte("{}")
 	}
 	a.chatBody.Messages[lastIdx].ToolCall = &models.ToolCall{
 		ID:   toolCallID,
 		Name: name,
 		Args: string(argsJSON),
 	}
 }
 // toolCallNameAndArgs extracts the tool name and, for structured calls, the
 // decoded arguments from an LLM response.
 //
 // It looks in the same places as findToolCall: the first entry of
 // choices[0].message.tool_calls, then the first text field present
 // (choices[0].message.content, choices[0].text, top-level content), and
 // finally the raw response. The name is "" when nothing is found.
 func toolCallNameAndArgs(resp []byte) (string, map[string]string) {
 	var genericResp map[string]interface{}
 	if err := json.Unmarshal(resp, &genericResp); err != nil {
 		// Not JSON: findToolCall scans the whole response as model text.
 		return extractToolNameFromText(string(resp)), nil
 	}
 	// Reading from a nil map is safe, so a failed assertion just leaves the
 	// following lookups empty.
 	var firstChoice map[string]interface{}
 	if choices, ok := genericResp["choices"].([]interface{}); ok && len(choices) > 0 {
 		firstChoice, _ = choices[0].(map[string]interface{})
 	}
 	message, _ := firstChoice["message"].(map[string]interface{})
 	var args map[string]string
 	if toolCalls, ok := message["tool_calls"].([]interface{}); ok && len(toolCalls) > 0 {
 		tc, _ := toolCalls[0].(map[string]interface{})
 		fn, _ := tc["function"].(map[string]interface{})
 		name, _ := fn["name"].(string)
 		argsStr, _ := fn["arguments"].(string)
 		// Best effort: on a decode error args keeps whatever was decoded and
 		// the call is still recorded under its name.
 		_ = json.Unmarshal([]byte(argsStr), &args)
 		if name != "" {
 			return name, args
 		}
 	}
 	text := string(resp)
 	if msgContent, ok := message["content"].(string); ok {
 		text = msgContent
 	} else if choiceText, ok := firstChoice["text"].(string); ok {
 		text = choiceText
 	} else if topContent, ok := genericResp["content"].(string); ok {
 		text = topContent
 	}
 	return extractToolNameFromText(text), args
 }
 // extractToolNameFromText returns the name of the function call embedded in
 // text between __tool_call__ markers. It returns "" when toolCallRE finds no
 // match, when no {...} object sits between the markers, or when that object
 // does not decode into a models.FuncCall.
 func extractToolNameFromText(text string) string {
 	const marker = "__tool_call__"
 	raw := toolCallRE.FindString(text)
 	if raw == "" {
 		return ""
 	}
 	// Strip the surrounding markers and any padding around the payload.
 	payload := strings.TrimSpace(raw)
 	payload = strings.TrimSuffix(strings.TrimPrefix(payload, marker), marker)
 	payload = strings.TrimSpace(payload)
 	// Keep only the outermost {...} span.
 	open := strings.IndexByte(payload, '{')
 	closing := strings.LastIndexByte(payload, '}')
 	if open < 0 || closing <= open {
 		return ""
 	}
 	var call models.FuncCall
 	if json.Unmarshal([]byte(payload[open:closing+1]), &call) != nil {
 		return ""
 	}
 	return call.Name
 }
--- a/agent/pw_tools.go
+++ b/agent/pw_tools.go
@@ -0,0 +1,349 @@
 package agent
 import (
 	"encoding/json"
 	"fmt"
 	"regexp"
 	"strings"
 	"sync/atomic"
 
 	"gf-lt/models"
 )
 // ToolFunc is the signature of a tool handler: it receives the call arguments
 // as a string-to-string map and returns the tool's output bytes.
 type ToolFunc func(map[string]string) []byte
 // pwToolMap maps tool names to their handlers; it is filled by RegisterPWTool.
 var pwToolMap = make(map[string]ToolFunc)
 // RegisterPWTool registers fn as the handler for the tool called name,
 // replacing any handler previously stored under that name.
 // The underlying map is written without synchronization.
 func RegisterPWTool(name string, fn ToolFunc) {
 	pwToolMap[name] = fn
 }
 // GetPWTools returns the pw_* tool definitions.
 // The package-level slice itself is returned, not a copy.
 func GetPWTools() []models.Tool {
 	return pwTools
 }
 // pwTools is the list of tool definitions returned by GetPWTools. Each entry
 // describes one pw_* function: its name, a description, and a parameter
 // schema (property types plus the required property names). The handlers for
 // these names are registered separately through RegisterPWTool.
 var pwTools = []models.Tool{
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_start",
 			Description: "Start a Playwright browser instance. Must be called first before any other browser automation. Uses headless mode by default.",
 			Parameters: models.ToolFuncParams{
 				Type:       "object",
 				Required:   []string{},
 				Properties: map[string]models.ToolArgProps{},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_stop",
 			Description: "Stop the Playwright browser instance. Call when done with browser automation.",
 			Parameters: models.ToolFuncParams{
 				Type:       "object",
 				Required:   []string{},
 				Properties: map[string]models.ToolArgProps{},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_is_running",
 			Description: "Check if Playwright browser is currently running.",
 			Parameters: models.ToolFuncParams{
 				Type:       "object",
 				Required:   []string{},
 				Properties: map[string]models.ToolArgProps{},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_navigate",
 			Description: "Navigate to a URL in the browser.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{"url"},
 				Properties: map[string]models.ToolArgProps{
 					"url": {Type: "string", Description: "URL to navigate to"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_click",
 			Description: "Click on an element on the current webpage. Use 'index' for multiple matches (default 0).",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{"selector"},
 				Properties: map[string]models.ToolArgProps{
 					"selector": {Type: "string", Description: "CSS selector for the element"},
 					"index":    {Type: "integer", Description: "Index for multiple matches (default 0)"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_fill",
 			Description: "Type text into an input field. Use 'index' for multiple matches (default 0).",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{"selector", "text"},
 				Properties: map[string]models.ToolArgProps{
 					"selector": {Type: "string", Description: "CSS selector for the input element"},
 					"text":     {Type: "string", Description: "Text to type into the field"},
 					"index":    {Type: "integer", Description: "Index for multiple matches (default 0)"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_extract_text",
 			Description: "Extract text content from the page or specific elements. Use selector 'body' for all page text.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{},
 				Properties: map[string]models.ToolArgProps{
 					"selector": {Type: "string", Description: "CSS selector (default 'body' for all page text)"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_screenshot",
 			Description: "Take a screenshot of the page or a specific element. Returns a file path to the image.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{},
 				Properties: map[string]models.ToolArgProps{
 					"selector":  {Type: "string", Description: "CSS selector for element to screenshot"},
 					"full_page": {Type: "boolean", Description: "Capture full page (default false)"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_screenshot_and_view",
 			Description: "Take a screenshot and return the image for viewing. Use to visually verify page state.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{},
 				Properties: map[string]models.ToolArgProps{
 					"selector":  {Type: "string", Description: "CSS selector for element to screenshot"},
 					"full_page": {Type: "boolean", Description: "Capture full page (default false)"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_wait_for_selector",
 			Description: "Wait for an element to appear on the page before proceeding.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{"selector"},
 				Properties: map[string]models.ToolArgProps{
 					"selector": {Type: "string", Description: "CSS selector to wait for"},
 					"timeout":  {Type: "integer", Description: "Timeout in milliseconds (default 30000)"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_drag",
 			Description: "Drag the mouse from point (x1,y1) to (x2,y2).",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{"x1", "y1", "x2", "y2"},
 				Properties: map[string]models.ToolArgProps{
 					"x1": {Type: "number", Description: "Starting X coordinate"},
 					"y1": {Type: "number", Description: "Starting Y coordinate"},
 					"x2": {Type: "number", Description: "Ending X coordinate"},
 					"y2": {Type: "number", Description: "Ending Y coordinate"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_click_at",
 			Description: "Click at specific X,Y coordinates on the page.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{"x", "y"},
 				Properties: map[string]models.ToolArgProps{
 					"x": {Type: "number", Description: "X coordinate"},
 					"y": {Type: "number", Description: "Y coordinate"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_get_html",
 			Description: "Get the HTML content of the page or a specific element.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{},
 				Properties: map[string]models.ToolArgProps{
 					"selector": {Type: "string", Description: "CSS selector (default 'body')"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_get_dom",
 			Description: "Get a structured DOM representation with tag, attributes, text, and children.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{},
 				Properties: map[string]models.ToolArgProps{
 					"selector": {Type: "string", Description: "CSS selector (default 'body')"},
 				},
 			},
 		},
 	},
 	{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "pw_search_elements",
 			Description: "Search for elements by text content or CSS selector.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{},
 				Properties: map[string]models.ToolArgProps{
 					"text":     {Type: "string", Description: "Text content to search for"},
 					"selector": {Type: "string", Description: "CSS selector to search for"},
 				},
 			},
 		},
 	},
 }
 // toolCallRE matches a text-format tool call: the shortest run of characters
 // enclosed by a pair of __tool_call__ markers. The (?s) flag lets "." match
 // newlines, so a call whose JSON payload spans several lines is still found.
 var toolCallRE = regexp.MustCompile(`(?s)__tool_call__(.+?)__tool_call__`)
 // ParsedToolCall holds the ID, tool name and string-valued arguments of a
 // single tool call.
 // NOTE(review): nothing in the visible part of this file references this
 // type — confirm it is used elsewhere before relying on it.
 type ParsedToolCall struct {
 	ID   string
 	Name string
 	Args map[string]string
 }
 // findToolCall inspects an LLM response for a tool call.
 //
 // It returns a closure that executes the call, the tool call ID, and whether
 // a call was found. A non-empty choices[0].message.tool_calls array takes
 // precedence; otherwise the first text field present
 // (choices[0].message.content, choices[0].text, top-level content) is handed
 // to findToolCallFromText. A response that is not JSON, or that has none of
 // these fields, is scanned as raw text.
 func findToolCall(resp []byte) (func() []byte, string, bool) {
 	var parsed map[string]interface{}
 	if json.Unmarshal(resp, &parsed) != nil {
 		return findToolCallFromText(string(resp))
 	}
 	// Reading from a nil map yields the zero value, so each failed assertion
 	// below simply leaves the next lookup empty.
 	var choice map[string]interface{}
 	if list, ok := parsed["choices"].([]interface{}); ok && len(list) > 0 {
 		choice, _ = list[0].(map[string]interface{})
 	}
 	msg, _ := choice["message"].(map[string]interface{})
 	if calls, ok := msg["tool_calls"].([]interface{}); ok && len(calls) > 0 {
 		return parseOpenAIToolCall(calls)
 	}
 	if s, ok := msg["content"].(string); ok {
 		return findToolCallFromText(s)
 	}
 	if s, ok := choice["text"].(string); ok {
 		return findToolCallFromText(s)
 	}
 	if s, ok := parsed["content"].(string); ok {
 		return findToolCallFromText(s)
 	}
 	return findToolCallFromText(string(resp))
 }
 // parseOpenAIToolCall turns the first entry of an OpenAI-style tool_calls
 // array into an executable call.
 //
 // It returns a closure that runs the tool, the tool call ID, and whether a
 // call was present (false only for an empty array). A malformed entry,
 // undecodable arguments or an unknown tool name do not abort: the closure
 // then returns a JSON object with an "error" field.
 //
 // Handlers take map[string]string, while the tool schemas also declare
 // integer, number and boolean parameters. String values are passed through
 // unquoted, null becomes "", and every other value is passed as its JSON
 // text (1 -> "1", true -> "true").
 func parseOpenAIToolCall(toolCalls []interface{}) (func() []byte, string, bool) {
 	if len(toolCalls) == 0 {
 		return nil, "", false
 	}
 	errResult := func(msg string) func() []byte {
 		return func() []byte {
 			// Marshalling escapes msg so it cannot break the JSON; encoding
 			// a string map does not fail, hence the ignored error.
 			b, _ := json.Marshal(map[string]string{"error": msg})
 			return b
 		}
 	}
 	// Checked assertion: an unexpected element type must not panic.
 	tc, ok := toolCalls[0].(map[string]interface{})
 	if !ok {
 		return errResult("malformed tool call"), "", true
 	}
 	id, _ := tc["id"].(string)
 	function, _ := tc["function"].(map[string]interface{})
 	name, _ := function["name"].(string)
 	argsStr, _ := function["arguments"].(string)
 	var rawArgs map[string]json.RawMessage
 	// An empty arguments string means "no arguments", not a parse error.
 	if argsStr != "" {
 		if err := json.Unmarshal([]byte(argsStr), &rawArgs); err != nil {
 			return errResult(fmt.Sprintf("failed to parse arguments: %v", err)), id, true
 		}
 	}
 	args := make(map[string]string, len(rawArgs))
 	for key, val := range rawArgs {
 		var s string
 		if err := json.Unmarshal(val, &s); err == nil {
 			// JSON string (or null, which leaves s empty).
 			args[key] = s
 			continue
 		}
 		// Number, boolean, object or array: keep the JSON text.
 		args[key] = string(val)
 	}
 	return func() []byte {
 		fn, ok := pwToolMap[name]
 		if !ok {
 			return errResult(fmt.Sprintf("tool %s not found", name))()
 		}
 		return fn(args)
 	}, id, true
 }
 // findToolCallFromText looks for a text-format tool call: a JSON object
 // between a pair of __tool_call__ markers.
 //
 // It returns a closure that runs the tool, the tool call ID, and whether a
 // marker pair was found. When the markers are present but the payload has no
 // {...} object or does not decode, the closure returns a JSON object with an
 // "error" field and the ID is "". A call without an ID gets a generated one
 // prefixed with "call_".
 func findToolCallFromText(text string) (func() []byte, string, bool) {
 	errResult := func(msg string) func() []byte {
 		return func() []byte {
 			// Marshalling escapes msg so it cannot break the JSON; encoding
 			// a string map does not fail, hence the ignored error.
 			b, _ := json.Marshal(map[string]string{"error": msg})
 			return b
 		}
 	}
 	jsStr := toolCallRE.FindString(text)
 	if jsStr == "" {
 		return nil, "", false
 	}
 	// Strip the markers and surrounding whitespace.
 	jsStr = strings.TrimSpace(jsStr)
 	jsStr = strings.TrimPrefix(jsStr, "__tool_call__")
 	jsStr = strings.TrimSuffix(jsStr, "__tool_call__")
 	jsStr = strings.TrimSpace(jsStr)
 	// Keep only the outermost {...} span.
 	start := strings.Index(jsStr, "{")
 	end := strings.LastIndex(jsStr, "}")
 	if start == -1 || end <= start {
 		return errResult("no valid JSON found in tool call"), "", true
 	}
 	var fc models.FuncCall
 	if err := json.Unmarshal([]byte(jsStr[start:end+1]), &fc); err != nil {
 		return errResult(fmt.Sprintf("failed to parse tool call: %v", err)), "", true
 	}
 	if fc.ID == "" {
 		fc.ID = "call_" + generateToolCallID()
 	}
 	return func() []byte {
 		fn, ok := pwToolMap[fc.Name]
 		if !ok {
 			return errResult(fmt.Sprintf("tool %s not found", fc.Name))()
 		}
 		return fn(fc.Args)
 	}, fc.ID, true
 }
 // toolCallIDSeq counts the IDs handed out by generateToolCallID.
 var toolCallIDSeq uint64
 // generateToolCallID returns a decimal ID that is unique within the process.
 // Each call atomically increments a counter, so successive and concurrent
 // calls never receive the same value.
 func generateToolCallID() string {
 	return fmt.Sprintf("%d", atomic.AddUint64(&toolCallIDSeq, 1))
 }
--- a/agent/request.go
+++ b/agent/request.go
@@ -80,6 +80,20 @@ func (ag *AgentClient) FormMsg(msg string) (io.Reader, error) {
 	return bytes.NewReader(b), nil
 }
 // FormMsgWithToolCallID appends msg to the chat body as a "tool" message
 // tagged with toolCallID and returns the next request body as a reader.
 // The message stays appended even when building the request fails.
 func (ag *AgentClient) FormMsgWithToolCallID(msg, toolCallID string) (io.Reader, error) {
 	ag.chatBody.Messages = append(ag.chatBody.Messages, models.RoleMsg{
 		Role:       "tool",
 		Content:    msg,
 		ToolCallID: toolCallID,
 	})
 	payload, err := ag.buildRequest()
 	if err != nil {
 		return nil, err
 	}
 	return bytes.NewReader(payload), nil
 }
 // buildRequest creates the appropriate LLM request based on the current API endpoint.
 func (ag *AgentClient) buildRequest() ([]byte, error) {
 	isCompletion, isChat, isDeepSeek, isOpenRouter := detectAPI(ag.cfg.CurrentAPI)
--- a/agent/webagent.go
+++ b/agent/webagent.go
@@ -17,8 +17,8 @@ func NewWebAgentB(client *AgentClient, sysprompt string) *WebAgentB {
 // Process applies the formatting function to raw output
 func (a *WebAgentB) Process(args map[string]string, rawOutput []byte) []byte {
-	msg, err := a.FormMsg(a.sysprompt,
+	msg, err := a.FormMsg(
-		fmt.Sprintf("request:\n%+v\ntool response:\n%v", args, string(rawOutput)))
+		fmt.Sprintf("%s\n\nrequest:\n%+v\ntool response:\n%v", a.sysprompt, args, string(rawOutput)))
 	if err != nil {
 		a.Log().Error("failed to process the request", "error", err)
 		return []byte("failed to process the request; err: " + err.Error())
--- a/tools.go
+++ b/tools.go
@@ -1491,6 +1491,47 @@ func registerWindowTools() {
 	}
 }
 // browserAgentSysPrompt is the system prompt handed to the PW agent by
 // runBrowserAgent. It lists the pw_* tools by name and outlines the expected
 // start / navigate / interact / extract / stop workflow.
 var browserAgentSysPrompt = `You are an autonomous browser automation agent. Your goal is to complete the user's task by intelligently using browser automation tools.
 Available tools:
 - pw_start: Start browser (must call first)
 - pw_stop: Stop browser (call when done)
 - pw_is_running: Check if browser is running
 - pw_navigate: Go to a URL
 - pw_click: Click an element by CSS selector
 - pw_fill: Type text into an input
 - pw_extract_text: Get text from page/element
 - pw_screenshot: Take a screenshot (returns file path)
 - pw_screenshot_and_view: Take screenshot with image for viewing
 - pw_wait_for_selector: Wait for element to appear
 - pw_drag: Drag mouse from one point to another
 - pw_click_at: Click at X,Y coordinates
 - pw_get_html: Get HTML content
 - pw_get_dom: Get structured DOM tree
 - pw_search_elements: Search for elements by text or selector
 Workflow:
 1. Start browser if not running (pw_start)
 2. Navigate to required pages (pw_navigate)
 3. Interact with elements as needed (click, fill, etc.)
 4. Extract information or take screenshots as requested
 5. Stop browser when done (pw_stop)
 Always provide clear feedback about what you're doing and what you found.`
 func runBrowserAgent(args map[string]string) []byte {
 	task, ok := args["task"]
 	if !ok || task == "" {
 		return []byte(`{"error": "task argument is required"}`)
 	}
 	client := getWebAgentClient()
 	pwAgent := agent.NewPWAgent(client, browserAgentSysPrompt)
 	pwAgent.SetTools(agent.GetPWTools())
 	return pwAgent.ProcessTask(task)
 }
 func registerPlaywrightTools() {
 	removePlaywrightToolsFromBaseTools()
 	if cfg != nil && cfg.PlaywrightEnabled {
@@ -1776,6 +1817,42 @@ func registerPlaywrightTools() {
 		}
 		baseTools = append(baseTools, playwrightTools...)
 		toolSysMsg += browserToolSysMsg
 		agent.RegisterPWTool("pw_start", pwStart)
 		agent.RegisterPWTool("pw_stop", pwStop)
 		agent.RegisterPWTool("pw_is_running", pwIsRunning)
 		agent.RegisterPWTool("pw_navigate", pwNavigate)
 		agent.RegisterPWTool("pw_click", pwClick)
 		agent.RegisterPWTool("pw_click_at", pwClickAt)
 		agent.RegisterPWTool("pw_fill", pwFill)
 		agent.RegisterPWTool("pw_extract_text", pwExtractText)
 		agent.RegisterPWTool("pw_screenshot", pwScreenshot)
 		agent.RegisterPWTool("pw_screenshot_and_view", pwScreenshotAndView)
 		agent.RegisterPWTool("pw_wait_for_selector", pwWaitForSelector)
 		agent.RegisterPWTool("pw_drag", pwDrag)
 		agent.RegisterPWTool("pw_get_html", pwGetHTML)
 		agent.RegisterPWTool("pw_get_dom", pwGetDOM)
 		agent.RegisterPWTool("pw_search_elements", pwSearchElements)
 		browserAgentTool := []models.Tool{
 			{
 				Type: "function",
 				Function: models.ToolFunc{
 					Name:        "browser_agent",
 					Description: "Autonomous browser automation agent. Use for complex multi-step browser tasks like 'go to website, login, and take screenshot'. The agent will plan and execute steps automatically using browser tools.",
 					Parameters: models.ToolFuncParams{
 						Type:     "object",
 						Required: []string{"task"},
 						Properties: map[string]models.ToolArgProps{
 							"task": {Type: "string", Description: "The task to accomplish, e.g., 'go to github.com and take a screenshot of the homepage'"},
 						},
 					},
 				},
 			},
 		}
 		baseTools = append(baseTools, browserAgentTool...)
 		fnMap["browser_agent"] = runBrowserAgent
 	}
 }