Compare commits
8 Commits
742f1ca838
...
feat/img-t
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3389b1d83b | ||
|
|
4f6000a43a | ||
|
|
9ba46b40cc | ||
|
|
5bb456272e | ||
|
|
8999f48fb9 | ||
|
|
b2f280a7f1 | ||
|
|
65cbd5d6a6 | ||
|
|
caac1d397a |
76
bot.go
76
bot.go
@@ -433,6 +433,33 @@ func isModelLoaded(modelID string) (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func ModelHasVision(api, modelID string) bool {
|
||||
switch {
|
||||
case strings.Contains(api, "deepseek"):
|
||||
return false
|
||||
case strings.Contains(api, "openrouter"):
|
||||
resp, err := http.Get("https://openrouter.ai/api/v1/models")
|
||||
if err != nil {
|
||||
logger.Warn("failed to fetch OR models for vision check", "error", err)
|
||||
return false
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
orm := &models.ORModels{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(orm); err != nil {
|
||||
logger.Warn("failed to decode OR models for vision check", "error", err)
|
||||
return false
|
||||
}
|
||||
return orm.HasVision(modelID)
|
||||
default:
|
||||
models, err := fetchLCPModelsWithStatus()
|
||||
if err != nil {
|
||||
logger.Warn("failed to fetch LCP models for vision check", "error", err)
|
||||
return false
|
||||
}
|
||||
return models.HasVision(modelID)
|
||||
}
|
||||
}
|
||||
|
||||
// monitorModelLoad starts a goroutine that periodically checks if the specified model is loaded.
|
||||
func monitorModelLoad(modelID string) {
|
||||
go func() {
|
||||
@@ -1174,17 +1201,59 @@ func findCall(msg, toolCall string) bool {
|
||||
toolRunningMode = false
|
||||
toolMsg := string(resp)
|
||||
logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
|
||||
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
|
||||
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
|
||||
// Create tool response message with the proper tool_call_id
|
||||
// Mark shell commands as always visible
|
||||
isShellCommand := fc.Name == "execute_command"
|
||||
toolResponseMsg := models.RoleMsg{
|
||||
// Check if response is multimodal content (image)
|
||||
var toolResponseMsg models.RoleMsg
|
||||
if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
|
||||
// Parse multimodal content response
|
||||
multimodalResp := models.MultimodalToolResp{}
|
||||
if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
|
||||
// Create RoleMsg with ContentParts
|
||||
var contentParts []any
|
||||
for _, part := range multimodalResp.Parts {
|
||||
partType := part["type"]
|
||||
switch partType {
|
||||
case "text":
|
||||
contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
|
||||
case "image_url":
|
||||
contentParts = append(contentParts, models.ImageContentPart{
|
||||
Type: "image_url",
|
||||
ImageURL: struct {
|
||||
URL string `json:"url"`
|
||||
}{URL: part["url"]},
|
||||
})
|
||||
default:
|
||||
continue
|
||||
}
|
||||
}
|
||||
toolResponseMsg = models.RoleMsg{
|
||||
Role: cfg.ToolRole,
|
||||
ContentParts: contentParts,
|
||||
HasContentParts: true,
|
||||
ToolCallID: lastToolCall.ID,
|
||||
IsShellCommand: isShellCommand,
|
||||
}
|
||||
} else {
|
||||
// Fallback to regular content
|
||||
toolResponseMsg = models.RoleMsg{
|
||||
Role: cfg.ToolRole,
|
||||
Content: toolMsg,
|
||||
ToolCallID: lastToolCall.ID,
|
||||
IsShellCommand: isShellCommand,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
toolResponseMsg = models.RoleMsg{
|
||||
Role: cfg.ToolRole,
|
||||
Content: toolMsg,
|
||||
ToolCallID: lastToolCall.ID,
|
||||
IsShellCommand: isShellCommand,
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
|
||||
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
|
||||
chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
|
||||
logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
|
||||
// Clear the stored tool call ID after using it
|
||||
@@ -1339,6 +1408,7 @@ func updateModelLists() {
|
||||
chatBody.Model = m
|
||||
cachedModelColor = "green"
|
||||
updateStatusLine()
|
||||
UpdateToolCapabilities()
|
||||
app.Draw()
|
||||
return
|
||||
}
|
||||
|
||||
60
llm.go
60
llm.go
@@ -3,7 +3,6 @@ package main
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"gf-lt/models"
|
||||
"io"
|
||||
"strings"
|
||||
@@ -119,25 +118,22 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
|
||||
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
|
||||
localImageAttachmentPath := imageAttachmentPath
|
||||
var multimodalData []string
|
||||
if msg != "" { // otherwise let the bot to continue
|
||||
var newMsg models.RoleMsg
|
||||
if localImageAttachmentPath != "" {
|
||||
newMsg = models.NewMultimodalMsg(role, []any{})
|
||||
newMsg.AddTextPart(msg)
|
||||
imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
|
||||
if err != nil {
|
||||
logger.Error("failed to create image URL from path for completion",
|
||||
"error", err, "path", localImageAttachmentPath)
|
||||
return nil, err
|
||||
}
|
||||
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
|
||||
parts := strings.SplitN(imageURL, ",", 2)
|
||||
if len(parts) == 2 {
|
||||
multimodalData = append(multimodalData, parts[1])
|
||||
} else {
|
||||
logger.Error("invalid image data URL format", "url", imageURL)
|
||||
return nil, errors.New("invalid image data URL format")
|
||||
}
|
||||
newMsg.AddImagePart(imageURL, localImageAttachmentPath)
|
||||
imageAttachmentPath = "" // Clear the attachment after use
|
||||
} else { // not a multimodal msg or image passed in tool call
|
||||
newMsg = models.RoleMsg{Role: role, Content: msg}
|
||||
}
|
||||
if msg != "" { // otherwise let the bot to continue
|
||||
newMsg := models.RoleMsg{Role: role, Content: msg}
|
||||
newMsg = *processMessageTag(&newMsg)
|
||||
chatBody.Messages = append(chatBody.Messages, newMsg)
|
||||
}
|
||||
@@ -146,22 +142,40 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
|
||||
chatBody.Messages = append(chatBody.Messages, models.RoleMsg{Role: cfg.ToolRole, Content: toolSysMsg})
|
||||
}
|
||||
filteredMessages, botPersona := filterMessagesForCurrentCharacter(chatBody.Messages)
|
||||
// Build prompt and extract images inline as we process each message
|
||||
messages := make([]string, len(filteredMessages))
|
||||
for i := range filteredMessages {
|
||||
messages[i] = stripThinkingFromMsg(&filteredMessages[i]).ToPrompt()
|
||||
m := stripThinkingFromMsg(&filteredMessages[i])
|
||||
messages[i] = m.ToPrompt()
|
||||
// Extract images from this message and add marker inline
|
||||
if len(m.ContentParts) > 0 {
|
||||
for _, part := range m.ContentParts {
|
||||
var imgURL string
|
||||
// Check for struct type
|
||||
if imgPart, ok := part.(models.ImageContentPart); ok {
|
||||
imgURL = imgPart.ImageURL.URL
|
||||
} else if partMap, ok := part.(map[string]any); ok {
|
||||
// Check for map type (from JSON unmarshaling)
|
||||
if partType, exists := partMap["type"]; exists && partType == "image_url" {
|
||||
if imgURLMap, ok := partMap["image_url"].(map[string]any); ok {
|
||||
if url, ok := imgURLMap["url"].(string); ok {
|
||||
imgURL = url
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if imgURL != "" {
|
||||
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
|
||||
parts := strings.SplitN(imgURL, ",", 2)
|
||||
if len(parts) == 2 {
|
||||
multimodalData = append(multimodalData, parts[1])
|
||||
messages[i] += " <__media__>"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
prompt := strings.Join(messages, "\n")
|
||||
// Add multimodal media markers to the prompt text when multimodal data is present
|
||||
// This is required by llama.cpp multimodal models so they know where to insert media
|
||||
if len(multimodalData) > 0 {
|
||||
// Add a media marker for each item in the multimodal data
|
||||
var sb strings.Builder
|
||||
sb.WriteString(prompt)
|
||||
for range multimodalData {
|
||||
sb.WriteString(" <__media__>") // llama.cpp default multimodal marker
|
||||
}
|
||||
prompt = sb.String()
|
||||
}
|
||||
// needs to be after <__media__> if there are images
|
||||
if !resume {
|
||||
botMsgStart := "\n" + botPersona + ":\n"
|
||||
|
||||
@@ -2,6 +2,7 @@ package models
|
||||
|
||||
const (
|
||||
LoadedMark = "(loaded) "
|
||||
ToolRespMultyType = "multimodel_content"
|
||||
)
|
||||
|
||||
type APIType int
|
||||
|
||||
@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Determine the image format based on file extension
|
||||
var mimeType string
|
||||
switch {
|
||||
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
|
||||
default:
|
||||
mimeType = "image/jpeg" // default
|
||||
}
|
||||
|
||||
// Encode to base64
|
||||
encoded := base64.StdEncoding.EncodeToString(data)
|
||||
|
||||
// Create data URL
|
||||
return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
|
||||
}
|
||||
@@ -611,6 +608,20 @@ func (lcp *LCPModels) ListModels() []string {
|
||||
return resp
|
||||
}
|
||||
|
||||
func (lcp *LCPModels) HasVision(modelID string) bool {
|
||||
for _, m := range lcp.Data {
|
||||
if m.ID == modelID {
|
||||
args := m.Status.Args
|
||||
for i := 0; i < len(args)-1; i++ {
|
||||
if args[i] == "--mmproj" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type ResponseStats struct {
|
||||
Tokens int
|
||||
Duration float64
|
||||
@@ -623,3 +634,8 @@ type ChatRoundReq struct {
|
||||
Regen bool
|
||||
Resume bool
|
||||
}
|
||||
|
||||
type MultimodalToolResp struct {
|
||||
Type string `json:"type"`
|
||||
Parts []map[string]string `json:"parts"`
|
||||
}
|
||||
|
||||
@@ -172,3 +172,16 @@ func (orm *ORModels) ListModels(free bool) []string {
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
func (orm *ORModels) HasVision(modelID string) bool {
|
||||
for i := range orm.Data {
|
||||
if orm.Data[i].ID == modelID {
|
||||
for _, mod := range orm.Data[i].Architecture.InputModalities {
|
||||
if mod == "image" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -143,6 +143,7 @@ func showAPILinkSelectionPopup() {
|
||||
apiListWidget.SetSelectedFunc(func(index int, mainText string, secondaryText string, shortcut rune) {
|
||||
// Update the API in config
|
||||
cfg.CurrentAPI = mainText
|
||||
UpdateToolCapabilities()
|
||||
// Update model list based on new API
|
||||
// Helper function to get model list for a given API (same as in props_table.go)
|
||||
getModelListForAPI := func(api string) []string {
|
||||
@@ -160,8 +161,9 @@ func showAPILinkSelectionPopup() {
|
||||
newModelList := getModelListForAPI(cfg.CurrentAPI)
|
||||
// Ensure chatBody.Model is in the new list; if not, set to first available model
|
||||
if len(newModelList) > 0 && !slices.Contains(newModelList, chatBody.Model) {
|
||||
chatBody.Model = newModelList[0]
|
||||
chatBody.Model = strings.TrimPrefix(newModelList[0], models.LoadedMark)
|
||||
cfg.CurrentModel = chatBody.Model
|
||||
UpdateToolCapabilities()
|
||||
}
|
||||
pages.RemovePage("apiLinkSelectionPopup")
|
||||
app.SetFocus(textArea)
|
||||
|
||||
379
tools.go
379
tools.go
@@ -85,6 +85,11 @@ Your current tools:
|
||||
"when_to_use": "when asked to read the content of a file"
|
||||
},
|
||||
{
|
||||
"name":"file_read_image",
|
||||
"args": ["path"],
|
||||
"when_to_use": "when asked to read or view an image file"
|
||||
},
|
||||
{
|
||||
"name":"file_write",
|
||||
"args": ["path", "content"],
|
||||
"when_to_use": "when needed to overwrite content to a file"
|
||||
@@ -170,8 +175,36 @@ After that you are free to respond to the user.
|
||||
webAgentsOnce sync.Once
|
||||
)
|
||||
|
||||
var windowToolSysMsg = `
|
||||
Additional window tools (available only if xdotool and maim are installed):
|
||||
[
|
||||
{
|
||||
"name":"list_windows",
|
||||
"args": [],
|
||||
"when_to_use": "when asked to list visible windows; returns map of window ID to window name"
|
||||
},
|
||||
{
|
||||
"name":"capture_window",
|
||||
"args": ["window"],
|
||||
"when_to_use": "when asked to take a screenshot of a specific window; saves to /tmp; window can be ID or name substring; returns file path"
|
||||
},
|
||||
{
|
||||
"name":"capture_window_and_view",
|
||||
"args": ["window"],
|
||||
"when_to_use": "when asked to take a screenshot of a specific window and show it; saves to /tmp and returns image for viewing; window can be ID or name substring"
|
||||
}
|
||||
]
|
||||
`
|
||||
|
||||
var WebSearcher searcher.WebSurfer
|
||||
|
||||
var (
|
||||
windowToolsAvailable bool
|
||||
xdotoolPath string
|
||||
maimPath string
|
||||
modelHasVision bool
|
||||
)
|
||||
|
||||
func init() {
|
||||
sa, err := searcher.NewWebSurfer(searcher.SearcherTypeScraper, "")
|
||||
if err != nil {
|
||||
@@ -181,6 +214,47 @@ func init() {
|
||||
if err := rag.Init(cfg, logger, store); err != nil {
|
||||
logger.Warn("failed to init rag; rag_search tool will not be available", "error", err)
|
||||
}
|
||||
checkWindowTools()
|
||||
registerWindowTools()
|
||||
}
|
||||
|
||||
func checkWindowTools() {
|
||||
xdotoolPath, _ = exec.LookPath("xdotool")
|
||||
maimPath, _ = exec.LookPath("maim")
|
||||
windowToolsAvailable = xdotoolPath != "" && maimPath != ""
|
||||
if windowToolsAvailable {
|
||||
logger.Info("window tools available: xdotool and maim found")
|
||||
} else {
|
||||
if xdotoolPath == "" {
|
||||
logger.Warn("xdotool not found, window listing tools will not be available")
|
||||
}
|
||||
if maimPath == "" {
|
||||
logger.Warn("maim not found, window capture tools will not be available")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func UpdateToolCapabilities() {
|
||||
if !cfg.ToolUse {
|
||||
return
|
||||
}
|
||||
modelHasVision = false
|
||||
if cfg == nil || cfg.CurrentAPI == "" {
|
||||
logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
|
||||
registerWindowTools()
|
||||
return
|
||||
}
|
||||
prevHasVision := modelHasVision
|
||||
modelHasVision = ModelHasVision(cfg.CurrentAPI, cfg.CurrentModel)
|
||||
if modelHasVision {
|
||||
logger.Info("model has vision support", "model", cfg.CurrentModel, "api", cfg.CurrentAPI)
|
||||
} else {
|
||||
logger.Info("model does not have vision support", "model", cfg.CurrentModel, "api", cfg.CurrentAPI)
|
||||
if windowToolsAvailable && !prevHasVision && !modelHasVision {
|
||||
_ = notifyUser("window tools", "Window capture-and-view unavailable: model lacks vision support")
|
||||
}
|
||||
}
|
||||
registerWindowTools()
|
||||
}
|
||||
|
||||
// getWebAgentClient returns a singleton AgentClient for web agents.
|
||||
@@ -469,6 +543,43 @@ func fileRead(args map[string]string) []byte {
|
||||
return jsonResult
|
||||
}
|
||||
|
||||
func fileReadImage(args map[string]string) []byte {
|
||||
path, ok := args["path"]
|
||||
if !ok || path == "" {
|
||||
msg := "path not provided to file_read_image tool"
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
path = resolvePath(path)
|
||||
dataURL, err := models.CreateImageURLFromPath(path)
|
||||
if err != nil {
|
||||
msg := "failed to read image; error: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
// result := map[string]any{
|
||||
// "type": "multimodal_content",
|
||||
// "parts": []map[string]string{
|
||||
// {"type": "text", "text": "Image at " + path},
|
||||
// {"type": "image_url", "url": dataURL},
|
||||
// },
|
||||
// }
|
||||
result := models.MultimodalToolResp{
|
||||
Type: "multimodal_content",
|
||||
Parts: []map[string]string{
|
||||
{"type": "text", "text": "Image at " + path},
|
||||
{"type": "image_url", "url": dataURL},
|
||||
},
|
||||
}
|
||||
jsonResult, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
msg := "failed to marshal result; error: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
return jsonResult
|
||||
}
|
||||
|
||||
func fileWrite(args map[string]string) []byte {
|
||||
path, ok := args["path"]
|
||||
if !ok || path == "" {
|
||||
@@ -1088,6 +1199,142 @@ func summarizeChat(args map[string]string) []byte {
|
||||
return []byte(chatText)
|
||||
}
|
||||
|
||||
func windowIDToHex(decimalID string) string {
|
||||
id, err := strconv.ParseInt(decimalID, 10, 64)
|
||||
if err != nil {
|
||||
return decimalID
|
||||
}
|
||||
return fmt.Sprintf("0x%x", id)
|
||||
}
|
||||
|
||||
func listWindows(args map[string]string) []byte {
|
||||
if !windowToolsAvailable {
|
||||
return []byte("window tools not available: xdotool or maim not found")
|
||||
}
|
||||
cmd := exec.Command(xdotoolPath, "search", "--name", ".")
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
msg := "failed to list windows: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
windowIDs := strings.Fields(string(output))
|
||||
windows := make(map[string]string)
|
||||
for _, id := range windowIDs {
|
||||
id = strings.TrimSpace(id)
|
||||
if id == "" {
|
||||
continue
|
||||
}
|
||||
nameCmd := exec.Command(xdotoolPath, "getwindowname", id)
|
||||
nameOutput, err := nameCmd.Output()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(string(nameOutput))
|
||||
windows[id] = name
|
||||
}
|
||||
data, err := json.Marshal(windows)
|
||||
if err != nil {
|
||||
msg := "failed to marshal window list: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
return data
|
||||
}
|
||||
|
||||
func captureWindow(args map[string]string) []byte {
|
||||
if !windowToolsAvailable {
|
||||
return []byte("window tools not available: xdotool or maim not found")
|
||||
}
|
||||
window, ok := args["window"]
|
||||
if !ok || window == "" {
|
||||
return []byte("window parameter required (window ID or name)")
|
||||
}
|
||||
var windowID string
|
||||
if _, err := strconv.Atoi(window); err == nil {
|
||||
windowID = window
|
||||
} else {
|
||||
cmd := exec.Command(xdotoolPath, "search", "--name", window)
|
||||
output, err := cmd.Output()
|
||||
if err != nil || len(strings.Fields(string(output))) == 0 {
|
||||
return []byte("window not found: " + window)
|
||||
}
|
||||
windowID = strings.Fields(string(output))[0]
|
||||
}
|
||||
nameCmd := exec.Command(xdotoolPath, "getwindowname", windowID)
|
||||
nameOutput, _ := nameCmd.Output()
|
||||
windowName := strings.TrimSpace(string(nameOutput))
|
||||
windowName = regexp.MustCompile(`[^a-zA-Z]+`).ReplaceAllString(windowName, "")
|
||||
if windowName == "" {
|
||||
windowName = "window"
|
||||
}
|
||||
timestamp := time.Now().Unix()
|
||||
filename := fmt.Sprintf("/tmp/%s_%d.jpg", windowName, timestamp)
|
||||
cmd := exec.Command(maimPath, "-i", windowIDToHex(windowID), filename)
|
||||
if err := cmd.Run(); err != nil {
|
||||
msg := "failed to capture window: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
return []byte("screenshot saved: " + filename)
|
||||
}
|
||||
|
||||
func captureWindowAndView(args map[string]string) []byte {
|
||||
if !windowToolsAvailable {
|
||||
return []byte("window tools not available: xdotool or maim not found")
|
||||
}
|
||||
window, ok := args["window"]
|
||||
if !ok || window == "" {
|
||||
return []byte("window parameter required (window ID or name)")
|
||||
}
|
||||
var windowID string
|
||||
if _, err := strconv.Atoi(window); err == nil {
|
||||
windowID = window
|
||||
} else {
|
||||
cmd := exec.Command(xdotoolPath, "search", "--name", window)
|
||||
output, err := cmd.Output()
|
||||
if err != nil || len(strings.Fields(string(output))) == 0 {
|
||||
return []byte("window not found: " + window)
|
||||
}
|
||||
windowID = strings.Fields(string(output))[0]
|
||||
}
|
||||
nameCmd := exec.Command(xdotoolPath, "getwindowname", windowID)
|
||||
nameOutput, _ := nameCmd.Output()
|
||||
windowName := strings.TrimSpace(string(nameOutput))
|
||||
windowName = regexp.MustCompile(`[^a-zA-Z]+`).ReplaceAllString(windowName, "")
|
||||
if windowName == "" {
|
||||
windowName = "window"
|
||||
}
|
||||
timestamp := time.Now().Unix()
|
||||
filename := fmt.Sprintf("/tmp/%s_%d.jpg", windowName, timestamp)
|
||||
captureCmd := exec.Command(maimPath, "-i", windowIDToHex(windowID), filename)
|
||||
if err := captureCmd.Run(); err != nil {
|
||||
msg := "failed to capture window: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
dataURL, err := models.CreateImageURLFromPath(filename)
|
||||
if err != nil {
|
||||
msg := "failed to create image URL: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
result := models.MultimodalToolResp{
|
||||
Type: "multimodal_content",
|
||||
Parts: []map[string]string{
|
||||
{"type": "text", "text": "Screenshot saved: " + filename},
|
||||
{"type": "image_url", "url": dataURL},
|
||||
},
|
||||
}
|
||||
jsonResult, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
msg := "failed to marshal result: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
return jsonResult
|
||||
}
|
||||
|
||||
type fnSig func(map[string]string) []byte
|
||||
|
||||
var fnMap = map[string]fnSig{
|
||||
@@ -1101,6 +1348,7 @@ var fnMap = map[string]fnSig{
|
||||
"read_url_raw": readURLRaw,
|
||||
"file_create": fileCreate,
|
||||
"file_read": fileRead,
|
||||
"file_read_image": fileReadImage,
|
||||
"file_write": fileWrite,
|
||||
"file_write_append": fileWriteAppend,
|
||||
"file_edit": fileEdit,
|
||||
@@ -1116,6 +1364,66 @@ var fnMap = map[string]fnSig{
|
||||
"summarize_chat": summarizeChat,
|
||||
}
|
||||
|
||||
func registerWindowTools() {
|
||||
if windowToolsAvailable {
|
||||
fnMap["list_windows"] = listWindows
|
||||
fnMap["capture_window"] = captureWindow
|
||||
windowTools := []models.Tool{
|
||||
{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "list_windows",
|
||||
Description: "List all visible windows with their IDs and names. Returns a map of window ID to window name.",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{},
|
||||
Properties: map[string]models.ToolArgProps{},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "capture_window",
|
||||
Description: "Capture a screenshot of a specific window and save it to /tmp. Requires window parameter (window ID or name substring).",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{"window"},
|
||||
Properties: map[string]models.ToolArgProps{
|
||||
"window": models.ToolArgProps{
|
||||
Type: "string",
|
||||
Description: "window ID or window name (partial match)",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
if modelHasVision {
|
||||
fnMap["capture_window_and_view"] = captureWindowAndView
|
||||
windowTools = append(windowTools, models.Tool{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "capture_window_and_view",
|
||||
Description: "Capture a screenshot of a specific window, save it to /tmp, and return the image for viewing. Requires window parameter (window ID or name substring).",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{"window"},
|
||||
Properties: map[string]models.ToolArgProps{
|
||||
"window": models.ToolArgProps{
|
||||
Type: "string",
|
||||
Description: "window ID or window name (partial match)",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
baseTools = append(baseTools, windowTools...)
|
||||
toolSysMsg += windowToolSysMsg
|
||||
}
|
||||
}
|
||||
|
||||
// callToolWithAgent calls the tool and applies any registered agent.
|
||||
func callToolWithAgent(name string, args map[string]string) []byte {
|
||||
registerWebAgents()
|
||||
@@ -1327,6 +1635,24 @@ var baseTools = []models.Tool{
|
||||
},
|
||||
},
|
||||
},
|
||||
// file_read_image
|
||||
models.Tool{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "file_read_image",
|
||||
Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{"path"},
|
||||
Properties: map[string]models.ToolArgProps{
|
||||
"path": models.ToolArgProps{
|
||||
Type: "string",
|
||||
Description: "path of the image file to read",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
// file_write
|
||||
models.Tool{
|
||||
Type: "function",
|
||||
@@ -1580,3 +1906,56 @@ var baseTools = []models.Tool{
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
if windowToolsAvailable {
|
||||
baseTools = append(baseTools,
|
||||
models.Tool{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "list_windows",
|
||||
Description: "List all visible windows with their IDs and names. Returns a map of window ID to window name.",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{},
|
||||
Properties: map[string]models.ToolArgProps{},
|
||||
},
|
||||
},
|
||||
},
|
||||
models.Tool{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "capture_window",
|
||||
Description: "Capture a screenshot of a specific window and save it to /tmp. Requires window parameter (window ID or name substring).",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{"window"},
|
||||
Properties: map[string]models.ToolArgProps{
|
||||
"window": models.ToolArgProps{
|
||||
Type: "string",
|
||||
Description: "window ID or window name (partial match)",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
models.Tool{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "capture_window_and_view",
|
||||
Description: "Capture a screenshot of a specific window, save it to /tmp, and return the image for viewing. Requires window parameter (window ID or name substring).",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{"window"},
|
||||
Properties: map[string]models.ToolArgProps{
|
||||
"window": models.ToolArgProps{
|
||||
Type: "string",
|
||||
Description: "window ID or window name (partial match)",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
3
tui.go
3
tui.go
@@ -213,8 +213,7 @@ func init() {
|
||||
pages.SwitchToPage("main") // or whatever your main page is named
|
||||
})
|
||||
confirmModal.SetInputCapture(func(event *tcell.EventKey) *tcell.EventKey {
|
||||
switch event.Key() {
|
||||
case tcell.KeyRune:
|
||||
if event.Key() == tcell.KeyRune {
|
||||
switch event.Rune() {
|
||||
case 'y', 'Y':
|
||||
persona := cfg.UserRole
|
||||
|
||||
Reference in New Issue
Block a user