14 Commits

Author SHA1 Message Date
Grail Finder
4bddce3700 Enha: compute estimate of non llm text 2026-03-02 15:21:45 +03:00
Grail Finder
fcc71987bf Feat: token use estimation 2026-03-02 14:54:20 +03:00
Grail Finder
8458edf5a8 Enha: interrupt llm and tool both 2026-03-02 12:19:50 +03:00
Grail Finder
07b06bb0d3 Enha: tabcompletion is back in textarea 2026-03-02 12:09:27 +03:00
Grail Finder
3389b1d83b Fix: linter complaints 2026-03-02 11:39:55 +03:00
Grail Finder
4f6000a43a Enha: check if model has vision before giving it vision tools 2026-03-02 11:25:20 +03:00
Grail Finder
9ba46b40cc Feat: screencapture for completion 2026-03-02 11:12:04 +03:00
Grail Finder
5bb456272e Feat: capture window (screenshot) 2026-03-02 10:33:41 +03:00
Grail Finder
8999f48fb9 Fix (completion): handle multiple images in history 2026-03-02 09:23:22 +03:00
Grail Finder
b2f280a7f1 Feat: read img for completion 2026-03-02 07:46:08 +03:00
Grail Finder
65cbd5d6a6 Fix (ctrl+v): trim loaded mark from the model 2026-03-02 07:19:21 +03:00
Grail Finder
caac1d397a Feat: read img tool for chat endpoint 2026-03-02 07:12:28 +03:00
Grail Finder
742f1ca838 Enha: modal affirmation popup on sending empty msg 2026-03-01 16:21:18 +03:00
Grail Finder
e36bade353 Fix: escape with empty textarea not generating response 2026-03-01 13:33:25 +03:00
9 changed files with 770 additions and 76 deletions

86
bot.go
View File

@@ -64,6 +64,8 @@ var (
"meta-llama/llama-3.3-70b-instruct:free",
}
LocalModels = []string{}
localModelsData *models.LCPModels
orModelsData *models.ORModels
)
var thinkBlockRE = regexp.MustCompile(`(?s)<think>.*?</think>`)
@@ -355,6 +357,7 @@ func fetchORModels(free bool) ([]string, error) {
if err := json.NewDecoder(resp.Body).Decode(data); err != nil {
return nil, err
}
orModelsData = data
freeModels := data.ListModels(free)
return freeModels, nil
}
@@ -416,6 +419,7 @@ func fetchLCPModelsWithStatus() (*models.LCPModels, error) {
if err := json.NewDecoder(resp.Body).Decode(data); err != nil {
return nil, err
}
localModelsData = data
return data, nil
}
@@ -433,6 +437,33 @@ func isModelLoaded(modelID string) (bool, error) {
return false, nil
}
func ModelHasVision(api, modelID string) bool {
switch {
case strings.Contains(api, "deepseek"):
return false
case strings.Contains(api, "openrouter"):
resp, err := http.Get("https://openrouter.ai/api/v1/models")
if err != nil {
logger.Warn("failed to fetch OR models for vision check", "error", err)
return false
}
defer resp.Body.Close()
orm := &models.ORModels{}
if err := json.NewDecoder(resp.Body).Decode(orm); err != nil {
logger.Warn("failed to decode OR models for vision check", "error", err)
return false
}
return orm.HasVision(modelID)
default:
models, err := fetchLCPModelsWithStatus()
if err != nil {
logger.Warn("failed to fetch LCP models for vision check", "error", err)
return false
}
return models.HasVision(modelID)
}
}
// monitorModelLoad starts a goroutine that periodically checks if the specified model is loaded.
func monitorModelLoad(modelID string) {
go func() {
@@ -718,7 +749,7 @@ func sendMsgToLLM(body io.Reader) {
}
interrupt:
if interruptResp { // read bytes, so it would not get into beginning of the next req
interruptResp = false
// interruptResp = false
logger.Info("interrupted bot response", "chunk_counter", counter)
streamDone <- true
break
@@ -772,6 +803,7 @@ func showSpinner() {
}
func chatRound(r *models.ChatRoundReq) error {
interruptResp = false
botRespMode = true
go showSpinner()
updateStatusLine()
@@ -937,6 +969,9 @@ out:
}
// Strip think blocks before parsing for tool calls
respTextNoThink := thinkBlockRE.ReplaceAllString(respText.String(), "")
if interruptResp {
return nil
}
if findCall(respTextNoThink, toolResp.String()) {
return nil
}
@@ -1174,17 +1209,59 @@ func findCall(msg, toolCall string) bool {
toolRunningMode = false
toolMsg := string(resp)
logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
// Create tool response message with the proper tool_call_id
// Mark shell commands as always visible
isShellCommand := fc.Name == "execute_command"
toolResponseMsg := models.RoleMsg{
// Check if response is multimodal content (image)
var toolResponseMsg models.RoleMsg
if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
// Parse multimodal content response
multimodalResp := models.MultimodalToolResp{}
if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
// Create RoleMsg with ContentParts
var contentParts []any
for _, part := range multimodalResp.Parts {
partType := part["type"]
switch partType {
case "text":
contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
case "image_url":
contentParts = append(contentParts, models.ImageContentPart{
Type: "image_url",
ImageURL: struct {
URL string `json:"url"`
}{URL: part["url"]},
})
default:
continue
}
}
toolResponseMsg = models.RoleMsg{
Role: cfg.ToolRole,
ContentParts: contentParts,
HasContentParts: true,
ToolCallID: lastToolCall.ID,
IsShellCommand: isShellCommand,
}
} else {
// Fallback to regular content
toolResponseMsg = models.RoleMsg{
Role: cfg.ToolRole,
Content: toolMsg,
ToolCallID: lastToolCall.ID,
IsShellCommand: isShellCommand,
}
}
} else {
toolResponseMsg = models.RoleMsg{
Role: cfg.ToolRole,
Content: toolMsg,
ToolCallID: lastToolCall.ID,
IsShellCommand: isShellCommand,
}
}
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
// Clear the stored tool call ID after using it
@@ -1339,6 +1416,7 @@ func updateModelLists() {
chatBody.Model = m
cachedModelColor = "green"
updateStatusLine()
UpdateToolCapabilities()
app.Draw()
return
}

View File

@@ -11,6 +11,7 @@ import (
"path"
"path/filepath"
"slices"
"strconv"
"strings"
"time"
"unicode"
@@ -376,9 +377,90 @@ func makeStatusLine() string {
roleInject := fmt.Sprintf(" | [%s:-:b]role injection[-:-:-] (alt+7)", boolColors[injectRole])
statusLine += roleInject
}
// context tokens
contextTokens := getContextTokens()
maxCtx := getMaxContextTokens()
if maxCtx == 0 {
maxCtx = 16384
}
if contextTokens > 0 {
contextInfo := fmt.Sprintf(" | context-estim: [orange:-:b]%d/%d[-:-:-]", contextTokens, maxCtx)
statusLine += contextInfo
}
return statusLine + imageInfo + shellModeInfo
}
func getContextTokens() int {
if chatBody == nil || chatBody.Messages == nil {
return 0
}
total := 0
messages := chatBody.Messages
for i := range messages {
msg := &messages[i]
if msg.Stats != nil && msg.Stats.Tokens > 0 {
total += msg.Stats.Tokens
} else if msg.GetText() != "" {
total += len(msg.GetText()) / 4
}
}
return total
}
const deepseekContext = 128000
func getMaxContextTokens() int {
if chatBody == nil || chatBody.Model == "" {
return 0
}
modelName := chatBody.Model
switch {
case strings.Contains(cfg.CurrentAPI, "openrouter"):
if orModelsData != nil {
for i := range orModelsData.Data {
m := &orModelsData.Data[i]
if m.ID == modelName {
return m.ContextLength
}
}
}
case strings.Contains(cfg.CurrentAPI, "deepseek"):
return deepseekContext
default:
if localModelsData != nil {
for i := range localModelsData.Data {
m := &localModelsData.Data[i]
if m.ID == modelName {
for _, arg := range m.Status.Args {
if strings.HasPrefix(arg, "--ctx-size") {
if strings.Contains(arg, "=") {
val := strings.Split(arg, "=")[1]
if n, err := strconv.Atoi(val); err == nil {
return n
}
} else {
idx := -1
for j, a := range m.Status.Args {
if a == "--ctx-size" && j+1 < len(m.Status.Args) {
idx = j + 1
break
}
}
if idx != -1 {
if n, err := strconv.Atoi(m.Status.Args[idx]); err == nil {
return n
}
}
}
}
}
}
}
}
}
return 0
}
// set of roles within card definition and mention in chat history
func listChatRoles() []string {
currentChat, ok := chatMap[activeChatName]

60
llm.go
View File

@@ -3,7 +3,6 @@ package main
import (
"bytes"
"encoding/json"
"errors"
"gf-lt/models"
"io"
"strings"
@@ -119,25 +118,22 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
localImageAttachmentPath := imageAttachmentPath
var multimodalData []string
if msg != "" { // otherwise let the bot to continue
var newMsg models.RoleMsg
if localImageAttachmentPath != "" {
newMsg = models.NewMultimodalMsg(role, []any{})
newMsg.AddTextPart(msg)
imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
if err != nil {
logger.Error("failed to create image URL from path for completion",
"error", err, "path", localImageAttachmentPath)
return nil, err
}
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
parts := strings.SplitN(imageURL, ",", 2)
if len(parts) == 2 {
multimodalData = append(multimodalData, parts[1])
} else {
logger.Error("invalid image data URL format", "url", imageURL)
return nil, errors.New("invalid image data URL format")
}
newMsg.AddImagePart(imageURL, localImageAttachmentPath)
imageAttachmentPath = "" // Clear the attachment after use
} else { // not a multimodal msg or image passed in tool call
newMsg = models.RoleMsg{Role: role, Content: msg}
}
if msg != "" { // otherwise let the bot to continue
newMsg := models.RoleMsg{Role: role, Content: msg}
newMsg = *processMessageTag(&newMsg)
chatBody.Messages = append(chatBody.Messages, newMsg)
}
@@ -146,22 +142,40 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
chatBody.Messages = append(chatBody.Messages, models.RoleMsg{Role: cfg.ToolRole, Content: toolSysMsg})
}
filteredMessages, botPersona := filterMessagesForCurrentCharacter(chatBody.Messages)
// Build prompt and extract images inline as we process each message
messages := make([]string, len(filteredMessages))
for i := range filteredMessages {
messages[i] = stripThinkingFromMsg(&filteredMessages[i]).ToPrompt()
m := stripThinkingFromMsg(&filteredMessages[i])
messages[i] = m.ToPrompt()
// Extract images from this message and add marker inline
if len(m.ContentParts) > 0 {
for _, part := range m.ContentParts {
var imgURL string
// Check for struct type
if imgPart, ok := part.(models.ImageContentPart); ok {
imgURL = imgPart.ImageURL.URL
} else if partMap, ok := part.(map[string]any); ok {
// Check for map type (from JSON unmarshaling)
if partType, exists := partMap["type"]; exists && partType == "image_url" {
if imgURLMap, ok := partMap["image_url"].(map[string]any); ok {
if url, ok := imgURLMap["url"].(string); ok {
imgURL = url
}
}
}
}
if imgURL != "" {
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
parts := strings.SplitN(imgURL, ",", 2)
if len(parts) == 2 {
multimodalData = append(multimodalData, parts[1])
messages[i] += " <__media__>"
}
}
}
}
}
prompt := strings.Join(messages, "\n")
// Add multimodal media markers to the prompt text when multimodal data is present
// This is required by llama.cpp multimodal models so they know where to insert media
if len(multimodalData) > 0 {
// Add a media marker for each item in the multimodal data
var sb strings.Builder
sb.WriteString(prompt)
for range multimodalData {
sb.WriteString(" <__media__>") // llama.cpp default multimodal marker
}
prompt = sb.String()
}
// needs to be after <__media__> if there are images
if !resume {
botMsgStart := "\n" + botPersona + ":\n"

View File

@@ -2,6 +2,7 @@ package models
const (
LoadedMark = "(loaded) "
ToolRespMultyType = "multimodel_content"
)
type APIType int

View File

@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
if err != nil {
return "", err
}
// Determine the image format based on file extension
var mimeType string
switch {
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
default:
mimeType = "image/jpeg" // default
}
// Encode to base64
encoded := base64.StdEncoding.EncodeToString(data)
// Create data URL
return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
}
@@ -611,6 +608,20 @@ func (lcp *LCPModels) ListModels() []string {
return resp
}
func (lcp *LCPModels) HasVision(modelID string) bool {
for _, m := range lcp.Data {
if m.ID == modelID {
args := m.Status.Args
for i := 0; i < len(args)-1; i++ {
if args[i] == "--mmproj" {
return true
}
}
}
}
return false
}
type ResponseStats struct {
Tokens int
Duration float64
@@ -623,3 +634,8 @@ type ChatRoundReq struct {
Regen bool
Resume bool
}
type MultimodalToolResp struct {
Type string `json:"type"`
Parts []map[string]string `json:"parts"`
}

View File

@@ -172,3 +172,16 @@ func (orm *ORModels) ListModels(free bool) []string {
}
return resp
}
func (orm *ORModels) HasVision(modelID string) bool {
for i := range orm.Data {
if orm.Data[i].ID == modelID {
for _, mod := range orm.Data[i].Architecture.InputModalities {
if mod == "image" {
return true
}
}
}
}
return false
}

View File

@@ -143,6 +143,7 @@ func showAPILinkSelectionPopup() {
apiListWidget.SetSelectedFunc(func(index int, mainText string, secondaryText string, shortcut rune) {
// Update the API in config
cfg.CurrentAPI = mainText
UpdateToolCapabilities()
// Update model list based on new API
// Helper function to get model list for a given API (same as in props_table.go)
getModelListForAPI := func(api string) []string {
@@ -160,8 +161,9 @@ func showAPILinkSelectionPopup() {
newModelList := getModelListForAPI(cfg.CurrentAPI)
// Ensure chatBody.Model is in the new list; if not, set to first available model
if len(newModelList) > 0 && !slices.Contains(newModelList, chatBody.Model) {
chatBody.Model = newModelList[0]
chatBody.Model = strings.TrimPrefix(newModelList[0], models.LoadedMark)
cfg.CurrentModel = chatBody.Model
UpdateToolCapabilities()
}
pages.RemovePage("apiLinkSelectionPopup")
app.SetFocus(textArea)
@@ -404,6 +406,66 @@ func showShellFileCompletionPopup(filter string) {
app.SetFocus(widget)
}
func showTextAreaFileCompletionPopup(filter string) {
baseDir := cfg.FilePickerDir
if baseDir == "" {
baseDir = "."
}
complMatches := scanFiles(baseDir, filter)
if len(complMatches) == 0 {
return
}
if len(complMatches) == 1 {
currentText := textArea.GetText()
atIdx := strings.LastIndex(currentText, "@")
if atIdx >= 0 {
before := currentText[:atIdx]
textArea.SetText(before+complMatches[0], true)
}
return
}
widget := tview.NewList().ShowSecondaryText(false).
SetSelectedBackgroundColor(tcell.ColorGray)
widget.SetTitle("file completion").SetBorder(true)
for _, m := range complMatches {
widget.AddItem(m, "", 0, nil)
}
widget.SetSelectedFunc(func(index int, mainText string, secondaryText string, shortcut rune) {
currentText := textArea.GetText()
atIdx := strings.LastIndex(currentText, "@")
if atIdx >= 0 {
before := currentText[:atIdx]
textArea.SetText(before+mainText, true)
}
pages.RemovePage("textAreaFileCompletionPopup")
app.SetFocus(textArea)
})
widget.SetInputCapture(func(event *tcell.EventKey) *tcell.EventKey {
if event.Key() == tcell.KeyEscape {
pages.RemovePage("textAreaFileCompletionPopup")
app.SetFocus(textArea)
return nil
}
if event.Key() == tcell.KeyRune && event.Rune() == 'x' {
pages.RemovePage("textAreaFileCompletionPopup")
app.SetFocus(textArea)
return nil
}
return event
})
modal := func(p tview.Primitive, width, height int) tview.Primitive {
return tview.NewFlex().
AddItem(nil, 0, 1, false).
AddItem(tview.NewFlex().SetDirection(tview.FlexRow).
AddItem(nil, 0, 1, false).
AddItem(p, height, 1, true).
AddItem(nil, 0, 1, false), width, 1, true).
AddItem(nil, 0, 1, false)
}
pages.AddPage("textAreaFileCompletionPopup", modal(widget, 80, 20), true, true)
app.SetFocus(widget)
}
func updateWidgetColors(theme *tview.Theme) {
bgColor := theme.PrimitiveBackgroundColor
fgColor := theme.PrimaryTextColor

379
tools.go
View File

@@ -85,6 +85,11 @@ Your current tools:
"when_to_use": "when asked to read the content of a file"
},
{
"name":"file_read_image",
"args": ["path"],
"when_to_use": "when asked to read or view an image file"
},
{
"name":"file_write",
"args": ["path", "content"],
"when_to_use": "when needed to overwrite content to a file"
@@ -170,8 +175,36 @@ After that you are free to respond to the user.
webAgentsOnce sync.Once
)
var windowToolSysMsg = `
Additional window tools (available only if xdotool and maim are installed):
[
{
"name":"list_windows",
"args": [],
"when_to_use": "when asked to list visible windows; returns map of window ID to window name"
},
{
"name":"capture_window",
"args": ["window"],
"when_to_use": "when asked to take a screenshot of a specific window; saves to /tmp; window can be ID or name substring; returns file path"
},
{
"name":"capture_window_and_view",
"args": ["window"],
"when_to_use": "when asked to take a screenshot of a specific window and show it; saves to /tmp and returns image for viewing; window can be ID or name substring"
}
]
`
var WebSearcher searcher.WebSurfer
var (
windowToolsAvailable bool
xdotoolPath string
maimPath string
modelHasVision bool
)
func init() {
sa, err := searcher.NewWebSurfer(searcher.SearcherTypeScraper, "")
if err != nil {
@@ -181,6 +214,47 @@ func init() {
if err := rag.Init(cfg, logger, store); err != nil {
logger.Warn("failed to init rag; rag_search tool will not be available", "error", err)
}
checkWindowTools()
registerWindowTools()
}
func checkWindowTools() {
xdotoolPath, _ = exec.LookPath("xdotool")
maimPath, _ = exec.LookPath("maim")
windowToolsAvailable = xdotoolPath != "" && maimPath != ""
if windowToolsAvailable {
logger.Info("window tools available: xdotool and maim found")
} else {
if xdotoolPath == "" {
logger.Warn("xdotool not found, window listing tools will not be available")
}
if maimPath == "" {
logger.Warn("maim not found, window capture tools will not be available")
}
}
}
func UpdateToolCapabilities() {
if !cfg.ToolUse {
return
}
modelHasVision = false
if cfg == nil || cfg.CurrentAPI == "" {
logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
registerWindowTools()
return
}
prevHasVision := modelHasVision
modelHasVision = ModelHasVision(cfg.CurrentAPI, cfg.CurrentModel)
if modelHasVision {
logger.Info("model has vision support", "model", cfg.CurrentModel, "api", cfg.CurrentAPI)
} else {
logger.Info("model does not have vision support", "model", cfg.CurrentModel, "api", cfg.CurrentAPI)
if windowToolsAvailable && !prevHasVision && !modelHasVision {
_ = notifyUser("window tools", "Window capture-and-view unavailable: model lacks vision support")
}
}
registerWindowTools()
}
// getWebAgentClient returns a singleton AgentClient for web agents.
@@ -469,6 +543,43 @@ func fileRead(args map[string]string) []byte {
return jsonResult
}
func fileReadImage(args map[string]string) []byte {
path, ok := args["path"]
if !ok || path == "" {
msg := "path not provided to file_read_image tool"
logger.Error(msg)
return []byte(msg)
}
path = resolvePath(path)
dataURL, err := models.CreateImageURLFromPath(path)
if err != nil {
msg := "failed to read image; error: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
// result := map[string]any{
// "type": "multimodal_content",
// "parts": []map[string]string{
// {"type": "text", "text": "Image at " + path},
// {"type": "image_url", "url": dataURL},
// },
// }
result := models.MultimodalToolResp{
Type: "multimodal_content",
Parts: []map[string]string{
{"type": "text", "text": "Image at " + path},
{"type": "image_url", "url": dataURL},
},
}
jsonResult, err := json.Marshal(result)
if err != nil {
msg := "failed to marshal result; error: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
return jsonResult
}
func fileWrite(args map[string]string) []byte {
path, ok := args["path"]
if !ok || path == "" {
@@ -1088,6 +1199,142 @@ func summarizeChat(args map[string]string) []byte {
return []byte(chatText)
}
func windowIDToHex(decimalID string) string {
id, err := strconv.ParseInt(decimalID, 10, 64)
if err != nil {
return decimalID
}
return fmt.Sprintf("0x%x", id)
}
func listWindows(args map[string]string) []byte {
if !windowToolsAvailable {
return []byte("window tools not available: xdotool or maim not found")
}
cmd := exec.Command(xdotoolPath, "search", "--name", ".")
output, err := cmd.Output()
if err != nil {
msg := "failed to list windows: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
windowIDs := strings.Fields(string(output))
windows := make(map[string]string)
for _, id := range windowIDs {
id = strings.TrimSpace(id)
if id == "" {
continue
}
nameCmd := exec.Command(xdotoolPath, "getwindowname", id)
nameOutput, err := nameCmd.Output()
if err != nil {
continue
}
name := strings.TrimSpace(string(nameOutput))
windows[id] = name
}
data, err := json.Marshal(windows)
if err != nil {
msg := "failed to marshal window list: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
return data
}
func captureWindow(args map[string]string) []byte {
if !windowToolsAvailable {
return []byte("window tools not available: xdotool or maim not found")
}
window, ok := args["window"]
if !ok || window == "" {
return []byte("window parameter required (window ID or name)")
}
var windowID string
if _, err := strconv.Atoi(window); err == nil {
windowID = window
} else {
cmd := exec.Command(xdotoolPath, "search", "--name", window)
output, err := cmd.Output()
if err != nil || len(strings.Fields(string(output))) == 0 {
return []byte("window not found: " + window)
}
windowID = strings.Fields(string(output))[0]
}
nameCmd := exec.Command(xdotoolPath, "getwindowname", windowID)
nameOutput, _ := nameCmd.Output()
windowName := strings.TrimSpace(string(nameOutput))
windowName = regexp.MustCompile(`[^a-zA-Z]+`).ReplaceAllString(windowName, "")
if windowName == "" {
windowName = "window"
}
timestamp := time.Now().Unix()
filename := fmt.Sprintf("/tmp/%s_%d.jpg", windowName, timestamp)
cmd := exec.Command(maimPath, "-i", windowIDToHex(windowID), filename)
if err := cmd.Run(); err != nil {
msg := "failed to capture window: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
return []byte("screenshot saved: " + filename)
}
func captureWindowAndView(args map[string]string) []byte {
if !windowToolsAvailable {
return []byte("window tools not available: xdotool or maim not found")
}
window, ok := args["window"]
if !ok || window == "" {
return []byte("window parameter required (window ID or name)")
}
var windowID string
if _, err := strconv.Atoi(window); err == nil {
windowID = window
} else {
cmd := exec.Command(xdotoolPath, "search", "--name", window)
output, err := cmd.Output()
if err != nil || len(strings.Fields(string(output))) == 0 {
return []byte("window not found: " + window)
}
windowID = strings.Fields(string(output))[0]
}
nameCmd := exec.Command(xdotoolPath, "getwindowname", windowID)
nameOutput, _ := nameCmd.Output()
windowName := strings.TrimSpace(string(nameOutput))
windowName = regexp.MustCompile(`[^a-zA-Z]+`).ReplaceAllString(windowName, "")
if windowName == "" {
windowName = "window"
}
timestamp := time.Now().Unix()
filename := fmt.Sprintf("/tmp/%s_%d.jpg", windowName, timestamp)
captureCmd := exec.Command(maimPath, "-i", windowIDToHex(windowID), filename)
if err := captureCmd.Run(); err != nil {
msg := "failed to capture window: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
dataURL, err := models.CreateImageURLFromPath(filename)
if err != nil {
msg := "failed to create image URL: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
result := models.MultimodalToolResp{
Type: "multimodal_content",
Parts: []map[string]string{
{"type": "text", "text": "Screenshot saved: " + filename},
{"type": "image_url", "url": dataURL},
},
}
jsonResult, err := json.Marshal(result)
if err != nil {
msg := "failed to marshal result: " + err.Error()
logger.Error(msg)
return []byte(msg)
}
return jsonResult
}
type fnSig func(map[string]string) []byte
var fnMap = map[string]fnSig{
@@ -1101,6 +1348,7 @@ var fnMap = map[string]fnSig{
"read_url_raw": readURLRaw,
"file_create": fileCreate,
"file_read": fileRead,
"file_read_image": fileReadImage,
"file_write": fileWrite,
"file_write_append": fileWriteAppend,
"file_edit": fileEdit,
@@ -1116,6 +1364,66 @@ var fnMap = map[string]fnSig{
"summarize_chat": summarizeChat,
}
func registerWindowTools() {
if windowToolsAvailable {
fnMap["list_windows"] = listWindows
fnMap["capture_window"] = captureWindow
windowTools := []models.Tool{
{
Type: "function",
Function: models.ToolFunc{
Name: "list_windows",
Description: "List all visible windows with their IDs and names. Returns a map of window ID to window name.",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{},
Properties: map[string]models.ToolArgProps{},
},
},
},
{
Type: "function",
Function: models.ToolFunc{
Name: "capture_window",
Description: "Capture a screenshot of a specific window and save it to /tmp. Requires window parameter (window ID or name substring).",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{"window"},
Properties: map[string]models.ToolArgProps{
"window": models.ToolArgProps{
Type: "string",
Description: "window ID or window name (partial match)",
},
},
},
},
},
}
if modelHasVision {
fnMap["capture_window_and_view"] = captureWindowAndView
windowTools = append(windowTools, models.Tool{
Type: "function",
Function: models.ToolFunc{
Name: "capture_window_and_view",
Description: "Capture a screenshot of a specific window, save it to /tmp, and return the image for viewing. Requires window parameter (window ID or name substring).",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{"window"},
Properties: map[string]models.ToolArgProps{
"window": models.ToolArgProps{
Type: "string",
Description: "window ID or window name (partial match)",
},
},
},
},
})
}
baseTools = append(baseTools, windowTools...)
toolSysMsg += windowToolSysMsg
}
}
// callToolWithAgent calls the tool and applies any registered agent.
func callToolWithAgent(name string, args map[string]string) []byte {
registerWebAgents()
@@ -1327,6 +1635,24 @@ var baseTools = []models.Tool{
},
},
},
// file_read_image
models.Tool{
Type: "function",
Function: models.ToolFunc{
Name: "file_read_image",
Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{"path"},
Properties: map[string]models.ToolArgProps{
"path": models.ToolArgProps{
Type: "string",
Description: "path of the image file to read",
},
},
},
},
},
// file_write
models.Tool{
Type: "function",
@@ -1580,3 +1906,56 @@ var baseTools = []models.Tool{
},
},
}
func init() {
if windowToolsAvailable {
baseTools = append(baseTools,
models.Tool{
Type: "function",
Function: models.ToolFunc{
Name: "list_windows",
Description: "List all visible windows with their IDs and names. Returns a map of window ID to window name.",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{},
Properties: map[string]models.ToolArgProps{},
},
},
},
models.Tool{
Type: "function",
Function: models.ToolFunc{
Name: "capture_window",
Description: "Capture a screenshot of a specific window and save it to /tmp. Requires window parameter (window ID or name substring).",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{"window"},
Properties: map[string]models.ToolArgProps{
"window": models.ToolArgProps{
Type: "string",
Description: "window ID or window name (partial match)",
},
},
},
},
},
models.Tool{
Type: "function",
Function: models.ToolFunc{
Name: "capture_window_and_view",
Description: "Capture a screenshot of a specific window, save it to /tmp, and return the image for viewing. Requires window parameter (window ID or name substring).",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{"window"},
Properties: map[string]models.ToolArgProps{
"window": models.ToolArgProps{
Type: "string",
Description: "window ID or window name (partial match)",
},
},
},
},
},
)
}
}

51
tui.go
View File

@@ -35,6 +35,8 @@ var (
renameWindow *tview.InputField
roleEditWindow *tview.InputField
shellInput *tview.InputField
confirmModal *tview.Modal
confirmPageName = "confirm"
fullscreenMode bool
positionVisible bool = true
scrollToEndEnabled bool = true
@@ -195,6 +197,39 @@ func init() {
}
return event
})
confirmModal = tview.NewModal().
SetText("You are trying to send an empty message.\nIt makes sense if the last message in the chat is from you.\nAre you sure?").
AddButtons([]string{"Yes", "No"}).
SetButtonBackgroundColor(tcell.ColorBlack).
SetButtonTextColor(tcell.ColorWhite).
SetDoneFunc(func(buttonIndex int, buttonLabel string) {
if buttonLabel == "Yes" {
persona := cfg.UserRole
if cfg.WriteNextMsgAs != "" {
persona = cfg.WriteNextMsgAs
}
chatRoundChan <- &models.ChatRoundReq{Role: persona, UserMsg: ""}
} // In both Yes and No, go back to the main page
pages.SwitchToPage("main") // or whatever your main page is named
})
confirmModal.SetInputCapture(func(event *tcell.EventKey) *tcell.EventKey {
if event.Key() == tcell.KeyRune {
switch event.Rune() {
case 'y', 'Y':
persona := cfg.UserRole
if cfg.WriteNextMsgAs != "" {
persona = cfg.WriteNextMsgAs
}
chatRoundChan <- &models.ChatRoundReq{Role: persona, UserMsg: ""}
pages.SwitchToPage("main")
return nil
case 'n', 'N', 'x', 'X':
pages.SwitchToPage("main")
return nil
}
}
return event
})
textArea = tview.NewTextArea().
SetPlaceholder("input is multiline; press <Enter> to start the next line;\npress <Esc> to send the message.")
textArea.SetBorder(true).SetTitle("input")
@@ -691,6 +726,7 @@ func init() {
if event.Key() == tcell.KeyF6 {
interruptResp = true
botRespMode = false
toolRunningMode = false
return nil
}
if event.Key() == tcell.KeyF7 {
@@ -997,7 +1033,6 @@ func init() {
return nil
}
msgText := textArea.GetText()
if msgText != "" {
nl := "\n\n" // keep empty lines between messages
prevText := textView.GetText(true)
persona := cfg.UserRole
@@ -1029,9 +1064,23 @@ func init() {
textView.ScrollToEnd()
}
colorText()
} else {
pages.AddPage(confirmPageName, confirmModal, true, true)
return nil
}
// go chatRound(msgText, persona, textView, false, false)
chatRoundChan <- &models.ChatRoundReq{Role: persona, UserMsg: msgText}
return nil
}
if event.Key() == tcell.KeyTab {
currentF := app.GetFocus()
if currentF == textArea {
currentText := textArea.GetText()
atIndex := strings.LastIndex(currentText, "@")
if atIndex >= 0 {
filter := currentText[atIndex+1:]
showTextAreaFileCompletionPopup(filter)
}
}
return nil
}