Enha: compute estimate of non llm text

Feat: token use estimation
Enha: interrupt llm and tool both
2026-03-02 15:21:45 +03:00 · 2026-03-02 14:54:20 +03:00 · 2026-03-02 12:19:50 +03:00 · 2026-03-02 12:09:27 +03:00 · 2026-03-02 11:39:55 +03:00 · 2026-03-02 11:25:20 +03:00
9 changed files with 770 additions and 76 deletions
--- a/bot.go
+++ b/bot.go
@@ -64,6 +64,8 @@ var (
 		"meta-llama/llama-3.3-70b-instruct:free",
 	}
 	LocalModels     = []string{}
+	localModelsData *models.LCPModels
+	orModelsData    *models.ORModels
 )

 var thinkBlockRE = regexp.MustCompile(`(?s)<think>.*?</think>`)
@@ -355,6 +357,7 @@ func fetchORModels(free bool) ([]string, error) {
 	if err := json.NewDecoder(resp.Body).Decode(data); err != nil {
 		return nil, err
 	}
+	orModelsData = data
 	freeModels := data.ListModels(free)
 	return freeModels, nil
 }
@@ -416,6 +419,7 @@ func fetchLCPModelsWithStatus() (*models.LCPModels, error) {
 	if err := json.NewDecoder(resp.Body).Decode(data); err != nil {
 		return nil, err
 	}
+	localModelsData = data
 	return data, nil
 }

@@ -433,6 +437,33 @@ func isModelLoaded(modelID string) (bool, error) {
 	return false, nil
 }

+// ModelHasVision reports whether modelID, served through api, accepts image input.
+//
+// APIs whose URL contains "deepseek" are always reported as text-only. For
+// OpenRouter the public model catalogue is downloaded and queried; for every
+// other API the model list with status is fetched via fetchLCPModelsWithStatus.
+// Any lookup failure is logged and reported as false.
+func ModelHasVision(api, modelID string) bool {
+	switch {
+	case strings.Contains(api, "deepseek"):
+		return false
+	case strings.Contains(api, "openrouter"):
+		resp, err := http.Get("https://openrouter.ai/api/v1/models")
+		if err != nil {
+			logger.Warn("failed to fetch OR models for vision check", "error", err)
+			return false
+		}
+		defer resp.Body.Close()
+		// A non-200 reply is not a model catalogue; decoding it would quietly
+		// produce an empty list and hide the real cause.
+		if resp.StatusCode != http.StatusOK {
+			logger.Warn("failed to fetch OR models for vision check", "status", resp.StatusCode)
+			return false
+		}
+		orm := &models.ORModels{}
+		if err := json.NewDecoder(resp.Body).Decode(orm); err != nil {
+			logger.Warn("failed to decode OR models for vision check", "error", err)
+			return false
+		}
+		return orm.HasVision(modelID)
+	default:
+		// Named lcpModels so the imported models package is not shadowed.
+		lcpModels, err := fetchLCPModelsWithStatus()
+		if err != nil {
+			logger.Warn("failed to fetch LCP models for vision check", "error", err)
+			return false
+		}
+		return lcpModels.HasVision(modelID)
+	}
+}
+
 // monitorModelLoad starts a goroutine that periodically checks if the specified model is loaded.
 func monitorModelLoad(modelID string) {
 	go func() {
@@ -718,7 +749,7 @@ func sendMsgToLLM(body io.Reader) {
 		}
 	interrupt:
 		if interruptResp { // read bytes, so it would not get into beginning of the next req
-			interruptResp = false
+			// interruptResp = false
 			logger.Info("interrupted bot response", "chunk_counter", counter)
 			streamDone <- true
 			break
@@ -772,6 +803,7 @@ func showSpinner() {
 }

 func chatRound(r *models.ChatRoundReq) error {
+	interruptResp = false
 	botRespMode = true
 	go showSpinner()
 	updateStatusLine()
@@ -937,6 +969,9 @@ out:
 	}
 	// Strip think blocks before parsing for tool calls
 	respTextNoThink := thinkBlockRE.ReplaceAllString(respText.String(), "")
+	if interruptResp {
+		return nil
+	}
 	if findCall(respTextNoThink, toolResp.String()) {
 		return nil
 	}
@@ -1174,17 +1209,59 @@ func findCall(msg, toolCall string) bool {
 	toolRunningMode = false
 	toolMsg := string(resp)
 	logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
-	fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
-		"\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
 	// Create tool response message with the proper tool_call_id
 	// Mark shell commands as always visible
 	isShellCommand := fc.Name == "execute_command"
-	toolResponseMsg := models.RoleMsg{
+	// Check if response is multimodal content (image)
+	var toolResponseMsg models.RoleMsg
+	if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
+		// Parse multimodal content response
+		multimodalResp := models.MultimodalToolResp{}
+		if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
+			// Create RoleMsg with ContentParts
+			var contentParts []any
+			for _, part := range multimodalResp.Parts {
+				partType := part["type"]
+				switch partType {
+				case "text":
+					contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
+				case "image_url":
+					contentParts = append(contentParts, models.ImageContentPart{
+						Type: "image_url",
+						ImageURL: struct {
+							URL string `json:"url"`
+						}{URL: part["url"]},
+					})
+				default:
+					continue
+				}
+			}
+			toolResponseMsg = models.RoleMsg{
+				Role:            cfg.ToolRole,
+				ContentParts:    contentParts,
+				HasContentParts: true,
+				ToolCallID:      lastToolCall.ID,
+				IsShellCommand:  isShellCommand,
+			}
+		} else {
+			// Fallback to regular content
+			toolResponseMsg = models.RoleMsg{
 				Role:           cfg.ToolRole,
 				Content:        toolMsg,
 				ToolCallID:     lastToolCall.ID,
 				IsShellCommand: isShellCommand,
 			}
+		}
+	} else {
+		toolResponseMsg = models.RoleMsg{
+			Role:           cfg.ToolRole,
+			Content:        toolMsg,
+			ToolCallID:     lastToolCall.ID,
+			IsShellCommand: isShellCommand,
+		}
+	}
+	fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
+		"\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
 	chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
 	logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
 	// Clear the stored tool call ID after using it
@@ -1339,6 +1416,7 @@ func updateModelLists() {
 				chatBody.Model = m
 				cachedModelColor = "green"
 				updateStatusLine()
+				UpdateToolCapabilities()
 				app.Draw()
 				return
 			}
--- a/helpfuncs.go
+++ b/helpfuncs.go
@@ -11,6 +11,7 @@ import (
 	"path"
 	"path/filepath"
 	"slices"
+	"strconv"
 	"strings"
 	"time"
 	"unicode"
@@ -376,9 +377,90 @@ func makeStatusLine() string {
 		roleInject := fmt.Sprintf(" | [%s:-:b]role injection[-:-:-] (alt+7)", boolColors[injectRole])
 		statusLine += roleInject
 	}
+	// context tokens
+	contextTokens := getContextTokens()
+	maxCtx := getMaxContextTokens()
+	if maxCtx == 0 {
+		maxCtx = 16384
+	}
+	if contextTokens > 0 {
+		contextInfo := fmt.Sprintf(" | context-estim: [orange:-:b]%d/%d[-:-:-]", contextTokens, maxCtx)
+		statusLine += contextInfo
+	}
 	return statusLine + imageInfo + shellModeInfo
 }

+// getContextTokens estimates how many tokens the current chat history holds.
+//
+// A message with a positive Stats.Tokens contributes that count. Any other
+// message is estimated as one token per four bytes of its text. The result is
+// 0 when there is no chat body or no message slice.
+func getContextTokens() int {
+	if chatBody == nil || chatBody.Messages == nil {
+		return 0
+	}
+	history := chatBody.Messages
+	sum := 0
+	for idx := range history {
+		entry := &history[idx]
+		if entry.Stats != nil && entry.Stats.Tokens > 0 {
+			sum += entry.Stats.Tokens
+			continue
+		}
+		// len counts bytes; empty text simply adds nothing.
+		sum += len(entry.GetText()) / 4
+	}
+	return sum
+}
+
+const deepseekContext = 128000
+
+func getMaxContextTokens() int {
+	if chatBody == nil || chatBody.Model == "" {
+		return 0
+	}
+	modelName := chatBody.Model
+	switch {
+	case strings.Contains(cfg.CurrentAPI, "openrouter"):
+		if orModelsData != nil {
+			for i := range orModelsData.Data {
+				m := &orModelsData.Data[i]
+				if m.ID == modelName {
+					return m.ContextLength
+				}
+			}
+		}
+	case strings.Contains(cfg.CurrentAPI, "deepseek"):
+		return deepseekContext
+	default:
+		if localModelsData != nil {
+			for i := range localModelsData.Data {
+				m := &localModelsData.Data[i]
+				if m.ID == modelName {
+					for _, arg := range m.Status.Args {
+						if strings.HasPrefix(arg, "--ctx-size") {
+							if strings.Contains(arg, "=") {
+								val := strings.Split(arg, "=")[1]
+								if n, err := strconv.Atoi(val); err == nil {
+									return n
+								}
+							} else {
+								idx := -1
+								for j, a := range m.Status.Args {
+									if a == "--ctx-size" && j+1 < len(m.Status.Args) {
+										idx = j + 1
+										break
+									}
+								}
+								if idx != -1 {
+									if n, err := strconv.Atoi(m.Status.Args[idx]); err == nil {
+										return n
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	return 0
+}
+
 // set of roles within card definition and mention in chat history
 func listChatRoles() []string {
 	currentChat, ok := chatMap[activeChatName]
--- a/llm.go
+++ b/llm.go
@@ -3,7 +3,6 @@ package main
 import (
 	"bytes"
 	"encoding/json"
-	"errors"
 	"gf-lt/models"
 	"io"
 	"strings"
@@ -119,25 +118,22 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
 	logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
 	localImageAttachmentPath := imageAttachmentPath
 	var multimodalData []string
+	if msg != "" { // otherwise let the bot to continue
+		var newMsg models.RoleMsg
 		if localImageAttachmentPath != "" {
+			newMsg = models.NewMultimodalMsg(role, []any{})
+			newMsg.AddTextPart(msg)
 			imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
 			if err != nil {
 				logger.Error("failed to create image URL from path for completion",
 					"error", err, "path", localImageAttachmentPath)
 				return nil, err
 			}
-		// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
-		parts := strings.SplitN(imageURL, ",", 2)
-		if len(parts) == 2 {
-			multimodalData = append(multimodalData, parts[1])
-		} else {
-			logger.Error("invalid image data URL format", "url", imageURL)
-			return nil, errors.New("invalid image data URL format")
-		}
+			newMsg.AddImagePart(imageURL, localImageAttachmentPath)
 			imageAttachmentPath = "" // Clear the attachment after use
+		} else { // not a multimodal msg or image passed in tool call
+			newMsg = models.RoleMsg{Role: role, Content: msg}
 		}
-	if msg != "" { // otherwise let the bot to continue
-		newMsg := models.RoleMsg{Role: role, Content: msg}
 		newMsg = *processMessageTag(&newMsg)
 		chatBody.Messages = append(chatBody.Messages, newMsg)
 	}
@@ -146,22 +142,40 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
 		chatBody.Messages = append(chatBody.Messages, models.RoleMsg{Role: cfg.ToolRole, Content: toolSysMsg})
 	}
 	filteredMessages, botPersona := filterMessagesForCurrentCharacter(chatBody.Messages)
+	// Build prompt and extract images inline as we process each message
 	messages := make([]string, len(filteredMessages))
 	for i := range filteredMessages {
-		messages[i] = stripThinkingFromMsg(&filteredMessages[i]).ToPrompt()
+		m := stripThinkingFromMsg(&filteredMessages[i])
+		messages[i] = m.ToPrompt()
+		// Extract images from this message and add marker inline
+		if len(m.ContentParts) > 0 {
+			for _, part := range m.ContentParts {
+				var imgURL string
+				// Check for struct type
+				if imgPart, ok := part.(models.ImageContentPart); ok {
+					imgURL = imgPart.ImageURL.URL
+				} else if partMap, ok := part.(map[string]any); ok {
+					// Check for map type (from JSON unmarshaling)
+					if partType, exists := partMap["type"]; exists && partType == "image_url" {
+						if imgURLMap, ok := partMap["image_url"].(map[string]any); ok {
+							if url, ok := imgURLMap["url"].(string); ok {
+								imgURL = url
+							}
+						}
+					}
+				}
+				if imgURL != "" {
+					// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
+					parts := strings.SplitN(imgURL, ",", 2)
+					if len(parts) == 2 {
+						multimodalData = append(multimodalData, parts[1])
+						messages[i] += " <__media__>"
+					}
+				}
+			}
+		}
 	}
 	prompt := strings.Join(messages, "\n")
-	// Add multimodal media markers to the prompt text when multimodal data is present
-	// This is required by llama.cpp multimodal models so they know where to insert media
-	if len(multimodalData) > 0 {
-		// Add a media marker for each item in the multimodal data
-		var sb strings.Builder
-		sb.WriteString(prompt)
-		for range multimodalData {
-			sb.WriteString(" <__media__>") // llama.cpp default multimodal marker
-		}
-		prompt = sb.String()
-	}
 	// needs to be after <__media__> if there are images
 	if !resume {
 		botMsgStart := "\n" + botPersona + ":\n"
--- a/models/consts.go
+++ b/models/consts.go
@@ -2,6 +2,7 @@ package models

 const (
 	LoadedMark        = "(loaded) "
+	ToolRespMultyType = "multimodel_content"
 )

 type APIType int
--- a/models/models.go
+++ b/models/models.go
@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
 	if err != nil {
 		return "", err
 	}
-
 	// Determine the image format based on file extension
 	var mimeType string
 	switch {
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
 	default:
 		mimeType = "image/jpeg" // default
 	}
-
 	// Encode to base64
 	encoded := base64.StdEncoding.EncodeToString(data)
-
 	// Create data URL
 	return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
 }
@@ -611,6 +608,20 @@ func (lcp *LCPModels) ListModels() []string {
 	return resp
 }

+// HasVision reports whether the model with the given ID lists a --mmproj flag
+// among its status arguments. The final argument is never considered, so a
+// trailing --mmproj with no value after it does not count.
+func (lcp *LCPModels) HasVision(modelID string) bool {
+	for idx := range lcp.Data {
+		entry := &lcp.Data[idx]
+		if entry.ID != modelID {
+			continue
+		}
+		launchArgs := entry.Status.Args
+		for pos, arg := range launchArgs {
+			if pos == len(launchArgs)-1 {
+				break
+			}
+			if arg == "--mmproj" {
+				return true
+			}
+		}
+	}
+	return false
+}
+
 type ResponseStats struct {
 	Tokens       int
 	Duration     float64
@@ -623,3 +634,8 @@ type ChatRoundReq struct {
 	Regen   bool
 	Resume  bool
 }
+
+type MultimodalToolResp struct {
+	Type  string              `json:"type"`
+	Parts []map[string]string `json:"parts"`
+}
--- a/models/openrouter.go
+++ b/models/openrouter.go
@@ -172,3 +172,16 @@ func (orm *ORModels) ListModels(free bool) []string {
 	}
 	return resp
 }
+
+// HasVision reports whether the model with the given ID lists "image" among
+// its architecture input modalities. Unknown models yield false.
+func (orm *ORModels) HasVision(modelID string) bool {
+	for idx := range orm.Data {
+		entry := &orm.Data[idx]
+		if entry.ID != modelID {
+			continue
+		}
+		for _, modality := range entry.Architecture.InputModalities {
+			if modality == "image" {
+				return true
+			}
+		}
+	}
+	return false
+}
--- a/popups.go
+++ b/popups.go
@@ -143,6 +143,7 @@ func showAPILinkSelectionPopup() {
 	apiListWidget.SetSelectedFunc(func(index int, mainText string, secondaryText string, shortcut rune) {
 		// Update the API in config
 		cfg.CurrentAPI = mainText
+		UpdateToolCapabilities()
 		// Update model list based on new API
 		// Helper function to get model list for a given API (same as in props_table.go)
 		getModelListForAPI := func(api string) []string {
@@ -160,8 +161,9 @@ func showAPILinkSelectionPopup() {
 		newModelList := getModelListForAPI(cfg.CurrentAPI)
 		// Ensure chatBody.Model is in the new list; if not, set to first available model
 		if len(newModelList) > 0 && !slices.Contains(newModelList, chatBody.Model) {
-			chatBody.Model = newModelList[0]
+			chatBody.Model = strings.TrimPrefix(newModelList[0], models.LoadedMark)
 			cfg.CurrentModel = chatBody.Model
+			UpdateToolCapabilities()
 		}
 		pages.RemovePage("apiLinkSelectionPopup")
 		app.SetFocus(textArea)
@@ -404,6 +406,66 @@ func showShellFileCompletionPopup(filter string) {
 	app.SetFocus(widget)
 }

+func showTextAreaFileCompletionPopup(filter string) {
+	baseDir := cfg.FilePickerDir
+	if baseDir == "" {
+		baseDir = "."
+	}
+	complMatches := scanFiles(baseDir, filter)
+	if len(complMatches) == 0 {
+		return
+	}
+	if len(complMatches) == 1 {
+		currentText := textArea.GetText()
+		atIdx := strings.LastIndex(currentText, "@")
+		if atIdx >= 0 {
+			before := currentText[:atIdx]
+			textArea.SetText(before+complMatches[0], true)
+		}
+		return
+	}
+	widget := tview.NewList().ShowSecondaryText(false).
+		SetSelectedBackgroundColor(tcell.ColorGray)
+	widget.SetTitle("file completion").SetBorder(true)
+	for _, m := range complMatches {
+		widget.AddItem(m, "", 0, nil)
+	}
+	widget.SetSelectedFunc(func(index int, mainText string, secondaryText string, shortcut rune) {
+		currentText := textArea.GetText()
+		atIdx := strings.LastIndex(currentText, "@")
+		if atIdx >= 0 {
+			before := currentText[:atIdx]
+			textArea.SetText(before+mainText, true)
+		}
+		pages.RemovePage("textAreaFileCompletionPopup")
+		app.SetFocus(textArea)
+	})
+	widget.SetInputCapture(func(event *tcell.EventKey) *tcell.EventKey {
+		if event.Key() == tcell.KeyEscape {
+			pages.RemovePage("textAreaFileCompletionPopup")
+			app.SetFocus(textArea)
+			return nil
+		}
+		if event.Key() == tcell.KeyRune && event.Rune() == 'x' {
+			pages.RemovePage("textAreaFileCompletionPopup")
+			app.SetFocus(textArea)
+			return nil
+		}
+		return event
+	})
+	modal := func(p tview.Primitive, width, height int) tview.Primitive {
+		return tview.NewFlex().
+			AddItem(nil, 0, 1, false).
+			AddItem(tview.NewFlex().SetDirection(tview.FlexRow).
+				AddItem(nil, 0, 1, false).
+				AddItem(p, height, 1, true).
+				AddItem(nil, 0, 1, false), width, 1, true).
+			AddItem(nil, 0, 1, false)
+	}
+	pages.AddPage("textAreaFileCompletionPopup", modal(widget, 80, 20), true, true)
+	app.SetFocus(widget)
+}
+
 func updateWidgetColors(theme *tview.Theme) {
 	bgColor := theme.PrimitiveBackgroundColor
 	fgColor := theme.PrimaryTextColor
--- a/tools.go
+++ b/tools.go
@@ -85,6 +85,11 @@ Your current tools:
 "when_to_use": "when asked to read the content of a file"
 },
 {
+"name":"file_read_image",
+"args": ["path"],
+"when_to_use": "when asked to read or view an image file"
+},
+{
 "name":"file_write",
 "args": ["path", "content"],
 "when_to_use": "when needed to overwrite content to a file"
@@ -170,8 +175,36 @@ After that you are free to respond to the user.
 	webAgentsOnce      sync.Once
 )

+var windowToolSysMsg = `
+Additional window tools (available only if xdotool and maim are installed):
+[
+{
+"name":"list_windows",
+"args": [],
+"when_to_use": "when asked to list visible windows; returns map of window ID to window name"
+},
+{
+"name":"capture_window",
+"args": ["window"],
+"when_to_use": "when asked to take a screenshot of a specific window; saves to /tmp; window can be ID or name substring; returns file path"
+},
+{
+"name":"capture_window_and_view",
+"args": ["window"],
+"when_to_use": "when asked to take a screenshot of a specific window and show it; saves to /tmp and returns image for viewing; window can be ID or name substring"
+}
+]
+`
+
 var WebSearcher searcher.WebSurfer

+var (
+	windowToolsAvailable bool
+	xdotoolPath          string
+	maimPath             string
+	modelHasVision       bool
+)
+
 func init() {
 	sa, err := searcher.NewWebSurfer(searcher.SearcherTypeScraper, "")
 	if err != nil {
@@ -181,6 +214,47 @@ func init() {
 	if err := rag.Init(cfg, logger, store); err != nil {
 		logger.Warn("failed to init rag; rag_search tool will not be available", "error", err)
 	}
+	checkWindowTools()
+	registerWindowTools()
+}
+
+// checkWindowTools looks up xdotool and maim on PATH, stores their locations
+// in xdotoolPath and maimPath, and sets windowToolsAvailable to true only when
+// both were found. Each missing binary is logged as a warning.
+func checkWindowTools() {
+	// A failed lookup leaves the path empty, which is the only signal used below.
+	xdotoolPath, _ = exec.LookPath("xdotool")
+	maimPath, _ = exec.LookPath("maim")
+	haveXdotool, haveMaim := xdotoolPath != "", maimPath != ""
+	windowToolsAvailable = haveXdotool && haveMaim
+	if windowToolsAvailable {
+		logger.Info("window tools available: xdotool and maim found")
+		return
+	}
+	if !haveXdotool {
+		logger.Warn("xdotool not found, window listing tools will not be available")
+	}
+	if !haveMaim {
+		logger.Warn("maim not found, window capture tools will not be available")
+	}
+}
+
+// UpdateToolCapabilities re-evaluates whether the current model supports
+// vision, stores the answer in modelHasVision and re-registers the window
+// tools accordingly. It does nothing when tool use is disabled.
+//
+// When the API is unknown (nil cfg or empty CurrentAPI) the model is treated
+// as having no vision. When window tools are installed but the model lacks
+// vision, the user is notified that capture-and-view is unavailable.
+func UpdateToolCapabilities() {
+	// cfg must be checked before any field access; reading cfg.ToolUse first
+	// would panic on a nil cfg and make the nil check unreachable.
+	if cfg == nil {
+		logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
+		modelHasVision = false
+		registerWindowTools()
+		return
+	}
+	if !cfg.ToolUse {
+		return
+	}
+	if cfg.CurrentAPI == "" {
+		logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
+		modelHasVision = false
+		registerWindowTools()
+		return
+	}
+	modelHasVision = ModelHasVision(cfg.CurrentAPI, cfg.CurrentModel)
+	if modelHasVision {
+		logger.Info("model has vision support", "model", cfg.CurrentModel, "api", cfg.CurrentAPI)
+	} else {
+		logger.Info("model does not have vision support", "model", cfg.CurrentModel, "api", cfg.CurrentAPI)
+		// The previous-state variable was always false (it was read after the
+		// reset), so the effective rule has always been "notify whenever the
+		// model lacks vision"; that rule is kept, without the dead variable.
+		if windowToolsAvailable {
+			// Notification is best effort; a failure must not block tool registration.
+			_ = notifyUser("window tools", "Window capture-and-view unavailable: model lacks vision support")
+		}
+	}
+	registerWindowTools()
+}

 // getWebAgentClient returns a singleton AgentClient for web agents.
@@ -469,6 +543,43 @@ func fileRead(args map[string]string) []byte {
 	return jsonResult
 }

+// fileReadImage implements the file_read_image tool. It resolves args["path"],
+// encodes the image as a data URL and returns a JSON MultimodalToolResp with a
+// text part naming the file and an image_url part carrying the data. On any
+// failure the error text is logged and returned as the tool output instead.
+func fileReadImage(args map[string]string) []byte {
+	// fail logs msg and hands the same text back as the tool result.
+	fail := func(msg string) []byte {
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	imgPath := args["path"] // a missing key reads as "", same as an empty value
+	if imgPath == "" {
+		return fail("path not provided to file_read_image tool")
+	}
+	imgPath = resolvePath(imgPath)
+	dataURL, err := models.CreateImageURLFromPath(imgPath)
+	if err != nil {
+		return fail("failed to read image; error: " + err.Error())
+	}
+	payload, err := json.Marshal(models.MultimodalToolResp{
+		Type: "multimodal_content",
+		Parts: []map[string]string{
+			{"type": "text", "text": "Image at " + imgPath},
+			{"type": "image_url", "url": dataURL},
+		},
+	})
+	if err != nil {
+		return fail("failed to marshal result; error: " + err.Error())
+	}
+	return payload
+}
+
 func fileWrite(args map[string]string) []byte {
 	path, ok := args["path"]
 	if !ok || path == "" {
@@ -1088,6 +1199,142 @@ func summarizeChat(args map[string]string) []byte {
 	return []byte(chatText)
 }

+// windowIDToHex converts a decimal window ID string to "0x"-prefixed
+// lowercase hexadecimal. Input that is not a base-10 integer is returned
+// unchanged.
+func windowIDToHex(decimalID string) string {
+	numeric, parseErr := strconv.ParseInt(decimalID, 10, 64)
+	if parseErr == nil {
+		return fmt.Sprintf("0x%s", strconv.FormatInt(numeric, 16))
+	}
+	return decimalID
+}
+
+// listWindows implements the list_windows tool. It asks xdotool for every
+// window whose name matches "." and returns a JSON object mapping window ID
+// to window name. Windows whose name cannot be read are left out. args is
+// unused.
+func listWindows(args map[string]string) []byte {
+	if !windowToolsAvailable {
+		return []byte("window tools not available: xdotool or maim not found")
+	}
+	searchOut, err := exec.Command(xdotoolPath, "search", "--name", ".").Output()
+	if err != nil {
+		msg := "failed to list windows: " + err.Error()
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	titles := make(map[string]string)
+	// strings.Fields never yields empty or space-padded tokens.
+	for _, winID := range strings.Fields(string(searchOut)) {
+		rawTitle, nameErr := exec.Command(xdotoolPath, "getwindowname", winID).Output()
+		if nameErr != nil {
+			continue
+		}
+		titles[winID] = strings.TrimSpace(string(rawTitle))
+	}
+	encoded, err := json.Marshal(titles)
+	if err != nil {
+		msg := "failed to marshal window list: " + err.Error()
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	return encoded
+}
+
+// windowNameSanitizeRE matches every run of characters other than ASCII
+// letters. It is compiled once here instead of on every capture call.
+var windowNameSanitizeRE = regexp.MustCompile(`[^a-zA-Z]+`)
+
+// resolveWindowID turns the user-supplied window argument into a window ID.
+// A decimal number is taken as the ID itself; anything else is treated as a
+// name pattern and the first match reported by xdotool is used. found is
+// false when the search fails or matches nothing.
+func resolveWindowID(window string) (id string, found bool) {
+	if _, err := strconv.Atoi(window); err == nil {
+		return window, true
+	}
+	output, err := exec.Command(xdotoolPath, "search", "--name", window).Output()
+	if err != nil {
+		return "", false
+	}
+	ids := strings.Fields(string(output))
+	if len(ids) == 0 {
+		return "", false
+	}
+	return ids[0], true
+}
+
+// captureWindowToFile screenshots the window named by args["window"] into
+// /tmp/<letters-of-window-name>_<unix-seconds>.jpg using maim. On success it
+// returns the file path and a nil failure; otherwise failure holds the
+// message to hand back as the tool output.
+func captureWindowToFile(args map[string]string) (filename string, failure []byte) {
+	if !windowToolsAvailable {
+		return "", []byte("window tools not available: xdotool or maim not found")
+	}
+	window, ok := args["window"]
+	if !ok || window == "" {
+		return "", []byte("window parameter required (window ID or name)")
+	}
+	windowID, found := resolveWindowID(window)
+	if !found {
+		return "", []byte("window not found: " + window)
+	}
+	// Best effort: the name only decorates the file name, so a failed lookup
+	// falls through to the generic "window" label below.
+	nameOutput, _ := exec.Command(xdotoolPath, "getwindowname", windowID).Output()
+	windowName := windowNameSanitizeRE.ReplaceAllString(strings.TrimSpace(string(nameOutput)), "")
+	if windowName == "" {
+		windowName = "window"
+	}
+	filename = fmt.Sprintf("/tmp/%s_%d.jpg", windowName, time.Now().Unix())
+	if err := exec.Command(maimPath, "-i", windowIDToHex(windowID), filename).Run(); err != nil {
+		msg := "failed to capture window: " + err.Error()
+		logger.Error(msg)
+		return "", []byte(msg)
+	}
+	return filename, nil
+}
+
+// captureWindow implements the capture_window tool: it screenshots the
+// requested window and returns the path of the saved file, or an error
+// message as the tool output.
+func captureWindow(args map[string]string) []byte {
+	filename, failure := captureWindowToFile(args)
+	if failure != nil {
+		return failure
+	}
+	return []byte("screenshot saved: " + filename)
+}
+
+// captureWindowAndView implements the capture_window_and_view tool: it
+// screenshots the requested window and returns a JSON MultimodalToolResp with
+// a text part naming the saved file and an image_url part carrying the image
+// as a data URL. Failures are returned as plain error text.
+func captureWindowAndView(args map[string]string) []byte {
+	filename, failure := captureWindowToFile(args)
+	if failure != nil {
+		return failure
+	}
+	dataURL, err := models.CreateImageURLFromPath(filename)
+	if err != nil {
+		msg := "failed to create image URL: " + err.Error()
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	result := models.MultimodalToolResp{
+		Type: "multimodal_content",
+		Parts: []map[string]string{
+			{"type": "text", "text": "Screenshot saved: " + filename},
+			{"type": "image_url", "url": dataURL},
+		},
+	}
+	jsonResult, err := json.Marshal(result)
+	if err != nil {
+		msg := "failed to marshal result: " + err.Error()
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	return jsonResult
+}
+
 type fnSig func(map[string]string) []byte

 var fnMap = map[string]fnSig{
@@ -1101,6 +1348,7 @@ var fnMap = map[string]fnSig{
 	"read_url_raw":      readURLRaw,
 	"file_create":       fileCreate,
 	"file_read":         fileRead,
+	"file_read_image":   fileReadImage,
 	"file_write":        fileWrite,
 	"file_write_append": fileWriteAppend,
 	"file_edit":         fileEdit,
@@ -1116,6 +1364,66 @@ var fnMap = map[string]fnSig{
 	"summarize_chat":    summarizeChat,
 }

+// registerWindowTools installs the window tools (list_windows, capture_window
+// and, only when modelHasVision is set, capture_window_and_view) into fnMap,
+// baseTools and toolSysMsg. It does nothing when windowToolsAvailable is false.
+//
+// The function is safe to call repeatedly: window tools already present in
+// baseTools are replaced rather than appended again, windowToolSysMsg is added
+// to toolSysMsg at most once, and the capture_window_and_view handler is
+// removed again when the model has no vision.
+func registerWindowTools() {
+	if !windowToolsAvailable {
+		return
+	}
+	// windowParams builds a fresh parameter spec per tool so no map is shared.
+	windowParams := func() models.ToolFuncParams {
+		return models.ToolFuncParams{
+			Type:     "object",
+			Required: []string{"window"},
+			Properties: map[string]models.ToolArgProps{
+				"window": models.ToolArgProps{
+					Type:        "string",
+					Description: "window ID or window name (partial match)",
+				},
+			},
+		}
+	}
+	fnMap["list_windows"] = listWindows
+	fnMap["capture_window"] = captureWindow
+	windowTools := []models.Tool{
+		{
+			Type: "function",
+			Function: models.ToolFunc{
+				Name:        "list_windows",
+				Description: "List all visible windows with their IDs and names. Returns a map of window ID to window name.",
+				Parameters: models.ToolFuncParams{
+					Type:       "object",
+					Required:   []string{},
+					Properties: map[string]models.ToolArgProps{},
+				},
+			},
+		},
+		{
+			Type: "function",
+			Function: models.ToolFunc{
+				Name:        "capture_window",
+				Description: "Capture a screenshot of a specific window and save it to /tmp. Requires window parameter (window ID or name substring).",
+				Parameters:  windowParams(),
+			},
+		},
+	}
+	if modelHasVision {
+		fnMap["capture_window_and_view"] = captureWindowAndView
+		windowTools = append(windowTools, models.Tool{
+			Type: "function",
+			Function: models.ToolFunc{
+				Name:        "capture_window_and_view",
+				Description: "Capture a screenshot of a specific window, save it to /tmp, and return the image for viewing. Requires window parameter (window ID or name substring).",
+				Parameters:  windowParams(),
+			},
+		})
+	} else {
+		// A model without vision must not keep a handler from an earlier model.
+		delete(fnMap, "capture_window_and_view")
+	}
+	isWindowTool := func(name string) bool {
+		switch name {
+		case "list_windows", "capture_window", "capture_window_and_view":
+			return true
+		}
+		return false
+	}
+	// Rebuild baseTools without stale window tools; a new slice is used so
+	// the previous backing array is left untouched.
+	merged := make([]models.Tool, 0, len(baseTools)+len(windowTools))
+	for i := range baseTools {
+		if !isWindowTool(baseTools[i].Function.Name) {
+			merged = append(merged, baseTools[i])
+		}
+	}
+	baseTools = append(merged, windowTools...)
+	if !strings.Contains(toolSysMsg, windowToolSysMsg) {
+		toolSysMsg += windowToolSysMsg
+	}
+}
+
 // callToolWithAgent calls the tool and applies any registered agent.
 func callToolWithAgent(name string, args map[string]string) []byte {
 	registerWebAgents()
@@ -1327,6 +1635,24 @@ var baseTools = []models.Tool{
 			},
 		},
 	},
+	// file_read_image
+	models.Tool{
+		Type: "function",
+		Function: models.ToolFunc{
+			Name:        "file_read_image",
+			Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
+			Parameters: models.ToolFuncParams{
+				Type:     "object",
+				Required: []string{"path"},
+				Properties: map[string]models.ToolArgProps{
+					"path": models.ToolArgProps{
+						Type:        "string",
+						Description: "path of the image file to read",
+					},
+				},
+			},
+		},
+	},
 	// file_write
 	models.Tool{
 		Type: "function",
@@ -1580,3 +1906,56 @@ var baseTools = []models.Tool{
 		},
 	},
 }
+
+// init makes sure the window tools are advertised in baseTools when xdotool
+// and maim are available.
+//
+// Tools that are already in baseTools are skipped, so a registration done
+// earlier by registerWindowTools is not duplicated. capture_window_and_view is
+// advertised only when modelHasVision is set, matching registerWindowTools,
+// which installs its fnMap handler under the same condition.
+func init() {
+	if !windowToolsAvailable {
+		return
+	}
+	registered := make(map[string]struct{}, len(baseTools))
+	for i := range baseTools {
+		registered[baseTools[i].Function.Name] = struct{}{}
+	}
+	// addTool appends a function tool unless one with that name already exists.
+	addTool := func(name, description string, params models.ToolFuncParams) {
+		if _, dup := registered[name]; dup {
+			return
+		}
+		registered[name] = struct{}{}
+		baseTools = append(baseTools, models.Tool{
+			Type: "function",
+			Function: models.ToolFunc{
+				Name:        name,
+				Description: description,
+				Parameters:  params,
+			},
+		})
+	}
+	// windowParams builds a fresh parameter spec per tool so no map is shared.
+	windowParams := func() models.ToolFuncParams {
+		return models.ToolFuncParams{
+			Type:     "object",
+			Required: []string{"window"},
+			Properties: map[string]models.ToolArgProps{
+				"window": models.ToolArgProps{
+					Type:        "string",
+					Description: "window ID or window name (partial match)",
+				},
+			},
+		}
+	}
+	addTool("list_windows",
+		"List all visible windows with their IDs and names. Returns a map of window ID to window name.",
+		models.ToolFuncParams{
+			Type:       "object",
+			Required:   []string{},
+			Properties: map[string]models.ToolArgProps{},
+		})
+	addTool("capture_window",
+		"Capture a screenshot of a specific window and save it to /tmp. Requires window parameter (window ID or name substring).",
+		windowParams())
+	if modelHasVision {
+		addTool("capture_window_and_view",
+			"Capture a screenshot of a specific window, save it to /tmp, and return the image for viewing. Requires window parameter (window ID or name substring).",
+			windowParams())
+	}
+}
--- a/tui.go
+++ b/tui.go
@@ -35,6 +35,8 @@ var (
 	renameWindow       *tview.InputField
 	roleEditWindow     *tview.InputField
 	shellInput         *tview.InputField
+	confirmModal       *tview.Modal
+	confirmPageName    = "confirm"
 	fullscreenMode     bool
 	positionVisible    bool = true
 	scrollToEndEnabled bool = true
@@ -195,6 +197,39 @@ func init() {
 		}
 		return event
 	})
+	confirmModal = tview.NewModal().
+		SetText("You are trying to send an empty message.\nIt makes sense if the last message in the chat is from you.\nAre you sure?").
+		AddButtons([]string{"Yes", "No"}).
+		SetButtonBackgroundColor(tcell.ColorBlack).
+		SetButtonTextColor(tcell.ColorWhite).
+		SetDoneFunc(func(buttonIndex int, buttonLabel string) {
+			if buttonLabel == "Yes" {
+				persona := cfg.UserRole
+				if cfg.WriteNextMsgAs != "" {
+					persona = cfg.WriteNextMsgAs
+				}
+				chatRoundChan <- &models.ChatRoundReq{Role: persona, UserMsg: ""}
+			} // In both Yes and No, go back to the main page
+			pages.SwitchToPage("main") // or whatever your main page is named
+		})
+	confirmModal.SetInputCapture(func(event *tcell.EventKey) *tcell.EventKey {
+		if event.Key() == tcell.KeyRune {
+			switch event.Rune() {
+			case 'y', 'Y':
+				persona := cfg.UserRole
+				if cfg.WriteNextMsgAs != "" {
+					persona = cfg.WriteNextMsgAs
+				}
+				chatRoundChan <- &models.ChatRoundReq{Role: persona, UserMsg: ""}
+				pages.SwitchToPage("main")
+				return nil
+			case 'n', 'N', 'x', 'X':
+				pages.SwitchToPage("main")
+				return nil
+			}
+		}
+		return event
+	})
 	textArea = tview.NewTextArea().
 		SetPlaceholder("input is multiline; press <Enter> to start the next line;\npress <Esc> to send the message.")
 	textArea.SetBorder(true).SetTitle("input")
@@ -691,6 +726,7 @@ func init() {
 		if event.Key() == tcell.KeyF6 {
 			interruptResp = true
 			botRespMode = false
+			toolRunningMode = false
 			return nil
 		}
 		if event.Key() == tcell.KeyF7 {
@@ -997,7 +1033,6 @@ func init() {
 				return nil
 			}
 			msgText := textArea.GetText()
-			if msgText != "" {
 			nl := "\n\n" // keep empty lines between messages
 			prevText := textView.GetText(true)
 			persona := cfg.UserRole
@@ -1029,9 +1064,23 @@ func init() {
 					textView.ScrollToEnd()
 				}
 				colorText()
+			} else {
+				pages.AddPage(confirmPageName, confirmModal, true, true)
+				return nil
 			}
 			// go chatRound(msgText, persona, textView, false, false)
 			chatRoundChan <- &models.ChatRoundReq{Role: persona, UserMsg: msgText}
+			return nil
+		}
+		if event.Key() == tcell.KeyTab {
+			currentF := app.GetFocus()
+			if currentF == textArea {
+				currentText := textArea.GetText()
+				atIndex := strings.LastIndex(currentText, "@")
+				if atIndex >= 0 {
+					filter := currentText[atIndex+1:]
+					showTextAreaFileCompletionPopup(filter)
+				}
 			}
 			return nil
 		}
Author	SHA1	Message	Date
Grail Finder	4bddce3700	Enha: compute estimate of non llm text	2026-03-02 15:21:45 +03:00
Grail Finder	fcc71987bf	Feat: token use estimation	2026-03-02 14:54:20 +03:00
Grail Finder	8458edf5a8	Enha: interrupt llm and tool both	2026-03-02 12:19:50 +03:00
Grail Finder	07b06bb0d3	Enha: tabcompletion is back in textarea	2026-03-02 12:09:27 +03:00
Grail Finder	3389b1d83b	Fix: linter complaints	2026-03-02 11:39:55 +03:00
Grail Finder	4f6000a43a	Enha: check if model has vision before giving it vision tools	2026-03-02 11:25:20 +03:00
Grail Finder	9ba46b40cc	Feat: screencapture for completion	2026-03-02 11:12:04 +03:00
Grail Finder	5bb456272e	Feat: capture window (screenshot)	2026-03-02 10:33:41 +03:00
Grail Finder	8999f48fb9	Fix (completion): handle multiple images in history	2026-03-02 09:23:22 +03:00
Grail Finder	b2f280a7f1	Feat: read img for completion	2026-03-02 07:46:08 +03:00
Grail Finder	65cbd5d6a6	Fix (ctrl+v): trim loaded mark from the model	2026-03-02 07:19:21 +03:00
Grail Finder	caac1d397a	Feat: read img tool for chat endpoint	2026-03-02 07:12:28 +03:00
Grail Finder	742f1ca838	Enha: modal affirmation popup on sending empty msg	2026-03-01 16:21:18 +03:00
Grail Finder	e36bade353	Fix: escape with empty textarea not generating response	2026-03-01 13:33:25 +03:00