Feat: read img tool for chat endpoint

2026-03-02 07:12:28 +03:00
parent 742f1ca838
commit caac1d397a
4 changed files with 113 additions and 11 deletions
--- a/bot.go
+++ b/bot.go
@@ -1174,17 +1174,60 @@ func findCall(msg, toolCall string) bool {
 	toolRunningMode = false
 	toolMsg := string(resp)
 	logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
 	fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
 		"\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
 	// Create tool response message with the proper tool_call_id
 	// Mark shell commands as always visible
 	isShellCommand := fc.Name == "execute_command"
-	toolResponseMsg := models.RoleMsg{
+
-		Role:           cfg.ToolRole,
+	// Check if response is multimodal content (image)
-		Content:        toolMsg,
+	var toolResponseMsg models.RoleMsg
-		ToolCallID:     lastToolCall.ID,
+	if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
-		IsShellCommand: isShellCommand,
+		// Parse multimodal content response
 		multimodalResp := models.MultimodalToolResp{}
 		if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
 			// Create RoleMsg with ContentParts
 			var contentParts []any
 			for _, part := range multimodalResp.Parts {
 				partType, ok := part["type"]
 				if !ok {
 					continue
 				}
 				if partType == "text" {
 					contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
 				} else if partType == "image_url" {
 					contentParts = append(contentParts, models.ImageContentPart{
 						Type: "image_url",
 						ImageURL: struct {
 							URL string `json:"url"`
 						}{URL: part["url"]},
 					})
 				}
 			}
 			toolResponseMsg = models.RoleMsg{
 				Role:            cfg.ToolRole,
 				ContentParts:    contentParts,
 				HasContentParts: true,
 				ToolCallID:      lastToolCall.ID,
 				IsShellCommand:  isShellCommand,
 			}
 		} else {
 			// Fallback to regular content
 			toolResponseMsg = models.RoleMsg{
 				Role:           cfg.ToolRole,
 				Content:        toolMsg,
 				ToolCallID:     lastToolCall.ID,
 				IsShellCommand: isShellCommand,
 			}
 		}
 	} else {
 		toolResponseMsg = models.RoleMsg{
 			Role:           cfg.ToolRole,
 			Content:        toolMsg,
 			ToolCallID:     lastToolCall.ID,
 			IsShellCommand: isShellCommand,
 		}
 	}
 	fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
 		"\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
 	chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
 	logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
 	// Clear the stored tool call ID after using it
--- a/models/consts.go
+++ b/models/consts.go
@@ -1,7 +1,8 @@
 package models
 const (
 	// LoadedMark is the literal "(loaded) " marker string.
 	// NOTE(review): its call sites are not visible in this change — confirm usage before altering.
 	LoadedMark        = "(loaded) "
 	// ToolRespMultyType is the "type" tag of a multimodal tool response.
 	// It must equal the tag emitted by the file_read_image tool and checked
 	// when a tool result is parsed, i.e. "multimodal_content"; the previous
 	// spelling "multimodel_content" could never match that tag.
 	ToolRespMultyType = "multimodal_content"
 )
 type APIType int
--- a/models/models.go
+++ b/models/models.go
@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
 	if err != nil {
 		return "", err
 	}
 	// Determine the image format based on file extension
 	var mimeType string
 	switch {
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
 	default:
 		mimeType = "image/jpeg" // default
 	}
 	// Encode to base64
 	encoded := base64.StdEncoding.EncodeToString(data)
 	// Create data URL
 	return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
 }
@@ -623,3 +620,8 @@ type ChatRoundReq struct {
 	Regen   bool
 	Resume  bool
 }
 // MultimodalToolResp is the JSON envelope a tool returns when its result is
 // made of several content parts (for example a caption plus an image)
 // instead of a single plain-text string.
 type MultimodalToolResp struct {
 	// Type tags the envelope; the producer and the parser in this change both
 	// use the value "multimodal_content".
 	Type  string              `json:"type"`
 	// Parts holds the content parts in order. Each part is a flat string map
 	// with a "type" key: "text" parts carry the text under "text", and
 	// "image_url" parts carry the image URL under "url".
 	Parts []map[string]string `json:"parts"`
 }
--- a/tools.go
+++ b/tools.go
@@ -469,6 +469,43 @@ func fileRead(args map[string]string) []byte {
 	return jsonResult
 }
 // fileReadImage implements the file_read_image tool.
 //
 // It reads args["path"], resolves it with resolvePath and converts the file
 // into a data URL via models.CreateImageURLFromPath. On success it returns a
 // JSON-encoded models.MultimodalToolResp with two parts: a text part naming
 // the resolved path and an image_url part carrying the data URL.
 //
 // On any failure (missing/empty path, unreadable image, marshal error) the
 // error is logged and the plain-text message is returned instead of JSON.
 func fileReadImage(args map[string]string) []byte {
 	// fail logs the message and hands it back as the tool output.
 	fail := func(msg string) []byte {
 		logger.Error(msg)
 		return []byte(msg)
 	}

 	// A missing key yields "", so one emptiness check covers both cases.
 	requested := args["path"]
 	if requested == "" {
 		return fail("path not provided to file_read_image tool")
 	}

 	resolved := resolvePath(requested)
 	imageURL, err := models.CreateImageURLFromPath(resolved)
 	if err != nil {
 		return fail("failed to read image; error: " + err.Error())
 	}

 	textPart := map[string]string{"type": "text", "text": "Image at " + resolved}
 	imagePart := map[string]string{"type": "image_url", "url": imageURL}
 	payload, err := json.Marshal(models.MultimodalToolResp{
 		Type:  "multimodal_content",
 		Parts: []map[string]string{textPart, imagePart},
 	})
 	if err != nil {
 		return fail("failed to marshal result; error: " + err.Error())
 	}
 	return payload
 }
 func fileWrite(args map[string]string) []byte {
 	path, ok := args["path"]
 	if !ok || path == "" {
@@ -1101,6 +1138,7 @@ var fnMap = map[string]fnSig{
 	"read_url_raw":      readURLRaw,
 	"file_create":       fileCreate,
 	"file_read":         fileRead,
 	"file_read_image":   fileReadImage,
 	"file_write":        fileWrite,
 	"file_write_append": fileWriteAppend,
 	"file_edit":         fileEdit,
@@ -1327,6 +1365,24 @@ var baseTools = []models.Tool{
 			},
 		},
 	},
 	// file_read_image
 	models.Tool{
 		Type: "function",
 		Function: models.ToolFunc{
 			Name:        "file_read_image",
 			Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
 			Parameters: models.ToolFuncParams{
 				Type:     "object",
 				Required: []string{"path"},
 				Properties: map[string]models.ToolArgProps{
 					"path": models.ToolArgProps{
 						Type:        "string",
 						Description: "path of the image file to read",
 					},
 				},
 			},
 		},
 	},
 	// file_write
 	models.Tool{
 		Type: "function",