Feat: read img tool for chat endpoint

This commit is contained in:
Grail Finder
2026-03-02 07:12:28 +03:00
parent 742f1ca838
commit caac1d397a
4 changed files with 113 additions and 11 deletions

57
bot.go
View File

@@ -1174,17 +1174,60 @@ func findCall(msg, toolCall string) bool {
toolRunningMode = false toolRunningMode = false
toolMsg := string(resp) toolMsg := string(resp)
logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg) logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
// Create tool response message with the proper tool_call_id // Create tool response message with the proper tool_call_id
// Mark shell commands as always visible // Mark shell commands as always visible
isShellCommand := fc.Name == "execute_command" isShellCommand := fc.Name == "execute_command"
toolResponseMsg := models.RoleMsg{
Role: cfg.ToolRole, // Check if response is multimodal content (image)
Content: toolMsg, var toolResponseMsg models.RoleMsg
ToolCallID: lastToolCall.ID, if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
IsShellCommand: isShellCommand, // Parse multimodal content response
multimodalResp := models.MultimodalToolResp{}
if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
// Create RoleMsg with ContentParts
var contentParts []any
for _, part := range multimodalResp.Parts {
partType, ok := part["type"]
if !ok {
continue
}
if partType == "text" {
contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
} else if partType == "image_url" {
contentParts = append(contentParts, models.ImageContentPart{
Type: "image_url",
ImageURL: struct {
URL string `json:"url"`
}{URL: part["url"]},
})
}
}
toolResponseMsg = models.RoleMsg{
Role: cfg.ToolRole,
ContentParts: contentParts,
HasContentParts: true,
ToolCallID: lastToolCall.ID,
IsShellCommand: isShellCommand,
}
} else {
// Fallback to regular content
toolResponseMsg = models.RoleMsg{
Role: cfg.ToolRole,
Content: toolMsg,
ToolCallID: lastToolCall.ID,
IsShellCommand: isShellCommand,
}
}
} else {
toolResponseMsg = models.RoleMsg{
Role: cfg.ToolRole,
Content: toolMsg,
ToolCallID: lastToolCall.ID,
IsShellCommand: isShellCommand,
}
} }
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
chatBody.Messages = append(chatBody.Messages, toolResponseMsg) chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages)) logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
// Clear the stored tool call ID after using it // Clear the stored tool call ID after using it

View File

@@ -1,7 +1,8 @@
package models package models
const ( const (
LoadedMark = "(loaded) " LoadedMark = "(loaded) "
// ToolRespMultyType is the "type" discriminator of a multimodal tool response.
// It must equal the literal that fileReadImage writes and findCall checks
// ("multimodal_content"); the previous value was misspelled "multimodel_content".
ToolRespMultyType = "multimodal_content"
) )
type APIType int type APIType int

View File

@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
if err != nil { if err != nil {
return "", err return "", err
} }
// Determine the image format based on file extension // Determine the image format based on file extension
var mimeType string var mimeType string
switch { switch {
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
default: default:
mimeType = "image/jpeg" // default mimeType = "image/jpeg" // default
} }
// Encode to base64 // Encode to base64
encoded := base64.StdEncoding.EncodeToString(data) encoded := base64.StdEncoding.EncodeToString(data)
// Create data URL // Create data URL
return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
} }
@@ -623,3 +620,8 @@ type ChatRoundReq struct {
Regen bool Regen bool
Resume bool Resume bool
} }
// MultimodalToolResp is the JSON envelope a tool returns when its output is
// made of several content parts (for example text plus an image) rather than
// plain text.
//
// Field order matters: findCall recognises this envelope by checking that the
// tool output starts with `{"type":"multimodal_content"`, so Type must stay the
// first field for encoding/json to emit it first.
type MultimodalToolResp struct {
	// Type identifies the envelope; producers set it to "multimodal_content".
	Type string `json:"type"`
	// Parts lists the content parts in order. Each part carries a "type" key
	// ("text" or "image_url") plus its payload under "text" or "url".
	Parts []map[string]string `json:"parts"`
}

View File

@@ -469,6 +469,43 @@ func fileRead(args map[string]string) []byte {
return jsonResult return jsonResult
} }
// fileReadImage implements the file_read_image tool. It takes the image
// location from args["path"], passes it through resolvePath, and asks
// models.CreateImageURLFromPath for a base64 data URL of the file.
//
// On success it returns a JSON-encoded models.MultimodalToolResp with two
// parts in this order: a text part ("Image at <path>") and an image_url part
// carrying the data URL.
//
// Failures are never returned as Go errors. A missing or empty path, a read
// failure, or a marshal failure is logged, and the same message is returned
// as the plain-text tool output.
func fileReadImage(args map[string]string) []byte {
	// Indexing a missing key yields "", so one emptiness check covers both
	// "key absent" and "key present but empty".
	imgPath := args["path"]
	if imgPath == "" {
		errText := "path not provided to file_read_image tool"
		logger.Error(errText)
		return []byte(errText)
	}
	imgPath = resolvePath(imgPath)

	imgURL, readErr := models.CreateImageURLFromPath(imgPath)
	if readErr != nil {
		errText := "failed to read image; error: " + readErr.Error()
		logger.Error(errText)
		return []byte(errText)
	}

	// The text part names the resolved path; the image part carries the data.
	textPart := map[string]string{"type": "text", "text": "Image at " + imgPath}
	imagePart := map[string]string{"type": "image_url", "url": imgURL}

	payload, marshalErr := json.Marshal(models.MultimodalToolResp{
		Type:  "multimodal_content",
		Parts: []map[string]string{textPart, imagePart},
	})
	if marshalErr != nil {
		errText := "failed to marshal result; error: " + marshalErr.Error()
		logger.Error(errText)
		return []byte(errText)
	}
	return payload
}
func fileWrite(args map[string]string) []byte { func fileWrite(args map[string]string) []byte {
path, ok := args["path"] path, ok := args["path"]
if !ok || path == "" { if !ok || path == "" {
@@ -1101,6 +1138,7 @@ var fnMap = map[string]fnSig{
"read_url_raw": readURLRaw, "read_url_raw": readURLRaw,
"file_create": fileCreate, "file_create": fileCreate,
"file_read": fileRead, "file_read": fileRead,
"file_read_image": fileReadImage,
"file_write": fileWrite, "file_write": fileWrite,
"file_write_append": fileWriteAppend, "file_write_append": fileWriteAppend,
"file_edit": fileEdit, "file_edit": fileEdit,
@@ -1327,6 +1365,24 @@ var baseTools = []models.Tool{
}, },
}, },
}, },
// file_read_image
models.Tool{
Type: "function",
Function: models.ToolFunc{
Name: "file_read_image",
Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
Parameters: models.ToolFuncParams{
Type: "object",
Required: []string{"path"},
Properties: map[string]models.ToolArgProps{
"path": models.ToolArgProps{
Type: "string",
Description: "path of the image file to read",
},
},
},
},
},
// file_write // file_write
models.Tool{ models.Tool{
Type: "function", Type: "function",