Feat: read img tool for chat endpoint
This commit is contained in:
57
bot.go
57
bot.go
@@ -1174,17 +1174,60 @@ func findCall(msg, toolCall string) bool {
|
||||
toolRunningMode = false
|
||||
toolMsg := string(resp)
|
||||
logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
|
||||
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
|
||||
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
|
||||
// Create tool response message with the proper tool_call_id
|
||||
// Mark shell commands as always visible
|
||||
isShellCommand := fc.Name == "execute_command"
|
||||
toolResponseMsg := models.RoleMsg{
|
||||
Role: cfg.ToolRole,
|
||||
Content: toolMsg,
|
||||
ToolCallID: lastToolCall.ID,
|
||||
IsShellCommand: isShellCommand,
|
||||
|
||||
// Check if response is multimodal content (image)
|
||||
var toolResponseMsg models.RoleMsg
|
||||
if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
|
||||
// Parse multimodal content response
|
||||
multimodalResp := models.MultimodalToolResp{}
|
||||
if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
|
||||
// Create RoleMsg with ContentParts
|
||||
var contentParts []any
|
||||
for _, part := range multimodalResp.Parts {
|
||||
partType, ok := part["type"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if partType == "text" {
|
||||
contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
|
||||
} else if partType == "image_url" {
|
||||
contentParts = append(contentParts, models.ImageContentPart{
|
||||
Type: "image_url",
|
||||
ImageURL: struct {
|
||||
URL string `json:"url"`
|
||||
}{URL: part["url"]},
|
||||
})
|
||||
}
|
||||
}
|
||||
toolResponseMsg = models.RoleMsg{
|
||||
Role: cfg.ToolRole,
|
||||
ContentParts: contentParts,
|
||||
HasContentParts: true,
|
||||
ToolCallID: lastToolCall.ID,
|
||||
IsShellCommand: isShellCommand,
|
||||
}
|
||||
} else {
|
||||
// Fallback to regular content
|
||||
toolResponseMsg = models.RoleMsg{
|
||||
Role: cfg.ToolRole,
|
||||
Content: toolMsg,
|
||||
ToolCallID: lastToolCall.ID,
|
||||
IsShellCommand: isShellCommand,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
toolResponseMsg = models.RoleMsg{
|
||||
Role: cfg.ToolRole,
|
||||
Content: toolMsg,
|
||||
ToolCallID: lastToolCall.ID,
|
||||
IsShellCommand: isShellCommand,
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
|
||||
"\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
|
||||
chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
|
||||
logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
|
||||
// Clear the stored tool call ID after using it
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
package models
|
||||
|
||||
const (
|
||||
LoadedMark = "(loaded) "
|
||||
LoadedMark = "(loaded) "
|
||||
ToolRespMultyType = "multimodel_content"
|
||||
)
|
||||
|
||||
type APIType int
|
||||
|
||||
@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Determine the image format based on file extension
|
||||
var mimeType string
|
||||
switch {
|
||||
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
|
||||
default:
|
||||
mimeType = "image/jpeg" // default
|
||||
}
|
||||
|
||||
// Encode to base64
|
||||
encoded := base64.StdEncoding.EncodeToString(data)
|
||||
|
||||
// Create data URL
|
||||
return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
|
||||
}
|
||||
@@ -623,3 +620,8 @@ type ChatRoundReq struct {
|
||||
Regen bool
|
||||
Resume bool
|
||||
}
|
||||
|
||||
type MultimodalToolResp struct {
|
||||
Type string `json:"type"`
|
||||
Parts []map[string]string `json:"parts"`
|
||||
}
|
||||
|
||||
56
tools.go
56
tools.go
@@ -469,6 +469,43 @@ func fileRead(args map[string]string) []byte {
|
||||
return jsonResult
|
||||
}
|
||||
|
||||
func fileReadImage(args map[string]string) []byte {
|
||||
path, ok := args["path"]
|
||||
if !ok || path == "" {
|
||||
msg := "path not provided to file_read_image tool"
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
path = resolvePath(path)
|
||||
dataURL, err := models.CreateImageURLFromPath(path)
|
||||
if err != nil {
|
||||
msg := "failed to read image; error: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
// result := map[string]any{
|
||||
// "type": "multimodal_content",
|
||||
// "parts": []map[string]string{
|
||||
// {"type": "text", "text": "Image at " + path},
|
||||
// {"type": "image_url", "url": dataURL},
|
||||
// },
|
||||
// }
|
||||
result := models.MultimodalToolResp{
|
||||
Type: "multimodal_content",
|
||||
Parts: []map[string]string{
|
||||
{"type": "text", "text": "Image at " + path},
|
||||
{"type": "image_url", "url": dataURL},
|
||||
},
|
||||
}
|
||||
jsonResult, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
msg := "failed to marshal result; error: " + err.Error()
|
||||
logger.Error(msg)
|
||||
return []byte(msg)
|
||||
}
|
||||
return jsonResult
|
||||
}
|
||||
|
||||
func fileWrite(args map[string]string) []byte {
|
||||
path, ok := args["path"]
|
||||
if !ok || path == "" {
|
||||
@@ -1101,6 +1138,7 @@ var fnMap = map[string]fnSig{
|
||||
"read_url_raw": readURLRaw,
|
||||
"file_create": fileCreate,
|
||||
"file_read": fileRead,
|
||||
"file_read_image": fileReadImage,
|
||||
"file_write": fileWrite,
|
||||
"file_write_append": fileWriteAppend,
|
||||
"file_edit": fileEdit,
|
||||
@@ -1327,6 +1365,24 @@ var baseTools = []models.Tool{
|
||||
},
|
||||
},
|
||||
},
|
||||
// file_read_image
|
||||
models.Tool{
|
||||
Type: "function",
|
||||
Function: models.ToolFunc{
|
||||
Name: "file_read_image",
|
||||
Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
|
||||
Parameters: models.ToolFuncParams{
|
||||
Type: "object",
|
||||
Required: []string{"path"},
|
||||
Properties: map[string]models.ToolArgProps{
|
||||
"path": models.ToolArgProps{
|
||||
Type: "string",
|
||||
Description: "path of the image file to read",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
// file_write
|
||||
models.Tool{
|
||||
Type: "function",
|
||||
|
||||
Reference in New Issue
Block a user