Fix (completion): handle multiple images in history

This commit is contained in:
Grail Finder
2026-03-02 09:23:22 +03:00
parent b2f280a7f1
commit 8999f48fb9
2 changed files with 45 additions and 32 deletions

1
bot.go
View File

@@ -1177,7 +1177,6 @@ func findCall(msg, toolCall string) bool {
// Create tool response message with the proper tool_call_id // Create tool response message with the proper tool_call_id
// Mark shell commands as always visible // Mark shell commands as always visible
isShellCommand := fc.Name == "execute_command" isShellCommand := fc.Name == "execute_command"
// Check if response is multimodal content (image) // Check if response is multimodal content (image)
var toolResponseMsg models.RoleMsg var toolResponseMsg models.RoleMsg
if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) { if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {

60
llm.go
View File

@@ -3,7 +3,6 @@ package main
import ( import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"errors"
"gf-lt/models" "gf-lt/models"
"io" "io"
"strings" "strings"
@@ -119,25 +118,22 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI) logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
localImageAttachmentPath := imageAttachmentPath localImageAttachmentPath := imageAttachmentPath
var multimodalData []string var multimodalData []string
if msg != "" { // otherwise let the bot to continue
var newMsg models.RoleMsg
if localImageAttachmentPath != "" { if localImageAttachmentPath != "" {
newMsg = models.NewMultimodalMsg(role, []any{})
newMsg.AddTextPart(msg)
imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath) imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
if err != nil { if err != nil {
logger.Error("failed to create image URL from path for completion", logger.Error("failed to create image URL from path for completion",
"error", err, "path", localImageAttachmentPath) "error", err, "path", localImageAttachmentPath)
return nil, err return nil, err
} }
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...") newMsg.AddImagePart(imageURL, localImageAttachmentPath)
parts := strings.SplitN(imageURL, ",", 2)
if len(parts) == 2 {
multimodalData = append(multimodalData, parts[1])
} else {
logger.Error("invalid image data URL format", "url", imageURL)
return nil, errors.New("invalid image data URL format")
}
imageAttachmentPath = "" // Clear the attachment after use imageAttachmentPath = "" // Clear the attachment after use
} else { // not a multimodal msg or image passed in tool call
newMsg = models.RoleMsg{Role: role, Content: msg}
} }
if msg != "" { // otherwise let the bot to continue
newMsg := models.RoleMsg{Role: role, Content: msg}
newMsg = *processMessageTag(&newMsg) newMsg = *processMessageTag(&newMsg)
chatBody.Messages = append(chatBody.Messages, newMsg) chatBody.Messages = append(chatBody.Messages, newMsg)
} }
@@ -146,22 +142,40 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
chatBody.Messages = append(chatBody.Messages, models.RoleMsg{Role: cfg.ToolRole, Content: toolSysMsg}) chatBody.Messages = append(chatBody.Messages, models.RoleMsg{Role: cfg.ToolRole, Content: toolSysMsg})
} }
filteredMessages, botPersona := filterMessagesForCurrentCharacter(chatBody.Messages) filteredMessages, botPersona := filterMessagesForCurrentCharacter(chatBody.Messages)
// Build prompt and extract images inline as we process each message
messages := make([]string, len(filteredMessages)) messages := make([]string, len(filteredMessages))
for i := range filteredMessages { for i := range filteredMessages {
messages[i] = stripThinkingFromMsg(&filteredMessages[i]).ToPrompt() m := stripThinkingFromMsg(&filteredMessages[i])
messages[i] = m.ToPrompt()
// Extract images from this message and add marker inline
if len(m.ContentParts) > 0 {
for _, part := range m.ContentParts {
var imgURL string
// Check for struct type
if imgPart, ok := part.(models.ImageContentPart); ok {
imgURL = imgPart.ImageURL.URL
} else if partMap, ok := part.(map[string]any); ok {
// Check for map type (from JSON unmarshaling)
if partType, exists := partMap["type"]; exists && partType == "image_url" {
if imgURLMap, ok := partMap["image_url"].(map[string]any); ok {
if url, ok := imgURLMap["url"].(string); ok {
imgURL = url
}
}
}
}
if imgURL != "" {
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
parts := strings.SplitN(imgURL, ",", 2)
if len(parts) == 2 {
multimodalData = append(multimodalData, parts[1])
messages[i] += " <__media__>"
}
}
}
}
} }
prompt := strings.Join(messages, "\n") prompt := strings.Join(messages, "\n")
// Add multimodal media markers to the prompt text when multimodal data is present
// This is required by llama.cpp multimodal models so they know where to insert media
if len(multimodalData) > 0 {
// Add a media marker for each item in the multimodal data
var sb strings.Builder
sb.WriteString(prompt)
for range multimodalData {
sb.WriteString(" <__media__>") // llama.cpp default multimodal marker
}
prompt = sb.String()
}
// needs to be after <__media__> if there are images // needs to be after <__media__> if there are images
if !resume { if !resume {
botMsgStart := "\n" + botPersona + ":\n" botMsgStart := "\n" + botPersona + ":\n"