Fix (completion): handle multiple images in history
This commit is contained in:
1
bot.go
1
bot.go
@@ -1177,7 +1177,6 @@ func findCall(msg, toolCall string) bool {
|
|||||||
// Create tool response message with the proper tool_call_id
|
// Create tool response message with the proper tool_call_id
|
||||||
// Mark shell commands as always visible
|
// Mark shell commands as always visible
|
||||||
isShellCommand := fc.Name == "execute_command"
|
isShellCommand := fc.Name == "execute_command"
|
||||||
|
|
||||||
// Check if response is multimodal content (image)
|
// Check if response is multimodal content (image)
|
||||||
var toolResponseMsg models.RoleMsg
|
var toolResponseMsg models.RoleMsg
|
||||||
if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
|
if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
|
||||||
|
|||||||
60
llm.go
60
llm.go
@@ -3,7 +3,6 @@ package main
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
|
||||||
"gf-lt/models"
|
"gf-lt/models"
|
||||||
"io"
|
"io"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -119,25 +118,22 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
|
|||||||
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
|
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
|
||||||
localImageAttachmentPath := imageAttachmentPath
|
localImageAttachmentPath := imageAttachmentPath
|
||||||
var multimodalData []string
|
var multimodalData []string
|
||||||
|
if msg != "" { // otherwise let the bot continue
|
||||||
|
var newMsg models.RoleMsg
|
||||||
if localImageAttachmentPath != "" {
|
if localImageAttachmentPath != "" {
|
||||||
|
newMsg = models.NewMultimodalMsg(role, []any{})
|
||||||
|
newMsg.AddTextPart(msg)
|
||||||
imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
|
imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Error("failed to create image URL from path for completion",
|
logger.Error("failed to create image URL from path for completion",
|
||||||
"error", err, "path", localImageAttachmentPath)
|
"error", err, "path", localImageAttachmentPath)
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
|
newMsg.AddImagePart(imageURL, localImageAttachmentPath)
|
||||||
parts := strings.SplitN(imageURL, ",", 2)
|
|
||||||
if len(parts) == 2 {
|
|
||||||
multimodalData = append(multimodalData, parts[1])
|
|
||||||
} else {
|
|
||||||
logger.Error("invalid image data URL format", "url", imageURL)
|
|
||||||
return nil, errors.New("invalid image data URL format")
|
|
||||||
}
|
|
||||||
imageAttachmentPath = "" // Clear the attachment after use
|
imageAttachmentPath = "" // Clear the attachment after use
|
||||||
|
} else { // not a multimodal msg or image passed in tool call
|
||||||
|
newMsg = models.RoleMsg{Role: role, Content: msg}
|
||||||
}
|
}
|
||||||
if msg != "" { // otherwise let the bot continue
|
|
||||||
newMsg := models.RoleMsg{Role: role, Content: msg}
|
|
||||||
newMsg = *processMessageTag(&newMsg)
|
newMsg = *processMessageTag(&newMsg)
|
||||||
chatBody.Messages = append(chatBody.Messages, newMsg)
|
chatBody.Messages = append(chatBody.Messages, newMsg)
|
||||||
}
|
}
|
||||||
@@ -146,22 +142,40 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
|
|||||||
chatBody.Messages = append(chatBody.Messages, models.RoleMsg{Role: cfg.ToolRole, Content: toolSysMsg})
|
chatBody.Messages = append(chatBody.Messages, models.RoleMsg{Role: cfg.ToolRole, Content: toolSysMsg})
|
||||||
}
|
}
|
||||||
filteredMessages, botPersona := filterMessagesForCurrentCharacter(chatBody.Messages)
|
filteredMessages, botPersona := filterMessagesForCurrentCharacter(chatBody.Messages)
|
||||||
|
// Build prompt and extract images inline as we process each message
|
||||||
messages := make([]string, len(filteredMessages))
|
messages := make([]string, len(filteredMessages))
|
||||||
for i := range filteredMessages {
|
for i := range filteredMessages {
|
||||||
messages[i] = stripThinkingFromMsg(&filteredMessages[i]).ToPrompt()
|
m := stripThinkingFromMsg(&filteredMessages[i])
|
||||||
|
messages[i] = m.ToPrompt()
|
||||||
|
// Extract images from this message and add marker inline
|
||||||
|
if len(m.ContentParts) > 0 {
|
||||||
|
for _, part := range m.ContentParts {
|
||||||
|
var imgURL string
|
||||||
|
// Check for struct type
|
||||||
|
if imgPart, ok := part.(models.ImageContentPart); ok {
|
||||||
|
imgURL = imgPart.ImageURL.URL
|
||||||
|
} else if partMap, ok := part.(map[string]any); ok {
|
||||||
|
// Check for map type (from JSON unmarshaling)
|
||||||
|
if partType, exists := partMap["type"]; exists && partType == "image_url" {
|
||||||
|
if imgURLMap, ok := partMap["image_url"].(map[string]any); ok {
|
||||||
|
if url, ok := imgURLMap["url"].(string); ok {
|
||||||
|
imgURL = url
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if imgURL != "" {
|
||||||
|
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
|
||||||
|
parts := strings.SplitN(imgURL, ",", 2)
|
||||||
|
if len(parts) == 2 {
|
||||||
|
multimodalData = append(multimodalData, parts[1])
|
||||||
|
messages[i] += " <__media__>"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
prompt := strings.Join(messages, "\n")
|
prompt := strings.Join(messages, "\n")
|
||||||
// Add multimodal media markers to the prompt text when multimodal data is present
|
|
||||||
// This is required by llama.cpp multimodal models so they know where to insert media
|
|
||||||
if len(multimodalData) > 0 {
|
|
||||||
// Add a media marker for each item in the multimodal data
|
|
||||||
var sb strings.Builder
|
|
||||||
sb.WriteString(prompt)
|
|
||||||
for range multimodalData {
|
|
||||||
sb.WriteString(" <__media__>") // llama.cpp default multimodal marker
|
|
||||||
}
|
|
||||||
prompt = sb.String()
|
|
||||||
}
|
|
||||||
// needs to be after <__media__> if there are images
|
// needs to be after <__media__> if there are images
|
||||||
if !resume {
|
if !resume {
|
||||||
botMsgStart := "\n" + botPersona + ":\n"
|
botMsgStart := "\n" + botPersona + ":\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user