Feat: image prompt for llama.cpp /completion

This commit is contained in:
Grail Finder
2025-12-09 15:03:21 +03:00
parent 378dceb3f4
commit e1bac8d064
3 changed files with 80 additions and 28 deletions

34
llm.go
View File

@@ -6,6 +6,7 @@ import (
"gf-lt/models" "gf-lt/models"
"io" "io"
"strings" "strings"
"fmt"
) )
// imageAttachmentPath is package-level state holding the path of an image to
// attach to the next message. FormMsg reads it and resets it to "" once the
// image has been turned into request data.
var imageAttachmentPath string
@@ -82,6 +83,26 @@ func (lcp LCPCompletion) GetToken() string {
func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, error) { func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, error) {
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI) logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
localImageAttachmentPath := imageAttachmentPath
var multimodalData []string
if localImageAttachmentPath != "" {
imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
if err != nil {
logger.Error("failed to create image URL from path for completion", "error", err, "path", localImageAttachmentPath)
return nil, err
}
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
parts := strings.SplitN(imageURL, ",", 2)
if len(parts) == 2 {
multimodalData = append(multimodalData, parts[1])
} else {
logger.Error("invalid image data URL format", "url", imageURL)
return nil, fmt.Errorf("invalid image data URL format")
}
imageAttachmentPath = "" // Clear the attachment after use
}
if msg != "" { // otherwise let the bot to continue if msg != "" { // otherwise let the bot to continue
newMsg := models.RoleMsg{Role: role, Content: msg} newMsg := models.RoleMsg{Role: role, Content: msg}
chatBody.Messages = append(chatBody.Messages, newMsg) chatBody.Messages = append(chatBody.Messages, newMsg)
@@ -118,9 +139,18 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
if cfg.ThinkUse && !cfg.ToolUse { if cfg.ThinkUse && !cfg.ToolUse {
prompt += "<think>" prompt += "<think>"
} }
// Add multimodal media markers to the prompt text when multimodal data is present
// This is required by llama.cpp multimodal models so they know where to insert media
if len(multimodalData) > 0 {
// Add a media marker for each item in the multimodal data
for range multimodalData {
prompt += " <__media__>" // llama.cpp default multimodal marker
}
}
logger.Debug("checking prompt for /completion", "tool_use", cfg.ToolUse, logger.Debug("checking prompt for /completion", "tool_use", cfg.ToolUse,
"msg", msg, "resume", resume, "prompt", prompt) "msg", msg, "resume", resume, "prompt", prompt, "multimodal_data_count", len(multimodalData))
payload := models.NewLCPReq(prompt, defaultLCPProps, chatBody.MakeStopSlice()) payload := models.NewLCPReq(prompt, multimodalData, defaultLCPProps, chatBody.MakeStopSlice())
data, err := json.Marshal(payload) data, err := json.Marshal(payload)
if err != nil { if err != nil {
logger.Error("failed to form a msg", "error", err) logger.Error("failed to form a msg", "error", err)

View File

@@ -3,6 +3,7 @@ package main
import ( import (
"gf-lt/models" "gf-lt/models"
"fmt" "fmt"
"gf-lt/config"
"strings" "strings"
"testing" "testing"
) )
@@ -25,17 +26,17 @@ func TestRemoveThinking(t *testing.T) {
}, },
} }
for i, tc := range cases { for i, tc := range cases {
t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) { t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
mNum := len(tc.cb.Messages) cfg = &config.Config{ToolRole: "tool"} // Initialize cfg.ToolRole for test
removeThinking(tc.cb) mNum := len(tc.cb.Messages)
if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) { removeThinking(tc.cb)
t.Error("failed to delete tools msg", tc.cb.Messages, cfg.ToolRole) if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
} t.Errorf("failed to delete tools msg %v; expected %d, got %d", tc.cb.Messages, mNum-int(tc.toolMsgs), len(tc.cb.Messages))
for _, msg := range tc.cb.Messages { }
if strings.Contains(msg.Content, "<think>") { for _, msg := range tc.cb.Messages {
t.Errorf("msg contains think tag; msg: %s\n", msg.Content) if strings.Contains(msg.Content, "<think>") {
} t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
} }
}) }
} }) }
} }

View File

@@ -440,13 +440,14 @@ type LLMModels struct {
type LlamaCPPReq struct { type LlamaCPPReq struct {
Stream bool `json:"stream"` Stream bool `json:"stream"`
// Messages []RoleMsg `json:"messages"` // For multimodal requests, prompt should be an object with prompt_string and multimodal_data
Prompt string `json:"prompt"` // For regular requests, prompt is a string
Temperature float32 `json:"temperature"` Prompt interface{} `json:"prompt"` // Can be string or object with prompt_string and multimodal_data
DryMultiplier float32 `json:"dry_multiplier"` Temperature float32 `json:"temperature"`
Stop []string `json:"stop"` DryMultiplier float32 `json:"dry_multiplier"`
MinP float32 `json:"min_p"` Stop []string `json:"stop"`
NPredict int32 `json:"n_predict"` MinP float32 `json:"min_p"`
NPredict int32 `json:"n_predict"`
// MaxTokens int `json:"max_tokens"` // MaxTokens int `json:"max_tokens"`
// DryBase float64 `json:"dry_base"` // DryBase float64 `json:"dry_base"`
// DryAllowedLength int `json:"dry_allowed_length"` // DryAllowedLength int `json:"dry_allowed_length"`
@@ -466,17 +467,37 @@ type LlamaCPPReq struct {
// Samplers string `json:"samplers"` // Samplers string `json:"samplers"`
} }
// PromptObject is the object form of the "prompt" field of a LlamaCPPReq. It
// is used instead of a plain string when media accompanies the prompt text.
type PromptObject struct {
	// PromptString is the prompt text.
	PromptString string `json:"prompt_string"`
	// MultimodalData holds the media payloads; left out of the JSON when empty.
	MultimodalData []string `json:"multimodal_data,omitempty"`
	// ImageData is an alternative field name used by some llama.cpp
	// implementations, kept for compatibility; left out of the JSON when empty.
	ImageData []string `json:"image_data,omitempty"`
}
func NewLCPReq(prompt string, multimodalData []string, props map[string]float32, stopStrings []string) LlamaCPPReq {
var finalPrompt interface{}
if len(multimodalData) > 0 {
// When multimodal data is present, use the object format as per Python example:
// { "prompt": { "prompt_string": "...", "multimodal_data": [...] } }
finalPrompt = PromptObject{
PromptString: prompt,
MultimodalData: multimodalData,
ImageData: multimodalData, // Also populate for compatibility with different llama.cpp versions
}
} else {
// When no multimodal data, use plain string
finalPrompt = prompt
}
return LlamaCPPReq{ return LlamaCPPReq{
Stream: true, Stream: true,
Prompt: prompt, Prompt: finalPrompt,
// Temperature: 0.8,
// DryMultiplier: 0.5,
Temperature: props["temperature"], Temperature: props["temperature"],
DryMultiplier: props["dry_multiplier"], DryMultiplier: props["dry_multiplier"],
Stop: stopStrings,
MinP: props["min_p"], MinP: props["min_p"],
NPredict: int32(props["n_predict"]), NPredict: int32(props["n_predict"]),
Stop: stopStrings,
} }
} }