Feat: image prompt for llama.cpp /completion
This commit is contained in:
34
llm.go
34
llm.go
@@ -6,6 +6,7 @@ import (
|
||||
"gf-lt/models"
|
||||
"io"
|
||||
"strings"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
var imageAttachmentPath string // Global variable to track image attachment for next message
|
||||
@@ -82,6 +83,26 @@ func (lcp LCPCompletion) GetToken() string {
|
||||
|
||||
func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, error) {
|
||||
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
|
||||
localImageAttachmentPath := imageAttachmentPath
|
||||
var multimodalData []string
|
||||
|
||||
if localImageAttachmentPath != "" {
|
||||
imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
|
||||
if err != nil {
|
||||
logger.Error("failed to create image URL from path for completion", "error", err, "path", localImageAttachmentPath)
|
||||
return nil, err
|
||||
}
|
||||
// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
|
||||
parts := strings.SplitN(imageURL, ",", 2)
|
||||
if len(parts) == 2 {
|
||||
multimodalData = append(multimodalData, parts[1])
|
||||
} else {
|
||||
logger.Error("invalid image data URL format", "url", imageURL)
|
||||
return nil, fmt.Errorf("invalid image data URL format")
|
||||
}
|
||||
imageAttachmentPath = "" // Clear the attachment after use
|
||||
}
|
||||
|
||||
if msg != "" { // otherwise let the bot to continue
|
||||
newMsg := models.RoleMsg{Role: role, Content: msg}
|
||||
chatBody.Messages = append(chatBody.Messages, newMsg)
|
||||
@@ -118,9 +139,18 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
|
||||
if cfg.ThinkUse && !cfg.ToolUse {
|
||||
prompt += "<think>"
|
||||
}
|
||||
// Add multimodal media markers to the prompt text when multimodal data is present
|
||||
// This is required by llama.cpp multimodal models so they know where to insert media
|
||||
if len(multimodalData) > 0 {
|
||||
// Add a media marker for each item in the multimodal data
|
||||
for range multimodalData {
|
||||
prompt += " <__media__>" // llama.cpp default multimodal marker
|
||||
}
|
||||
}
|
||||
|
||||
logger.Debug("checking prompt for /completion", "tool_use", cfg.ToolUse,
|
||||
"msg", msg, "resume", resume, "prompt", prompt)
|
||||
payload := models.NewLCPReq(prompt, defaultLCPProps, chatBody.MakeStopSlice())
|
||||
"msg", msg, "resume", resume, "prompt", prompt, "multimodal_data_count", len(multimodalData))
|
||||
payload := models.NewLCPReq(prompt, multimodalData, defaultLCPProps, chatBody.MakeStopSlice())
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
logger.Error("failed to form a msg", "error", err)
|
||||
|
||||
@@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"gf-lt/models"
|
||||
"fmt"
|
||||
"gf-lt/config"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
@@ -26,16 +27,16 @@ func TestRemoveThinking(t *testing.T) {
|
||||
}
|
||||
for i, tc := range cases {
|
||||
t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
|
||||
cfg = &config.Config{ToolRole: "tool"} // Initialize cfg.ToolRole for test
|
||||
mNum := len(tc.cb.Messages)
|
||||
removeThinking(tc.cb)
|
||||
if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
|
||||
t.Error("failed to delete tools msg", tc.cb.Messages, cfg.ToolRole)
|
||||
t.Errorf("failed to delete tools msg %v; expected %d, got %d", tc.cb.Messages, mNum-int(tc.toolMsgs), len(tc.cb.Messages))
|
||||
}
|
||||
for _, msg := range tc.cb.Messages {
|
||||
if strings.Contains(msg.Content, "<think>") {
|
||||
t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}) }
|
||||
}
|
||||
|
||||
@@ -440,8 +440,9 @@ type LLMModels struct {
|
||||
|
||||
type LlamaCPPReq struct {
|
||||
Stream bool `json:"stream"`
|
||||
// Messages []RoleMsg `json:"messages"`
|
||||
Prompt string `json:"prompt"`
|
||||
// For multimodal requests, prompt should be an object with prompt_string and multimodal_data
|
||||
// For regular requests, prompt is a string
|
||||
Prompt interface{} `json:"prompt"` // Can be string or object with prompt_string and multimodal_data
|
||||
Temperature float32 `json:"temperature"`
|
||||
DryMultiplier float32 `json:"dry_multiplier"`
|
||||
Stop []string `json:"stop"`
|
||||
@@ -466,17 +467,37 @@ type LlamaCPPReq struct {
|
||||
// Samplers string `json:"samplers"`
|
||||
}
|
||||
|
||||
func NewLCPReq(prompt string, props map[string]float32, stopStrings []string) LlamaCPPReq {
|
||||
type PromptObject struct {
|
||||
PromptString string `json:"prompt_string"`
|
||||
MultimodalData []string `json:"multimodal_data,omitempty"`
|
||||
// Alternative field name used by some llama.cpp implementations
|
||||
ImageData []string `json:"image_data,omitempty"` // For compatibility
|
||||
}
|
||||
|
||||
func NewLCPReq(prompt string, multimodalData []string, props map[string]float32, stopStrings []string) LlamaCPPReq {
|
||||
var finalPrompt interface{}
|
||||
|
||||
if len(multimodalData) > 0 {
|
||||
// When multimodal data is present, use the object format as per Python example:
|
||||
// { "prompt": { "prompt_string": "...", "multimodal_data": [...] } }
|
||||
finalPrompt = PromptObject{
|
||||
PromptString: prompt,
|
||||
MultimodalData: multimodalData,
|
||||
ImageData: multimodalData, // Also populate for compatibility with different llama.cpp versions
|
||||
}
|
||||
} else {
|
||||
// When no multimodal data, use plain string
|
||||
finalPrompt = prompt
|
||||
}
|
||||
|
||||
return LlamaCPPReq{
|
||||
Stream: true,
|
||||
Prompt: prompt,
|
||||
// Temperature: 0.8,
|
||||
// DryMultiplier: 0.5,
|
||||
Prompt: finalPrompt,
|
||||
Temperature: props["temperature"],
|
||||
DryMultiplier: props["dry_multiplier"],
|
||||
Stop: stopStrings,
|
||||
MinP: props["min_p"],
|
||||
NPredict: int32(props["n_predict"]),
|
||||
Stop: stopStrings,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user