Feat: image prompt for llama.cpp /completion

The change threads an optional image attachment through FormMsg and into the llama.cpp /completion payload: the attached image is read into a base64 data URL, a <__media__> marker is appended to the prompt for each attachment, and the request's prompt field becomes an object carrying prompt_string plus multimodal_data.

llm.go
@@ -6,6 +6,7 @@ import (
 	"gf-lt/models"
 	"io"
 	"strings"
+	"fmt"
 )
 
 var imageAttachmentPath string // Global variable to track image attachment for next message
@@ -82,6 +83,26 @@ func (lcp LCPCompletion) GetToken() string {
 
 func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, error) {
 	logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
+	localImageAttachmentPath := imageAttachmentPath
+	var multimodalData []string
+
+	if localImageAttachmentPath != "" {
+		imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
+		if err != nil {
+			logger.Error("failed to create image URL from path for completion", "error", err, "path", localImageAttachmentPath)
+			return nil, err
+		}
+		// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
+		parts := strings.SplitN(imageURL, ",", 2)
+		if len(parts) == 2 {
+			multimodalData = append(multimodalData, parts[1])
+		} else {
+			logger.Error("invalid image data URL format", "url", imageURL)
+			return nil, fmt.Errorf("invalid image data URL format")
+		}
+		imageAttachmentPath = "" // Clear the attachment after use
+	}
+
 	if msg != "" { // otherwise let the bot to continue
 		newMsg := models.RoleMsg{Role: role, Content: msg}
 		chatBody.Messages = append(chatBody.Messages, newMsg)
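The base64 split above relies on models.CreateImageURLFromPath returning an RFC 2397 data URL. That helper is not part of this diff; the following is only a hypothetical sketch of what it presumably does (the real gf-lt/models implementation may differ):

package models

import (
	"encoding/base64"
	"net/http"
	"os"
)

// CreateImageURLFromPath (hypothetical sketch): read an image file and wrap
// its bytes in a data URL such as "data:image/jpeg;base64,/9j/4AAQ...", so
// callers can split on the first comma to recover the raw base64 payload.
func CreateImageURLFromPath(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	mime := http.DetectContentType(raw) // sniffs "image/jpeg", "image/png", ...
	return "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(raw), nil
}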
@@ -118,9 +139,18 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
 	if cfg.ThinkUse && !cfg.ToolUse {
 		prompt += "<think>"
 	}
+	// Add multimodal media markers to the prompt text when multimodal data is present
+	// This is required by llama.cpp multimodal models so they know where to insert media
+	if len(multimodalData) > 0 {
+		// Add a media marker for each item in the multimodal data
+		for range multimodalData {
+			prompt += " <__media__>" // llama.cpp default multimodal marker
+		}
+	}
+
 	logger.Debug("checking prompt for /completion", "tool_use", cfg.ToolUse,
-		"msg", msg, "resume", resume, "prompt", prompt)
+		"msg", msg, "resume", resume, "prompt", prompt, "multimodal_data_count", len(multimodalData))
-	payload := models.NewLCPReq(prompt, defaultLCPProps, chatBody.MakeStopSlice())
+	payload := models.NewLCPReq(prompt, multimodalData, defaultLCPProps, chatBody.MakeStopSlice())
 	data, err := json.Marshal(payload)
 	if err != nil {
 		logger.Error("failed to form a msg", "error", err)
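To make the marker logic concrete, here is a small self-contained illustration of the prompt shape with two attached images (illustrative values; llama.cpp is expected to pair each marker with the multimodal_data entry at the same position):

package main

import "fmt"

func main() {
	// Illustrative only: two attached images yield two appended markers.
	multimodalData := []string{"<b64-image-1>", "<b64-image-2>"}
	prompt := "USER: compare these pictures\nASSISTANT:"
	for range multimodalData {
		prompt += " <__media__>"
	}
	fmt.Println(prompt)
	// Output: USER: compare these pictures
	// ASSISTANT: <__media__> <__media__>
}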

@@ -3,6 +3,7 @@ package main
 import (
 	"gf-lt/models"
 	"fmt"
+	"gf-lt/config"
 	"strings"
 	"testing"
 )
@@ -26,16 +27,16 @@ func TestRemoveThinking(t *testing.T) {
 	}
 	for i, tc := range cases {
 		t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
+			cfg = &config.Config{ToolRole: "tool"} // Initialize cfg.ToolRole for test
 			mNum := len(tc.cb.Messages)
 			removeThinking(tc.cb)
 			if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
-				t.Error("failed to delete tools msg", tc.cb.Messages, cfg.ToolRole)
+				t.Errorf("failed to delete tools msg %v; expected %d, got %d", tc.cb.Messages, mNum-int(tc.toolMsgs), len(tc.cb.Messages))
 			}
 			for _, msg := range tc.cb.Messages {
 				if strings.Contains(msg.Content, "<think>") {
 					t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
 				}
 			}
-		})
-	}
+		}) }
 }
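The reworked assertion can be exercised with the standard Go test runner from the repository root:

go test -run TestRemoveThinking ./...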

@@ -440,8 +440,9 @@ type LLMModels struct {
 
 type LlamaCPPReq struct {
 	Stream bool `json:"stream"`
-	// Messages []RoleMsg `json:"messages"`
-	Prompt string `json:"prompt"`
+	// For multimodal requests, prompt should be an object with prompt_string and multimodal_data
+	// For regular requests, prompt is a string
+	Prompt interface{} `json:"prompt"` // Can be string or object with prompt_string and multimodal_data
 	Temperature float32 `json:"temperature"`
 	DryMultiplier float32 `json:"dry_multiplier"`
 	Stop []string `json:"stop"`
@@ -466,17 +467,37 @@ type LlamaCPPReq struct {
 	// Samplers string `json:"samplers"`
 }
 
-func NewLCPReq(prompt string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+type PromptObject struct {
+	PromptString   string   `json:"prompt_string"`
+	MultimodalData []string `json:"multimodal_data,omitempty"`
+	// Alternative field name used by some llama.cpp implementations
+	ImageData []string `json:"image_data,omitempty"` // For compatibility
+}
+
+func NewLCPReq(prompt string, multimodalData []string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+	var finalPrompt interface{}
+
+	if len(multimodalData) > 0 {
+		// When multimodal data is present, use the object format as per Python example:
+		// { "prompt": { "prompt_string": "...", "multimodal_data": [...] } }
+		finalPrompt = PromptObject{
+			PromptString:   prompt,
+			MultimodalData: multimodalData,
+			ImageData:      multimodalData, // Also populate for compatibility with different llama.cpp versions
+		}
+	} else {
+		// When no multimodal data, use plain string
+		finalPrompt = prompt
+	}
+
 	return LlamaCPPReq{
 		Stream: true,
-		Prompt: prompt,
-		// Temperature: 0.8,
-		// DryMultiplier: 0.5,
+		Prompt: finalPrompt,
 		Temperature:   props["temperature"],
 		DryMultiplier: props["dry_multiplier"],
+		Stop:          stopStrings,
 		MinP:          props["min_p"],
 		NPredict:      int32(props["n_predict"]),
-		Stop: stopStrings,
 	}
 }
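Putting the pieces together, a request built by the new NewLCPReq with one attached image marshals to JSON along these lines (illustrative prompt and sampler props; the real values come from defaultLCPProps, and the full field set depends on the rest of LlamaCPPReq):

package main

import (
	"encoding/json"
	"fmt"

	"gf-lt/models"
)

func main() {
	// Hypothetical sampler props standing in for defaultLCPProps.
	props := map[string]float32{
		"temperature": 0.8, "dry_multiplier": 0, "min_p": 0.05, "n_predict": 512,
	}
	req := models.NewLCPReq(
		"USER: what is on the image?\nASSISTANT: <__media__>",
		[]string{"<b64-image>"}, props, []string{"USER:"})
	out, _ := json.MarshalIndent(req, "", "  ")
	fmt.Println(string(out))
	// "prompt" serializes as an object because multimodal data is present:
	//   "prompt": {
	//     "prompt_string": "USER: what is on the image?\nASSISTANT: <__media__>",
	//     "multimodal_data": ["<b64-image>"],
	//     "image_data": ["<b64-image>"]
	//   }
	// With no images attached, NewLCPReq falls back to a plain string prompt.
}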