4 Commits

Author SHA1 Message Date
Grail Finder
2c495253c2 Chore: panic is not solved (WIP) 2026-03-07 09:40:58 +03:00
Grail Finder
118a0a0d55 Chore: cleanup logs 2026-03-07 09:08:01 +03:00
Grail Finder
44633d64c6 Chore: move to own file 2026-03-07 08:52:10 +03:00
Grail Finder
0598e3e86d Feat: kokoro onnx (WIP) 2026-03-07 08:35:44 +03:00
10 changed files with 909 additions and 567 deletions

View File

@@ -1,4 +1,4 @@
.PHONY: setconfig run lint lintall install-linters setup-whisper build-whisper download-whisper-model docker-up docker-down docker-logs noextra-run installdelve checkdelve fetch-onnx install-onnx-deps
.PHONY: setconfig run lint lintall install-linters setup-whisper build-whisper download-whisper-model docker-up docker-down docker-logs noextra-run installdelve checkdelve fetch-onnx install-onnx-deps fetch-kokoro-voices install-espeak
run: setconfig
go build -tags extra -o gf-lt && ./gf-lt
@@ -33,6 +33,9 @@ lintall: lint
fetch-onnx:
mkdir -p onnx/embedgemma && curl -o onnx/embedgemma/config.json -L https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX/resolve/main/config.json && curl -o onnx/embedgemma/tokenizer.json -L https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX/resolve/main/tokenizer.json && curl -o onnx/embedgemma/model_q4.onnx -L https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX/resolve/main/onnx/model_q4.onnx && curl -o onnx/embedgemma/model_q4.onnx_data -L https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX/resolve/main/onnx/model_q4.onnx_data?download=true
fetch-kokoro-onnx:
mkdir -p onnx/kokoro && curl -o onnx/kokoro/config.json -L https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main/config.json && curl -o onnx/kokoro/tokenizer.json -L https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main/tokenizer.json && curl -o onnx/kokoro/model_quantized.onnx -L https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX/resolve/main/onnx/model_quantized.onnx && curl -o onnx/kokoro/voices.bin -L https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin
install-onnx-deps: ## Install ONNX Runtime with CUDA support (or CPU fallback)
@echo "=== ONNX Runtime Installer ===" && \
echo "" && \
@@ -194,3 +197,25 @@ docker-logs-whisper: ## View logs from Whisper STT service only
docker-logs-kokoro: ## View logs from Kokoro TTS service only
@echo "Displaying logs from Kokoro TTS service..."
docker-compose -f batteries/docker-compose.yml logs -f kokoro-tts
# Kokoro ONNX TTS Setup
install-espeak: ## Install espeak-ng for phoneme tokenization
@echo "=== Installing espeak-ng ===" && \
if command -v espeak-ng >/dev/null 2>&1; then \
echo "espeak-ng is already installed:" && \
espeak-ng --version && \
exit 0; \
fi && \
echo "Installing espeak-ng..." && \
sudo apt-get update && \
sudo apt-get install -y espeak-ng espeak && \
echo "espeak-ng installed successfully!" && \
espeak-ng --version
fetch-kokoro-voices: ## Download Kokoro voice files (PyTorch format)
@echo "=== Downloading Kokoro voices ===" && \
mkdir -p onnx/kokoro/voices && \
echo "Downloading af_bella voice..." && \
curl -L -o onnx/kokoro/voices/af_bella.pt https://raw.githubusercontent.com/hexgrad/kokoro/main/kokoro/voices/af_heart.pt && \
echo "Voice file downloaded to onnx/kokoro/voices/" && \
ls -lh onnx/kokoro/voices/

4
bot.go
View File

@@ -418,7 +418,9 @@ func fetchLCPModelsWithStatus() (*models.LCPModels, error) {
if err := json.NewDecoder(resp.Body).Decode(data); err != nil {
return nil, err
}
localModelsMu.Lock()
localModelsData = data
localModelsMu.Unlock()
return data, nil
}
@@ -1497,7 +1499,7 @@ func init() {
// load cards
basicCard.Role = cfg.AssistantRole
logLevel.Set(slog.LevelInfo)
logger = slog.New(slog.NewTextHandler(logfile, &slog.HandlerOptions{Level: logLevel}))
logger = slog.New(slog.NewTextHandler(logfile, &slog.HandlerOptions{Level: logLevel, AddSource: true}))
store = storage.NewProviderSQL(cfg.DBPATH, logger)
if store == nil {
cancel()

View File

@@ -61,6 +61,10 @@ type Config struct {
TTS_SPEED float32 `toml:"TTS_SPEED"`
TTS_PROVIDER string `toml:"TTS_PROVIDER"`
TTS_LANGUAGE string `toml:"TTS_LANGUAGE"`
// Kokoro ONNX TTS
KokoroModelPath string `toml:"KokoroModelPath"`
KokoroVoicesPath string `toml:"KokoroVoicesPath"`
KokoroVoice string `toml:"KokoroVoice"`
// STT
STT_TYPE string `toml:"STT_TYPE"` // WHISPER_SERVER, WHISPER_BINARY
STT_URL string `toml:"STT_URL"`

View File

@@ -1,218 +0,0 @@
//go:build extra
// +build extra
package extra
import (
"fmt"
"gf-lt/models"
"io"
"log/slog"
"os/exec"
"strings"
"sync"
google_translate_tts "github.com/GrailFinder/google-translate-tts"
"github.com/neurosnap/sentences/english"
)
type GoogleTranslateOrator struct {
logger *slog.Logger
mu sync.Mutex
speech *google_translate_tts.Speech
// fields for playback control
cmd *exec.Cmd
cmdMu sync.Mutex
stopCh chan struct{}
// text buffer and interrupt flag
textBuffer strings.Builder
interrupt bool
Speed float32
}
func (o *GoogleTranslateOrator) stoproutine() {
for {
<-TTSDoneChan
o.logger.Debug("orator got done signal")
o.Stop()
for len(TTSTextChan) > 0 {
<-TTSTextChan
}
o.mu.Lock()
o.textBuffer.Reset()
o.interrupt = true
o.mu.Unlock()
}
}
func (o *GoogleTranslateOrator) readroutine() {
tokenizer, _ := english.NewSentenceTokenizer(nil)
for {
select {
case chunk := <-TTSTextChan:
o.mu.Lock()
o.interrupt = false
_, err := o.textBuffer.WriteString(chunk)
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
o.mu.Unlock()
continue
}
text := o.textBuffer.String()
sentences := tokenizer.Tokenize(text)
o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
if len(sentences) <= 1 {
o.mu.Unlock()
continue
}
completeSentences := sentences[:len(sentences)-1]
remaining := sentences[len(sentences)-1].Text
o.textBuffer.Reset()
o.textBuffer.WriteString(remaining)
o.mu.Unlock()
for _, sentence := range completeSentences {
o.mu.Lock()
interrupted := o.interrupt
o.mu.Unlock()
if interrupted {
return
}
cleanedText := models.CleanText(sentence.Text)
if cleanedText == "" {
continue
}
o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
if err := o.Speak(cleanedText); err != nil {
o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
}
}
case <-TTSFlushChan:
o.logger.Debug("got flushchan signal start")
// lln is done get the whole message out
if len(TTSTextChan) > 0 { // otherwise might get stuck
for chunk := range TTSTextChan {
o.mu.Lock()
_, err := o.textBuffer.WriteString(chunk)
o.mu.Unlock()
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
continue
}
if len(TTSTextChan) == 0 {
break
}
}
}
o.mu.Lock()
remaining := o.textBuffer.String()
remaining = models.CleanText(remaining)
o.textBuffer.Reset()
o.mu.Unlock()
if remaining == "" {
continue
}
o.logger.Debug("calling Speak with remainder", "rem", remaining)
sentencesRem := tokenizer.Tokenize(remaining)
for _, rs := range sentencesRem { // to avoid dumping large volume of text
o.mu.Lock()
interrupt := o.interrupt
o.mu.Unlock()
if interrupt {
break
}
if err := o.Speak(rs.Text); err != nil {
o.logger.Error("tts failed", "sentence", rs.Text, "error", err)
}
}
}
}
}
func (o *GoogleTranslateOrator) GetLogger() *slog.Logger {
return o.logger
}
func (o *GoogleTranslateOrator) Speak(text string) error {
o.logger.Debug("fn: Speak is called", "text-len", len(text))
// Generate MP3 data directly as an io.Reader
reader, err := o.speech.GenerateSpeech(text)
if err != nil {
return fmt.Errorf("generate speech failed: %w", err)
}
// Wrap in io.NopCloser since GenerateSpeech returns io.Reader (no close needed)
body := io.NopCloser(reader)
defer body.Close()
// Build ffplay command with optional speed filter
args := []string{"-nodisp", "-autoexit"}
if o.Speed > 0.1 && o.Speed != 1.0 {
// atempo range is 0.5 to 2.0; you might clamp it here
args = append(args, "-af", fmt.Sprintf("atempo=%.2f", o.Speed))
}
args = append(args, "-i", "pipe:0")
cmd := exec.Command("ffplay", args...)
stdin, err := cmd.StdinPipe()
if err != nil {
return fmt.Errorf("failed to get stdin pipe: %w", err)
}
o.cmdMu.Lock()
o.cmd = cmd
o.stopCh = make(chan struct{})
o.cmdMu.Unlock()
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start ffplay: %w", err)
}
copyErr := make(chan error, 1)
go func() {
_, err := io.Copy(stdin, body)
stdin.Close()
copyErr <- err
}()
done := make(chan error, 1)
go func() {
done <- cmd.Wait()
}()
select {
case <-o.stopCh:
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
}
<-done
return nil
case copyErrVal := <-copyErr:
if copyErrVal != nil {
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
}
<-done
return copyErrVal
}
return <-done
case err := <-done:
return err
}
}
func (o *GoogleTranslateOrator) Stop() {
o.cmdMu.Lock()
defer o.cmdMu.Unlock()
// Signal any running Speak to stop
if o.stopCh != nil {
select {
case <-o.stopCh: // already closed
default:
close(o.stopCh)
}
o.stopCh = nil
}
// Kill the external player process if it's still running
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
o.cmd.Wait() // clean up zombie process
o.cmd = nil
}
// Also reset text buffer and interrupt flag (with o.mu)
o.mu.Lock()
o.textBuffer.Reset()
o.interrupt = true
o.mu.Unlock()
}

View File

@@ -1,259 +0,0 @@
//go:build extra
// +build extra
package extra
import (
"bytes"
"encoding/json"
"fmt"
"gf-lt/models"
"io"
"log/slog"
"net/http"
"os/exec"
"strings"
"sync"
"github.com/neurosnap/sentences/english"
)
type KokoroOrator struct {
logger *slog.Logger
mu sync.Mutex
URL string
Format models.AudioFormat
Stream bool
Speed float32
Language string
Voice string
// fields for playback control
cmd *exec.Cmd
cmdMu sync.Mutex
stopCh chan struct{}
// textBuffer, interrupt etc. remain the same
textBuffer strings.Builder
interrupt bool
}
func (o *KokoroOrator) GetLogger() *slog.Logger {
return o.logger
}
func (o *KokoroOrator) Speak(text string) error {
o.logger.Debug("fn: Speak is called", "text-len", len(text))
body, err := o.requestSound(text)
if err != nil {
return fmt.Errorf("request failed: %w", err)
}
defer body.Close()
cmd := exec.Command("ffplay", "-nodisp", "-autoexit", "-i", "pipe:0")
stdin, err := cmd.StdinPipe()
if err != nil {
return fmt.Errorf("failed to get stdin pipe: %w", err)
}
o.cmdMu.Lock()
o.cmd = cmd
o.stopCh = make(chan struct{})
o.cmdMu.Unlock()
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start ffplay: %w", err)
}
// Copy audio in background
copyErr := make(chan error, 1)
go func() {
_, err := io.Copy(stdin, body)
stdin.Close()
copyErr <- err
}()
// Wait for player in background
done := make(chan error, 1)
go func() {
done <- cmd.Wait()
}()
// Wait for BOTH copy and player, but ensure we block until done
select {
case <-o.stopCh:
// Stop requested: kill player and wait for it to exit
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
}
<-done // Wait for process to actually exit
return nil
case copyErrVal := <-copyErr:
if copyErrVal != nil {
// Copy failed: kill player and wait
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
}
<-done
return copyErrVal
}
// Copy succeeded, now wait for playback to complete
return <-done
case err := <-done:
// Playback finished normally (copy must have succeeded or player would have exited early)
return err
}
}
func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
if o.URL == "" {
return nil, fmt.Errorf("TTS URL is empty")
}
payload := map[string]interface{}{
"input": text,
"voice": o.Voice,
"response_format": o.Format,
"download_format": o.Format,
"stream": o.Stream,
"speed": o.Speed,
// "return_download_link": true,
"lang_code": o.Language,
}
payloadBytes, err := json.Marshal(payload)
if err != nil {
return nil, fmt.Errorf("failed to marshal payload: %w", err)
}
req, err := http.NewRequest("POST", o.URL, bytes.NewBuffer(payloadBytes)) //nolint:noctx
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("accept", "application/json")
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("request failed: %w", err)
}
if resp.StatusCode != http.StatusOK {
defer resp.Body.Close()
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
return resp.Body, nil
}
func (o *KokoroOrator) stoproutine() {
for {
<-TTSDoneChan
o.logger.Debug("orator got done signal")
// 1. Stop any ongoing playback (kills external player, closes stopCh)
o.Stop()
// 2. Drain any pending text chunks
for len(TTSTextChan) > 0 {
<-TTSTextChan
}
// 3. Reset internal state
o.mu.Lock()
o.textBuffer.Reset()
o.interrupt = true
o.mu.Unlock()
}
}
func (o *KokoroOrator) Stop() {
o.cmdMu.Lock()
defer o.cmdMu.Unlock()
// Signal any running Speak to stop
if o.stopCh != nil {
select {
case <-o.stopCh: // already closed
default:
close(o.stopCh)
}
o.stopCh = nil
}
// Kill the external player process if it's still running
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
o.cmd.Wait() // clean up zombie process
o.cmd = nil
}
// Also reset text buffer and interrupt flag (with o.mu)
o.mu.Lock()
o.textBuffer.Reset()
o.interrupt = true
o.mu.Unlock()
}
func (o *KokoroOrator) readroutine() {
tokenizer, _ := english.NewSentenceTokenizer(nil)
for {
select {
case chunk := <-TTSTextChan:
o.mu.Lock()
o.interrupt = false
_, err := o.textBuffer.WriteString(chunk)
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
o.mu.Unlock()
continue
}
text := o.textBuffer.String()
sentences := tokenizer.Tokenize(text)
o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
if len(sentences) <= 1 {
o.mu.Unlock()
continue
}
completeSentences := sentences[:len(sentences)-1]
remaining := sentences[len(sentences)-1].Text
o.textBuffer.Reset()
o.textBuffer.WriteString(remaining)
o.mu.Unlock()
for _, sentence := range completeSentences {
o.mu.Lock()
interrupted := o.interrupt
o.mu.Unlock()
if interrupted {
return
}
cleanedText := models.CleanText(sentence.Text)
if cleanedText == "" {
continue
}
o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
if err := o.Speak(cleanedText); err != nil {
o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
}
}
case <-TTSFlushChan:
o.logger.Debug("got flushchan signal start")
// lln is done get the whole message out
if len(TTSTextChan) > 0 { // otherwise might get stuck
for chunk := range TTSTextChan {
o.mu.Lock()
_, err := o.textBuffer.WriteString(chunk)
o.mu.Unlock()
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
continue
}
if len(TTSTextChan) == 0 {
break
}
}
}
// flush remaining text
o.mu.Lock()
remaining := o.textBuffer.String()
remaining = models.CleanText(remaining)
o.textBuffer.Reset()
o.mu.Unlock()
if remaining == "" {
continue
}
o.logger.Debug("calling Speak with remainder", "rem", remaining)
sentencesRem := tokenizer.Tokenize(remaining)
for _, rs := range sentencesRem { // to avoid dumping large volume of text
o.mu.Lock()
interrupt := o.interrupt
o.mu.Unlock()
if interrupt {
break
}
if err := o.Speak(rs.Text); err != nil {
o.logger.Error("tts failed", "sentence", rs, "error", err)
}
}
}
}
}

421
extra/kokoro_onnx.go Normal file
View File

@@ -0,0 +1,421 @@
//go:build extra
// +build extra
package extra
import (
"bytes"
"fmt"
"gf-lt/models"
"gf-lt/onnx"
"log/slog"
"os/exec"
"strings"
"sync"
"time"
"github.com/gopxl/beep/v2"
"github.com/gopxl/beep/v2/speaker"
"github.com/gopxl/beep/v2/wav"
"github.com/neurosnap/sentences/english"
"github.com/yalue/onnxruntime_go"
)
// KokoroONNXOrator implements Kokoro TTS using ONNX runtime
type KokoroONNXOrator struct {
logger *slog.Logger
mu sync.Mutex
session *onnxruntime_go.DynamicAdvancedSession
phonemeMap map[string]int
espeakCmd string
voice string
speed float32
styleVector []float32
currentStream *beep.Ctrl
currentDone chan bool
textBuffer strings.Builder
interrupt bool
modelLoaded bool
modelPath string
voicesPath string
}
// Phoneme to token ID mapping from Kokoro tokenizer.json
var kokoroPhonemeMap = map[string]int{
"$": 0, ";": 1, ":": 2, ",": 3, ".": 4, "!": 5, "?": 6, "—": 9, "…": 10, "\"": 11, "(": 12, ")": 13, "“": 14, "”": 15, " ": 16, "̃": 17, "ˢ": 18, "ˤ": 19, "˦": 20, "˨": 21, "ᾝ": 22, "⭧": 23,
"A": 24, "I": 25, "O": 31, "Q": 33, "S": 35, "T": 36, "W": 39, "Y": 41, "ʲ": 42,
"a": 43, "b": 44, "c": 45, "d": 46, "e": 47, "f": 48, "h": 50, "i": 51, "j": 52, "k": 53, "l": 54, "m": 55, "n": 56, "o": 57, "p": 58, "q": 59, "r": 60, "s": 61, "t": 62, "u": 63, "v": 64, "w": 65, "x": 66, "y": 67, "z": 68,
"ɑ": 69, "ɐ": 70, "ɒ": 71, "æ": 72, "β": 75, "ɔ": 76, "ɕ": 77, "ç": 78, "ɖ": 80, "ð": 81, "˔": 82, "ə": 83, "ɚ": 85, "ɛ": 86, "ɜ": 87, "ɟ": 90, "ɡ": 92, "ɥ": 99, "ɨ": 101, "ɪ": 102, "ɝ": 103, "ɯ": 110, "ɰ": 111, "ŋ": 112, "ɳ": 113, "ɲ": 114, "ɴ": 115, "ø": 116, "ɸ": 118, "θ": 119, "œ": 120, "ɹ": 123, "ɾ": 125, "ɺ": 126, "ʁ": 128, "ɽ": 129, "ʂ": 130, "ʃ": 131, "ʈ": 132, "˧": 133, "ʊ": 135, "ʋ": 136, "ʌ": 138, "ɢ": 139, "ɣ": 140, "χ": 142, "ʎ": 143, "ʒ": 147, "ʔ": 148,
"ˈ": 156, "ˌ": 157, "ː": 158, "̰": 162, "̊": 164, "↕": 169, "→": 171, "↗": 172, "↘": 173, "ᶻ": 177,
}
func (o *KokoroONNXOrator) ensureInitialized(modelPath string) error {
if o.modelLoaded {
return nil
}
o.mu.Lock()
defer o.mu.Unlock()
if o.modelLoaded {
return nil
}
if modelPath == "" {
o.logger.Error("modelPath is empty, cannot load ONNX model")
return fmt.Errorf("modelPath is empty, set KokoroModelPath in config")
}
// Initialize ONNX runtime (shared with embedder)
if err := onnx.Init(); err != nil {
o.logger.Error("ONNX init failed", "error", err)
return fmt.Errorf("ONNX init failed: %w", err)
}
if onnx.HasCUDASupport() {
o.logger.Info("ONNX using CUDA")
} else {
o.logger.Info("ONNX using CPU fallback")
}
if o.phonemeMap == nil {
o.phonemeMap = kokoroPhonemeMap
}
if o.espeakCmd == "" {
o.espeakCmd = "espeak-ng"
if _, err := exec.LookPath(o.espeakCmd); err != nil {
o.espeakCmd = "espeak"
if _, err := exec.LookPath(o.espeakCmd); err != nil {
return fmt.Errorf("espeak-ng or espeak not found. Install with: sudo apt-get install espeak-ng")
}
}
}
o.logger.Info("using espeak command", "cmd", o.espeakCmd)
// Load voice embedding if not already loaded
if o.styleVector == nil {
voiceName := o.voice
if voiceName == "" {
voiceName = "af_bella"
}
if o.voicesPath != "" {
styleVec, err := onnx.LoadVoice(o.voicesPath, voiceName)
if err != nil {
o.logger.Warn("failed to load voice, using zeros", "error", err, "voice", voiceName)
o.styleVector = make([]float32, 256)
} else {
// Shape is (510, 1, 256), we want the last 256 values (or first? let's use mean or just pick one)
// Actually, let's average across all 510 to get a single 256-dim vector
if len(styleVec) != 510*256 {
o.logger.Error("voice embedding has unexpected size", "len", len(styleVec))
err = fmt.Errorf("voice embedding has unexpected size", "len", len(styleVec))
return err
}
o.styleVector = make([]float32, 256)
for i := 0; i < 256; i++ {
var sum float32
for j := 0; j < 510; j++ {
sum += styleVec[j*256+i]
}
o.styleVector[i] = sum / 510.0
}
o.logger.Info("loaded voice embedding", "voice", voiceName)
}
} else {
o.logger.Warn("no voices path configured, using zeros for style")
o.styleVector = make([]float32, 256)
}
}
opts, err := onnx.NewSessionOptions()
if err != nil {
return fmt.Errorf("failed to create session options: %w", err)
}
defer func() { _ = opts.Destroy() }()
if onnx.HasCUDASupport() {
o.logger.Info("session options created with CUDA")
} else {
o.logger.Info("session options created with CPU")
}
session, err := onnxruntime_go.NewDynamicAdvancedSession(
modelPath,
[]string{"input_ids", "style", "speed"},
[]string{"waveform"},
opts,
)
if err != nil {
o.logger.Error("failed to create ONNX session", "error", err)
return fmt.Errorf("failed to create ONNX session: %w", err)
}
o.session = session
o.modelLoaded = true
o.logger.Info("Kokoro ONNX model loaded successfully", "model", modelPath)
return nil
}
func (o *KokoroONNXOrator) textToPhonemes(text string) (string, error) {
cmd := exec.Command(o.espeakCmd, "-x", "-q", text)
output, err := cmd.Output()
if err != nil {
o.logger.Error("espeak failed", "error", err, "cmd", o.espeakCmd, "text", text)
return "", fmt.Errorf("espeak failed: %w", err)
}
phonemeStr := strings.TrimSpace(string(output))
return phonemeStr, nil
}
func (o *KokoroONNXOrator) phonemesToTokens(phonemeStr string) ([]int, error) {
if phonemeStr == "" {
o.logger.Error("empty phoneme string")
return nil, fmt.Errorf("empty phoneme string")
}
// Iterate over each character in the phoneme string
tokens := make([]int, 0)
for _, ch := range phonemeStr {
chStr := string(ch)
if tokenID, ok := o.phonemeMap[chStr]; ok {
tokens = append(tokens, tokenID)
}
}
if len(tokens) == 0 {
o.logger.Error("no phonemes mapped to tokens", "phonemeStr", phonemeStr)
return nil, fmt.Errorf("no valid phonemes mapped to tokens")
}
return tokens, nil
}
func (o *KokoroONNXOrator) generateAudio(text string) ([]float32, error) {
if err := o.ensureInitialized(o.modelPath); err != nil {
o.logger.Error("ensureInitialized failed", "error", err)
return nil, err
}
phonemeStr, err := o.textToPhonemes(text)
if err != nil {
o.logger.Error("phoneme conversion failed", "error", err)
return nil, fmt.Errorf("phoneme conversion failed: %w", err)
}
tokens, err := o.phonemesToTokens(phonemeStr)
if err != nil {
o.logger.Error("token conversion failed", "error", err)
return nil, fmt.Errorf("token conversion failed: %w", err)
}
if len(tokens) > 510 {
return nil, fmt.Errorf("text too long: %d tokens (max 510)", len(tokens))
}
tokens = append([]int{0}, tokens...)
tokens = append(tokens, 0)
inputIDs := make([]int64, len(tokens))
for i, t := range tokens {
inputIDs[i] = int64(t)
}
inputTensor, err := onnxruntime_go.NewTensor[int64](
onnxruntime_go.NewShape(1, int64(len(inputIDs))),
inputIDs,
)
if err != nil {
o.logger.Error("failed to create input tensor", "error", err)
return nil, fmt.Errorf("failed to create input tensor: %w", err)
}
defer func() { _ = inputTensor.Destroy() }()
styleTensor, err := onnxruntime_go.NewTensor[float32](
onnxruntime_go.NewShape(1, 256),
o.styleVector,
)
if err != nil {
o.logger.Error("failed to create style tensor", "error", err)
return nil, fmt.Errorf("failed to create style tensor: %w", err)
}
defer func() { _ = styleTensor.Destroy() }()
speedTensor, err := onnxruntime_go.NewTensor[float32](
onnxruntime_go.NewShape(1),
[]float32{o.speed},
)
if err != nil {
o.logger.Error("failed to create speed tensor", "error", err)
return nil, fmt.Errorf("failed to create speed tensor: %w", err)
}
defer func() { _ = speedTensor.Destroy() }()
outputTensor, err := onnxruntime_go.NewEmptyTensor[float32](
onnxruntime_go.NewShape(1, 512),
)
if err != nil {
o.logger.Error("failed to create output tensor", "error", err)
return nil, fmt.Errorf("failed to create output tensor: %w", err)
}
defer func() { _ = outputTensor.Destroy() }()
err = o.session.Run(
[]onnxruntime_go.Value{inputTensor, styleTensor, speedTensor},
[]onnxruntime_go.Value{outputTensor},
)
if err != nil {
o.logger.Error("ONNX inference failed", "error", err)
return nil, fmt.Errorf("ONNX inference failed: %w", err)
}
audioData := outputTensor.GetData()
if len(audioData) == 0 {
o.logger.Error("empty audio output from ONNX")
return nil, fmt.Errorf("empty audio output")
}
audio := make([]float32, len(audioData))
copy(audio, audioData)
return audio, nil
}
func (o *KokoroONNXOrator) Speak(text string) error {
audio, err := o.generateAudio(text)
if err != nil {
o.logger.Error("audio generation failed", "error", err)
return fmt.Errorf("audio generation failed: %w", err)
}
// Create streamer for encoding
encodeStreamer := beep.StreamerFunc(func(samples [][2]float64) (n int, ok bool) {
for i := range samples {
if i >= len(audio) {
return i, false
}
samples[i][0] = float64(audio[i])
samples[i][1] = float64(audio[i])
}
return len(audio), true
})
buf := &seekableBuffer{new(bytes.Buffer)}
err = wav.Encode(buf, encodeStreamer, beep.Format{
SampleRate: 24000,
NumChannels: 1,
Precision: 2,
})
if err != nil {
o.logger.Error("wav encoding failed", "error", err)
return fmt.Errorf("wav encoding failed: %w", err)
}
decodedStreamer, format, err := wav.Decode(bytes.NewReader(buf.Bytes()))
if err != nil {
o.logger.Error("wav decode failed", "error", err)
return fmt.Errorf("wav decode failed: %w", err)
}
defer decodedStreamer.Close()
if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
o.logger.Error("speaker init failed", "error", err)
return fmt.Errorf("speaker init failed: %w", err)
}
o.logger.Info("playing audio", "sampleRate", format.SampleRate, "channels", format.NumChannels)
done := make(chan bool)
o.mu.Lock()
o.currentDone = done
o.currentStream = &beep.Ctrl{Streamer: beep.Seq(decodedStreamer, beep.Callback(func() {
o.mu.Lock()
close(done)
o.currentStream = nil
o.currentDone = nil
o.mu.Unlock()
})), Paused: false}
o.mu.Unlock()
speaker.Play(o.currentStream)
<-done
return nil
}
func (o *KokoroONNXOrator) Stop() {
speaker.Lock()
defer speaker.Unlock()
o.mu.Lock()
defer o.mu.Unlock()
if o.currentStream != nil {
o.currentStream.Streamer = nil
}
}
func (o *KokoroONNXOrator) GetLogger() *slog.Logger {
return o.logger
}
func (o *KokoroONNXOrator) stoproutine() {
for {
<-TTSDoneChan
o.Stop()
for len(TTSTextChan) > 0 {
<-TTSTextChan
}
o.mu.Lock()
o.textBuffer.Reset()
if o.currentDone != nil {
select {
case o.currentDone <- true:
default:
}
}
o.interrupt = true
o.mu.Unlock()
}
}
func (o *KokoroONNXOrator) readroutine() {
tokenizer, _ := english.NewSentenceTokenizer(nil)
for {
select {
case chunk := <-TTSTextChan:
o.mu.Lock()
o.interrupt = false
_, err := o.textBuffer.WriteString(chunk)
if err != nil {
o.logger.Warn("failed to write to buffer", "error", err)
o.mu.Unlock()
continue
}
text := o.textBuffer.String()
sentences := tokenizer.Tokenize(text)
if len(sentences) <= 1 {
o.mu.Unlock()
continue
}
completeSentences := sentences[:len(sentences)-1]
remaining := sentences[len(sentences)-1].Text
o.textBuffer.Reset()
o.textBuffer.WriteString(remaining)
o.mu.Unlock()
for _, sentence := range completeSentences {
o.mu.Lock()
interrupted := o.interrupt
o.mu.Unlock()
if interrupted {
return
}
cleanedText := models.CleanText(sentence.Text)
if cleanedText == "" {
continue
}
o.logger.Info("KokoroONNX speak", "text", cleanedText)
if err := o.Speak(cleanedText); err != nil {
o.logger.Error("KokoroONNX tts failed", "text", cleanedText, "error", err)
}
}
case <-TTSFlushChan:
if len(TTSTextChan) > 0 {
for chunk := range TTSTextChan {
o.mu.Lock()
_, err := o.textBuffer.WriteString(chunk)
o.mu.Unlock()
if err != nil {
continue
}
if len(TTSTextChan) == 0 {
break
}
}
}
o.mu.Lock()
remaining := o.textBuffer.String()
remaining = models.CleanText(remaining)
o.textBuffer.Reset()
o.mu.Unlock()
if remaining == "" {
continue
}
sentencesRem := tokenizer.Tokenize(remaining)
for _, rs := range sentencesRem {
o.mu.Lock()
interrupt := o.interrupt
o.mu.Unlock()
if interrupt {
break
}
if err := o.Speak(rs.Text); err != nil {
o.logger.Error("tts failed", "text", rs.Text, "error", err)
}
}
}
}
}

View File

@@ -4,13 +4,25 @@
package extra
import (
"bytes"
"encoding/json"
"fmt"
"gf-lt/config"
"gf-lt/models"
"io"
"log/slog"
"net/http"
"os"
"strings"
"sync"
"time"
google_translate_tts "github.com/GrailFinder/google-translate-tts"
"github.com/GrailFinder/google-translate-tts/handlers"
"github.com/gopxl/beep/v2"
"github.com/gopxl/beep/v2/mp3"
"github.com/gopxl/beep/v2/speaker"
"github.com/neurosnap/sentences/english"
)
var (
@@ -20,6 +32,14 @@ var (
// endsWithPunctuation = regexp.MustCompile(`[;.!?]$`)
)
type seekableBuffer struct {
*bytes.Buffer
}
func (s *seekableBuffer) Seek(offset int64, whence int) (int64, error) {
return 0, nil
}
type Orator interface {
Speak(text string) error
Stop()
@@ -27,6 +47,142 @@ type Orator interface {
GetLogger() *slog.Logger
}
// impl https://github.com/remsky/Kokoro-FastAPI
type KokoroOrator struct {
logger *slog.Logger
mu sync.Mutex
URL string
Format models.AudioFormat
Stream bool
Speed float32
Language string
Voice string
currentStream *beep.Ctrl // Added for playback control
currentDone chan bool
textBuffer strings.Builder
interrupt bool
// textBuffer bytes.Buffer
}
// Google Translate TTS implementation
type GoogleTranslateOrator struct {
logger *slog.Logger
mu sync.Mutex
speech *google_translate_tts.Speech
currentStream *beep.Ctrl
currentDone chan bool
textBuffer strings.Builder
interrupt bool
}
func (o *KokoroOrator) stoproutine() {
for {
<-TTSDoneChan
o.logger.Debug("orator got done signal")
o.Stop()
// drain the channel
for len(TTSTextChan) > 0 {
<-TTSTextChan
}
o.mu.Lock()
o.textBuffer.Reset()
if o.currentDone != nil {
select {
case o.currentDone <- true:
default:
// Channel might be closed, ignore
}
}
o.interrupt = true
o.mu.Unlock()
}
}
func (o *KokoroOrator) readroutine() {
tokenizer, _ := english.NewSentenceTokenizer(nil)
for {
select {
case chunk := <-TTSTextChan:
o.mu.Lock()
o.interrupt = false
_, err := o.textBuffer.WriteString(chunk)
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
o.mu.Unlock()
continue
}
text := o.textBuffer.String()
sentences := tokenizer.Tokenize(text)
o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
if len(sentences) <= 1 {
o.mu.Unlock()
continue
}
completeSentences := sentences[:len(sentences)-1]
remaining := sentences[len(sentences)-1].Text
o.textBuffer.Reset()
o.textBuffer.WriteString(remaining)
o.mu.Unlock()
for _, sentence := range completeSentences {
o.mu.Lock()
interrupted := o.interrupt
o.mu.Unlock()
if interrupted {
return
}
cleanedText := models.CleanText(sentence.Text)
if cleanedText == "" {
continue
}
o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
if err := o.Speak(cleanedText); err != nil {
o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
}
}
case <-TTSFlushChan:
o.logger.Debug("got flushchan signal start")
// lln is done get the whole message out
if len(TTSTextChan) > 0 { // otherwise might get stuck
for chunk := range TTSTextChan {
o.mu.Lock()
_, err := o.textBuffer.WriteString(chunk)
o.mu.Unlock()
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
continue
}
if len(TTSTextChan) == 0 {
break
}
}
}
// flush remaining text
o.mu.Lock()
remaining := o.textBuffer.String()
remaining = models.CleanText(remaining)
o.textBuffer.Reset()
o.mu.Unlock()
if remaining == "" {
continue
}
o.logger.Debug("calling Speak with remainder", "rem", remaining)
sentencesRem := tokenizer.Tokenize(remaining)
for _, rs := range sentencesRem { // to avoid dumping large volume of text
o.mu.Lock()
interrupt := o.interrupt
o.mu.Unlock()
if interrupt {
break
}
if err := o.Speak(rs.Text); err != nil {
o.logger.Error("tts failed", "sentence", rs, "error", err)
}
}
}
}
}
func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
provider := cfg.TTS_PROVIDER
if provider == "" {
@@ -46,6 +202,18 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
go orator.readroutine()
go orator.stoproutine()
return orator
case "kokoro_onnx":
log.Info("Initializing Kokoro ONNX TTS", "modelPath", cfg.KokoroModelPath, "voicesPath", cfg.KokoroVoicesPath, "voice", cfg.KokoroVoice, "speed", cfg.TTS_SPEED)
orator := &KokoroONNXOrator{
logger: log,
modelPath: cfg.KokoroModelPath,
voicesPath: cfg.KokoroVoicesPath,
speed: cfg.TTS_SPEED,
voice: cfg.KokoroVoice,
}
go orator.readroutine()
go orator.stoproutine()
return orator
default:
language := cfg.TTS_LANGUAGE
if language == "" {
@@ -56,14 +224,270 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
Language: language,
Proxy: "", // Proxy not supported
Speed: cfg.TTS_SPEED,
Handler: &handlers.Beep{},
}
orator := &GoogleTranslateOrator{
logger: log,
speech: speech,
Speed: cfg.TTS_SPEED,
}
go orator.readroutine()
go orator.stoproutine()
return orator
}
}
func (o *KokoroOrator) GetLogger() *slog.Logger {
return o.logger
}
func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
if o.URL == "" {
return nil, fmt.Errorf("TTS URL is empty")
}
payload := map[string]interface{}{
"input": text,
"voice": o.Voice,
"response_format": o.Format,
"download_format": o.Format,
"stream": o.Stream,
"speed": o.Speed,
// "return_download_link": true,
"lang_code": o.Language,
}
payloadBytes, err := json.Marshal(payload)
if err != nil {
return nil, fmt.Errorf("failed to marshal payload: %w", err)
}
req, err := http.NewRequest("POST", o.URL, bytes.NewBuffer(payloadBytes)) //nolint:noctx
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("accept", "application/json")
req.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("request failed: %w", err)
}
if resp.StatusCode != http.StatusOK {
defer resp.Body.Close()
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
return resp.Body, nil
}
func (o *KokoroOrator) Speak(text string) error {
o.logger.Debug("fn: Speak is called", "text-len", len(text))
body, err := o.requestSound(text)
if err != nil {
o.logger.Error("request failed", "error", err)
return fmt.Errorf("request failed: %w", err)
}
defer body.Close()
// Decode the mp3 audio from response body
streamer, format, err := mp3.Decode(body)
if err != nil {
o.logger.Error("mp3 decode failed", "error", err)
return fmt.Errorf("mp3 decode failed: %w", err)
}
defer streamer.Close()
// here it spams with errors that speaker cannot be initialized more than once, but how would we deal with many audio records then?
if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
o.logger.Debug("failed to init speaker", "error", err)
}
done := make(chan bool)
o.mu.Lock()
o.currentDone = done
o.currentStream = &beep.Ctrl{Streamer: beep.Seq(streamer, beep.Callback(func() {
o.mu.Lock()
close(done)
o.currentStream = nil
o.currentDone = nil
o.mu.Unlock()
})), Paused: false}
o.mu.Unlock()
speaker.Play(o.currentStream)
<-done
return nil
}
func (o *KokoroOrator) Stop() {
// speaker.Clear()
o.logger.Debug("attempted to stop orator", "orator", o)
speaker.Lock()
defer speaker.Unlock()
o.mu.Lock()
defer o.mu.Unlock()
if o.currentStream != nil {
// o.currentStream.Paused = true
o.currentStream.Streamer = nil
}
}
func (o *GoogleTranslateOrator) stoproutine() {
for {
<-TTSDoneChan
o.logger.Debug("orator got done signal")
o.Stop()
// drain the channel
for len(TTSTextChan) > 0 {
<-TTSTextChan
}
o.mu.Lock()
o.textBuffer.Reset()
if o.currentDone != nil {
select {
case o.currentDone <- true:
default:
// Channel might be closed, ignore
}
}
o.interrupt = true
o.mu.Unlock()
}
}
func (o *GoogleTranslateOrator) readroutine() {
tokenizer, _ := english.NewSentenceTokenizer(nil)
for {
select {
case chunk := <-TTSTextChan:
o.mu.Lock()
o.interrupt = false
_, err := o.textBuffer.WriteString(chunk)
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
o.mu.Unlock()
continue
}
text := o.textBuffer.String()
sentences := tokenizer.Tokenize(text)
o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
if len(sentences) <= 1 {
o.mu.Unlock()
continue
}
completeSentences := sentences[:len(sentences)-1]
remaining := sentences[len(sentences)-1].Text
o.textBuffer.Reset()
o.textBuffer.WriteString(remaining)
o.mu.Unlock()
for _, sentence := range completeSentences {
o.mu.Lock()
interrupted := o.interrupt
o.mu.Unlock()
if interrupted {
return
}
cleanedText := models.CleanText(sentence.Text)
if cleanedText == "" {
continue
}
o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
if err := o.Speak(cleanedText); err != nil {
o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
}
}
case <-TTSFlushChan:
o.logger.Debug("got flushchan signal start")
// lln is done get the whole message out
if len(TTSTextChan) > 0 { // otherwise might get stuck
for chunk := range TTSTextChan {
o.mu.Lock()
_, err := o.textBuffer.WriteString(chunk)
o.mu.Unlock()
if err != nil {
o.logger.Warn("failed to write to stringbuilder", "error", err)
continue
}
if len(TTSTextChan) == 0 {
break
}
}
}
o.mu.Lock()
remaining := o.textBuffer.String()
remaining = models.CleanText(remaining)
o.textBuffer.Reset()
o.mu.Unlock()
if remaining == "" {
continue
}
o.logger.Debug("calling Speak with remainder", "rem", remaining)
sentencesRem := tokenizer.Tokenize(remaining)
for _, rs := range sentencesRem { // to avoid dumping large volume of text
o.mu.Lock()
interrupt := o.interrupt
o.mu.Unlock()
if interrupt {
break
}
if err := o.Speak(rs.Text); err != nil {
o.logger.Error("tts failed", "sentence", rs.Text, "error", err)
}
}
}
}
}
func (o *GoogleTranslateOrator) GetLogger() *slog.Logger {
return o.logger
}
func (o *GoogleTranslateOrator) Speak(text string) error {
o.logger.Debug("fn: Speak is called", "text-len", len(text))
// Generate MP3 data using google-translate-tts
reader, err := o.speech.GenerateSpeech(text)
if err != nil {
o.logger.Error("generate speech failed", "error", err)
return fmt.Errorf("generate speech failed: %w", err)
}
// Decode the mp3 audio from reader (wrap with NopCloser for io.ReadCloser)
streamer, format, err := mp3.Decode(io.NopCloser(reader))
if err != nil {
o.logger.Error("mp3 decode failed", "error", err)
return fmt.Errorf("mp3 decode failed: %w", err)
}
defer streamer.Close()
playbackStreamer := beep.Streamer(streamer)
speed := o.speech.Speed
if speed <= 0 {
speed = 1.0
}
if speed != 1.0 {
playbackStreamer = beep.ResampleRatio(3, float64(speed), streamer)
}
// Initialize speaker with the format's sample rate
if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
o.logger.Debug("failed to init speaker", "error", err)
}
done := make(chan bool)
o.mu.Lock()
o.currentDone = done
o.currentStream = &beep.Ctrl{Streamer: beep.Seq(playbackStreamer, beep.Callback(func() {
o.mu.Lock()
close(done)
o.currentStream = nil
o.currentDone = nil
o.mu.Unlock()
})), Paused: false}
o.mu.Unlock()
speaker.Play(o.currentStream)
<-done // wait for playback to complete
return nil
}
func (o *GoogleTranslateOrator) Stop() {
o.logger.Debug("attempted to stop google translate orator")
speaker.Lock()
defer speaker.Unlock()
o.mu.Lock()
defer o.mu.Unlock()
if o.currentStream != nil {
o.currentStream.Streamer = nil
}
// Also stop the speech handler if possible
if o.speech != nil {
_ = o.speech.Stop()
}
}

7
go.mod
View File

@@ -4,11 +4,12 @@ go 1.25.1
require (
github.com/BurntSushi/toml v1.5.0
github.com/GrailFinder/google-translate-tts v0.1.4
github.com/GrailFinder/google-translate-tts v0.1.3
github.com/GrailFinder/searchagent v0.2.0
github.com/PuerkitoBio/goquery v1.11.0
github.com/gdamore/tcell/v2 v2.13.2
github.com/glebarez/go-sqlite v1.22.0
github.com/gopxl/beep/v2 v2.1.1
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
github.com/jmoiron/sqlx v1.4.0
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
@@ -24,17 +25,21 @@ require (
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/deckarep/golang-set/v2 v2.8.0 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/ebitengine/oto/v3 v3.4.0 // indirect
github.com/ebitengine/purego v0.9.1 // indirect
github.com/emirpasic/gods v1.18.1 // indirect
github.com/gdamore/encoding v1.0.1 // indirect
github.com/go-jose/go-jose/v3 v3.0.4 // indirect
github.com/go-stack/stack v1.8.1 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/hajimehoshi/go-mp3 v0.3.4 // indirect
github.com/hajimehoshi/oto/v2 v2.3.1 // indirect
github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/patrickmn/go-cache v2.1.0+incompatible // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/schollz/progressbar/v2 v2.15.0 // indirect

13
go.sum
View File

@@ -2,8 +2,8 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/GrailFinder/google-translate-tts v0.1.4 h1:NJoPZUGfBrmouQMN19MUcNPNUx4tmf4a8OZRME4E4Mg=
github.com/GrailFinder/google-translate-tts v0.1.4/go.mod h1:YIOLKR7sObazdUCrSex3u9OVBovU55eYgWa25vsQJ18=
github.com/GrailFinder/google-translate-tts v0.1.3 h1:Mww9tNzTWjjSh+OCbTPl/+21oMPKcUecXZfU7nTB/lA=
github.com/GrailFinder/google-translate-tts v0.1.3/go.mod h1:YIOLKR7sObazdUCrSex3u9OVBovU55eYgWa25vsQJ18=
github.com/GrailFinder/searchagent v0.2.0 h1:U2GVjLh/9xZt0xX9OcYk9Q2fMkyzyTiADPUmUisRdtQ=
github.com/GrailFinder/searchagent v0.2.0/go.mod h1:d66tn5+22LI8IGJREUsRBT60P0sFdgQgvQRqyvgItrs=
github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
@@ -17,6 +17,10 @@ github.com/deckarep/golang-set/v2 v2.8.0 h1:swm0rlPCmdWn9mESxKOjWk8hXSqoxOp+Zlfu
github.com/deckarep/golang-set/v2 v2.8.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/ebitengine/oto/v3 v3.4.0 h1:br0PgASsEWaoWn38b2Goe7m1GKFYfNgnsjSd5Gg+/bQ=
github.com/ebitengine/oto/v3 v3.4.0/go.mod h1:IOleLVD0m+CMak3mRVwsYY8vTctQgOM0iiL6S7Ar7eI=
github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A=
github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/gdamore/encoding v1.0.1 h1:YzKZckdBL6jVt2Gc+5p82qhrGiqMdG/eNs6Wy0u3Uhw=
@@ -37,10 +41,13 @@ github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17k
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gopxl/beep/v2 v2.1.1 h1:6FYIYMm2qPAdWkjX+7xwKrViS1x0Po5kDMdRkq8NVbU=
github.com/gopxl/beep/v2 v2.1.1/go.mod h1:ZAm9TGQ9lvpoiFLd4zf5B1IuyxZhgRACMId1XJbaW0E=
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b h1:WEuQWBxelOGHA6z9lABqaMLMrfwVyMdN3UgRLT+YUPo=
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco=
github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
github.com/hajimehoshi/oto/v2 v2.3.1 h1:qrLKpNus2UfD674oxckKjNJmesp9hMh7u7QCrStB3Rc=
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
@@ -64,6 +71,8 @@ github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7
github.com/neurosnap/sentences v1.1.2/go.mod h1:/pwU4E9XNL21ygMIkOIllv/SMy2ujHwpf8GQPu1YPbQ=
github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/playwright-community/playwright-go v0.5700.1 h1:PNFb1byWqrTT720rEO0JL88C6Ju0EmUnR5deFLvtP/U=
github.com/playwright-community/playwright-go v0.5700.1/go.mod h1:MlSn1dZrx8rszbCxY6x3qK89ZesJUYVx21B2JnkoNF0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=

View File

@@ -7,6 +7,7 @@ import (
"fmt"
"gf-lt/config"
"gf-lt/models"
"gf-lt/onnx"
"log/slog"
"net/http"
"os"
@@ -156,43 +157,6 @@ type ONNXEmbedder struct {
modelPath string
}
var onnxInitOnce sync.Once
var onnxReady bool
var onnxLibPath string
var cudaLibPath string
var onnxLibPaths = []string{
"/usr/lib/libonnxruntime.so",
"/usr/lib/libonnxruntime.so.1.24.2",
"/usr/local/lib/libonnxruntime.so",
"/usr/lib/x86_64-linux-gnu/libonnxruntime.so",
"/opt/onnxruntime/lib/libonnxruntime.so",
}
var cudaLibPaths = []string{
"/usr/lib/libonnxruntime_providers_cuda.so",
"/usr/local/lib/libonnxruntime_providers_cuda.so",
"/opt/onnxruntime/lib/libonnxruntime_providers_cuda.so",
}
func findONNXLibrary() string {
for _, path := range onnxLibPaths {
if _, err := os.Stat(path); err == nil {
return path
}
}
return ""
}
func findCUDALibrary() string {
for _, path := range cudaLibPaths {
if _, err := os.Stat(path); err == nil {
return path
}
}
return ""
}
func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Logger) (*ONNXEmbedder, error) {
// Check if model and tokenizer files exist
if _, err := os.Stat(modelPath); err != nil {
@@ -202,17 +166,16 @@ func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Log
return nil, fmt.Errorf("tokenizer not found: %w", err)
}
// Find ONNX library
onnxLibPath = findONNXLibrary()
if onnxLibPath == "" {
return nil, errors.New("ONNX runtime library not found in standard locations")
// Initialize ONNX runtime
if err := onnx.Init(); err != nil {
return nil, fmt.Errorf("ONNX init failed: %w", err)
}
if onnx.HasCUDASupport() {
logger.Info("ONNX CUDA support enabled")
} else {
logger.Info("ONNX using CPU fallback")
}
// Find CUDA provider library (optional)
cudaLibPath = findCUDALibrary()
if cudaLibPath == "" {
fmt.Println("WARNING: CUDA provider library not found, will use CPU")
}
emb := &ONNXEmbedder{
tokenizerPath: tokenizerPath,
dims: dims,
@@ -239,26 +202,12 @@ func (e *ONNXEmbedder) ensureInitialized() error {
}
e.tokenizer = tok
}
onnxInitOnce.Do(func() {
onnxruntime_go.SetSharedLibraryPath(onnxLibPath)
if err := onnxruntime_go.InitializeEnvironment(); err != nil {
e.logger.Error("failed to initialize ONNX runtime", "error", err)
onnxReady = false
return
}
// Register CUDA provider if available
if cudaLibPath != "" {
if err := onnxruntime_go.RegisterExecutionProviderLibrary("CUDA", cudaLibPath); err != nil {
e.logger.Warn("failed to register CUDA provider", "error", err)
}
}
onnxReady = true
})
if !onnxReady {
// ONNX runtime already initialized by onnx.Init() in NewONNXEmbedder
if !onnx.IsReady() {
return errors.New("ONNX runtime not ready")
}
// Create session options
opts, err := onnxruntime_go.NewSessionOptions()
opts, err := onnx.NewSessionOptions()
if err != nil {
return fmt.Errorf("failed to create session options: %w", err)
}
@@ -266,27 +215,7 @@ func (e *ONNXEmbedder) ensureInitialized() error {
_ = opts.Destroy()
}()
// Try to add CUDA provider
useCUDA := cudaLibPath != ""
if useCUDA {
cudaOpts, err := onnxruntime_go.NewCUDAProviderOptions()
if err != nil {
e.logger.Warn("failed to create CUDA provider options, falling back to CPU", "error", err)
useCUDA = false
} else {
defer func() {
_ = cudaOpts.Destroy()
}()
if err := cudaOpts.Update(map[string]string{"device_id": "0"}); err != nil {
e.logger.Warn("failed to update CUDA options, falling back to CPU", "error", err)
useCUDA = false
} else if err := opts.AppendExecutionProviderCUDA(cudaOpts); err != nil {
e.logger.Warn("failed to append CUDA provider, falling back to CPU", "error", err)
useCUDA = false
}
}
}
if useCUDA {
if onnx.HasCUDASupport() {
e.logger.Info("Using CUDA for ONNX inference")
} else {
e.logger.Info("Using CPU for ONNX inference")