Feat: stop audio [WIP]

This commit is contained in:
Grail Finder
2025-05-19 09:42:47 +03:00
parent 2e5755c28a
commit a7e7da6f99
6 changed files with 98 additions and 82 deletions

2
bot.go
View File

@@ -524,7 +524,7 @@ func init() {
choseChunkParser() choseChunkParser()
httpClient = createClient(time.Second * 15) httpClient = createClient(time.Second * 15)
if cfg.TTS_ENABLED { if cfg.TTS_ENABLED {
orator = extra.InitOrator(logger, cfg.TTS_URL) orator = extra.NewOrator(logger, cfg)
} }
if cfg.STT_ENABLED { if cfg.STT_ENABLED {
asr = extra.NewWhisperSTT(logger, cfg.STT_URL, 16000) asr = extra.NewWhisperSTT(logger, cfg.STT_URL, 16000)

View File

@@ -15,6 +15,7 @@ RAGWorkers = 5
# extra tts # extra tts
TTS_ENABLED = false TTS_ENABLED = false
TTS_URL = "http://localhost:8880/v1/audio/speech" TTS_URL = "http://localhost:8880/v1/audio/speech"
TTS_SPEED = 1.0
# extra stt # extra stt
STT_ENABLED = false STT_ENABLED = false
STT_URL = "http://localhost:8081/inference" STT_URL = "http://localhost:8081/inference"

View File

@@ -42,6 +42,7 @@ type Config struct {
// TTS // TTS
TTS_URL string `toml:"TTS_URL"` TTS_URL string `toml:"TTS_URL"`
TTS_ENABLED bool `toml:"TTS_ENABLED"` TTS_ENABLED bool `toml:"TTS_ENABLED"`
TTS_SPEED float32 `toml:"TTS_SPEED"`
// STT // STT
STT_URL string `toml:"STT_URL"` STT_URL string `toml:"STT_URL"`
STT_ENABLED bool `toml:"STT_ENABLED"` STT_ENABLED bool `toml:"STT_ENABLED"`

View File

@@ -2,6 +2,7 @@ package extra
import ( import (
"bytes" "bytes"
"elefant/config"
"elefant/models" "elefant/models"
"encoding/json" "encoding/json"
"fmt" "fmt"
@@ -18,13 +19,16 @@ import (
) )
var ( var (
TTSTextChan = make(chan string, 1000) TTSTextChan = make(chan string, 10000)
TTSFlushChan = make(chan bool, 1) TTSFlushChan = make(chan bool, 1)
TTSDoneChan = make(chan bool, 1) TTSDoneChan = make(chan bool, 1)
) )
type Orator interface { type Orator interface {
Speak(text string) error Speak(text string) error
Stop()
// pause and resume?
GetSBuilder() strings.Builder
GetLogger() *slog.Logger GetLogger() *slog.Logger
} }
@@ -34,8 +38,26 @@ type KokoroOrator struct {
URL string URL string
Format models.AudioFormat Format models.AudioFormat
Stream bool Stream bool
Speed int8 Speed float32
Language string Language string
Voice string
currentStream *beep.Ctrl // Added for playback control
textBuffer strings.Builder
}
func stoproutine(orator Orator) {
select {
case <-TTSDoneChan:
orator.GetLogger().Info("orator got done signal")
orator.Stop()
// close(TTSTextChan)
// TTSTextChan = make(chan string, 10000)
// drain the channel
for len(TTSTextChan) > 0 {
<-TTSTextChan
}
return
}
} }
func readroutine(orator Orator) { func readroutine(orator Orator) {
@@ -70,84 +92,44 @@ func readroutine(orator Orator) {
break break
} }
} }
// INFO: if there is a lot of text, synthesizing all of it at once with tts will take some time;
// to avoid this pause, it might be better to keep splitting on sentences,
// keeping in mind that the remainder could be omitted by the tokenizer
// Flush remaining text // Flush remaining text
remaining := remainder.String() remaining := remainder.String()
orator.GetLogger().Info("flushing", "rem", remaining) remainder.Reset()
if remaining != "" { // but nothing is here? if remaining != "" {
orator.GetLogger().Info("flushing", "remaining", remaining) // orator.GetLogger().Info("flushing", "remaining", remaining)
if err := orator.Speak(remaining); err != nil { if err := orator.Speak(remaining); err != nil {
orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err) orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
} }
} }
case <-TTSDoneChan: // case <-TTSDoneChan:
// Flush remaining text // orator.GetLogger().Info("orator got done signal")
if remaining := sentenceBuf.String(); remaining != "" { // orator.Stop()
if err := orator.Speak(remaining); err != nil { // // it that the best way to empty channel?
orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err) // close(TTSTextChan)
} // TTSTextChan = make(chan string, 10000)
} // return
return
} }
} }
} }
func InitOrator(log *slog.Logger, URL string) Orator { func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
orator := &KokoroOrator{ orator := &KokoroOrator{
logger: log, logger: log,
URL: URL, URL: cfg.TTS_URL,
Format: models.AFMP3, Format: models.AFMP3,
Stream: false, Stream: false,
Speed: 1, Speed: cfg.TTS_SPEED,
Language: "a", Language: "a",
Voice: "af_bella(1)+af_sky(1)",
} }
go readroutine(orator) go readroutine(orator)
go stoproutine(orator)
return orator return orator
} }
// type AudioStream struct {
// TextChan chan string // Send text chunks here
// DoneChan chan bool // Close when streaming ends
// }
// func RunOrator(orator Orator) *AudioStream {
// stream := &AudioStream{
// TextChan: make(chan string, 1000),
// DoneChan: make(chan bool),
// }
// go func() {
// tokenizer, _ := english.NewSentenceTokenizer(nil)
// var sentenceBuf bytes.Buffer
// for {
// select {
// case chunk := <-stream.TextChan:
// sentenceBuf.WriteString(chunk)
// text := sentenceBuf.String()
// sentences := tokenizer.Tokenize(text)
// for i, sentence := range sentences {
// if i == len(sentences)-1 {
// sentenceBuf.Reset()
// sentenceBuf.WriteString(sentence.Text)
// continue
// }
// // Send complete sentence to TTS
// if err := orator.Speak(sentence.Text); err != nil {
// orator.GetLogger().Error("tts failed", "sentence", sentence.Text, "error", err)
// }
// }
// case <-stream.DoneChan:
// // Flush remaining text
// if remaining := sentenceBuf.String(); remaining != "" {
// if err := orator.Speak(remaining); err != nil {
// orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
// }
// }
// return
// }
// }
// }()
// return stream
// }
func (o *KokoroOrator) GetLogger() *slog.Logger { func (o *KokoroOrator) GetLogger() *slog.Logger {
return o.logger return o.logger
} }
@@ -155,12 +137,12 @@ func (o *KokoroOrator) GetLogger() *slog.Logger {
func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) { func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
payload := map[string]interface{}{ payload := map[string]interface{}{
"input": text, "input": text,
"voice": "af_bella(1)+af_sky(1)", "voice": o.Voice,
"response_format": "mp3", "response_format": o.Format,
"download_format": "mp3", "download_format": o.Format,
"stream": o.Stream, "stream": o.Stream,
"speed": o.Speed, "speed": o.Speed,
"return_download_link": true, // "return_download_link": true,
"lang_code": o.Language, "lang_code": o.Language,
} }
payloadBytes, err := json.Marshal(payload) payloadBytes, err := json.Marshal(payload)
@@ -185,6 +167,7 @@ func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
} }
func (o *KokoroOrator) Speak(text string) error { func (o *KokoroOrator) Speak(text string) error {
o.logger.Info("fn: Speak is called", "text-len", len(text))
body, err := o.requestSound(text) body, err := o.requestSound(text)
if err != nil { if err != nil {
o.logger.Error("request failed", "error", err) o.logger.Error("request failed", "error", err)
@@ -198,11 +181,33 @@ func (o *KokoroOrator) Speak(text string) error {
return fmt.Errorf("mp3 decode failed: %w", err) return fmt.Errorf("mp3 decode failed: %w", err)
} }
defer streamer.Close() defer streamer.Close()
speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)) // here it spams with errors that speaker cannot be initialized more than once, but how would we deal with many audio records then?
if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
o.logger.Debug("failed to init speaker", "error", err)
}
done := make(chan bool) done := make(chan bool)
speaker.Play(beep.Seq(streamer, beep.Callback(func() { // Create controllable stream and store reference
o.currentStream = &beep.Ctrl{Streamer: beep.Seq(streamer, beep.Callback(func() {
close(done) close(done)
}))) o.currentStream = nil
<-done })), Paused: false}
speaker.Play(o.currentStream)
<-done // we hang in this routine;
return nil return nil
} }
// Stop aborts the playback started by Speak, if there is one, and lets that
// Speak call return.
//
// Speak blocks until the callback placed at the end of its streamer sequence
// closes the done channel. Only detaching the streamer from the control would
// skip that callback, so Speak would wait forever and no later text could be
// spoken. The rest of the sequence is therefore consumed here, which discards
// the unplayed audio and runs the callback.
func (o *KokoroOrator) Stop() {
	// The speaker lock keeps the playback goroutine from pulling samples
	// from the same streamer while it is being consumed here.
	speaker.Lock()
	defer speaker.Unlock()
	ctrl := o.currentStream
	o.logger.Info("attempted to stop orator", "playing", ctrl != nil)
	if ctrl == nil {
		return
	}
	if ctrl.Streamer != nil {
		// The samples are thrown away; only reaching the end matters.
		scratch := make([][2]float64, 512)
		for {
			if _, ok := ctrl.Streamer.Stream(scratch); !ok {
				break
			}
		}
	}
	// A control without a streamer acts as drained, so the speaker drops it.
	ctrl.Streamer = nil
	o.currentStream = nil
}
// GetSBuilder returns the orator's text buffer by value, i.e. a copy of the
// strings.Builder held in textBuffer.
//
// NOTE(review): the strings package documents that a non-zero Builder must
// not be copied; writing to the returned copy after the original has been
// written to panics. Reading from it (String, Len) is fine.
func (o *KokoroOrator) GetSBuilder() strings.Builder {
	buffered := o.textBuffer
	return buffered
}

View File

@@ -3,6 +3,6 @@ package models
type AudioFormat string type AudioFormat string
const ( const (
AFOPUS AudioFormat = "opus" AFWav AudioFormat = "wav"
AFMP3 AudioFormat = "mp3" AFMP3 AudioFormat = "mp3"
) )

9
tui.go
View File

@@ -1,6 +1,7 @@
package main package main
import ( import (
"elefant/extra"
"elefant/models" "elefant/models"
"elefant/pngmeta" "elefant/pngmeta"
"fmt" "fmt"
@@ -708,6 +709,14 @@ func init() {
return nil return nil
} }
} }
// I need keybind for tts to shut up
if event.Key() == tcell.KeyCtrlA {
textArea.SetText("pressed ctrl+A", true)
if cfg.TTS_ENABLED {
// audioStream.TextChan <- chunk
extra.TTSDoneChan <- true
}
}
if event.Key() == tcell.KeyCtrlW { if event.Key() == tcell.KeyCtrlW {
// INFO: continue bot/text message // INFO: continue bot/text message
// without new role // without new role