Feat: stop audio [WIP]
This commit is contained in:
2
bot.go
2
bot.go
@@ -524,7 +524,7 @@ func init() {
|
|||||||
choseChunkParser()
|
choseChunkParser()
|
||||||
httpClient = createClient(time.Second * 15)
|
httpClient = createClient(time.Second * 15)
|
||||||
if cfg.TTS_ENABLED {
|
if cfg.TTS_ENABLED {
|
||||||
orator = extra.InitOrator(logger, cfg.TTS_URL)
|
orator = extra.NewOrator(logger, cfg)
|
||||||
}
|
}
|
||||||
if cfg.STT_ENABLED {
|
if cfg.STT_ENABLED {
|
||||||
asr = extra.NewWhisperSTT(logger, cfg.STT_URL, 16000)
|
asr = extra.NewWhisperSTT(logger, cfg.STT_URL, 16000)
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ RAGWorkers = 5
|
|||||||
# extra tts
|
# extra tts
|
||||||
TTS_ENABLED = false
|
TTS_ENABLED = false
|
||||||
TTS_URL = "http://localhost:8880/v1/audio/speech"
|
TTS_URL = "http://localhost:8880/v1/audio/speech"
|
||||||
|
TTS_SPEED = 1.0
|
||||||
# extra stt
|
# extra stt
|
||||||
STT_ENABLED = false
|
STT_ENABLED = false
|
||||||
STT_URL = "http://localhost:8081/inference"
|
STT_URL = "http://localhost:8081/inference"
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ type Config struct {
|
|||||||
// TTS
|
// TTS
|
||||||
TTS_URL string `toml:"TTS_URL"`
|
TTS_URL string `toml:"TTS_URL"`
|
||||||
TTS_ENABLED bool `toml:"TTS_ENABLED"`
|
TTS_ENABLED bool `toml:"TTS_ENABLED"`
|
||||||
|
TTS_SPEED float32 `toml:"TTS_SPEED"`
|
||||||
// STT
|
// STT
|
||||||
STT_URL string `toml:"STT_URL"`
|
STT_URL string `toml:"STT_URL"`
|
||||||
STT_ENABLED bool `toml:"STT_ENABLED"`
|
STT_ENABLED bool `toml:"STT_ENABLED"`
|
||||||
|
|||||||
141
extra/audio.go
141
extra/audio.go
@@ -2,6 +2,7 @@ package extra
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"elefant/config"
|
||||||
"elefant/models"
|
"elefant/models"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -18,13 +19,16 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
TTSTextChan = make(chan string, 1000)
|
TTSTextChan = make(chan string, 10000)
|
||||||
TTSFlushChan = make(chan bool, 1)
|
TTSFlushChan = make(chan bool, 1)
|
||||||
TTSDoneChan = make(chan bool, 1)
|
TTSDoneChan = make(chan bool, 1)
|
||||||
)
|
)
|
||||||
|
|
||||||
type Orator interface {
|
type Orator interface {
|
||||||
Speak(text string) error
|
Speak(text string) error
|
||||||
|
Stop()
|
||||||
|
// pause and resume?
|
||||||
|
GetSBuilder() strings.Builder
|
||||||
GetLogger() *slog.Logger
|
GetLogger() *slog.Logger
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -34,8 +38,26 @@ type KokoroOrator struct {
|
|||||||
URL string
|
URL string
|
||||||
Format models.AudioFormat
|
Format models.AudioFormat
|
||||||
Stream bool
|
Stream bool
|
||||||
Speed int8
|
Speed float32
|
||||||
Language string
|
Language string
|
||||||
|
Voice string
|
||||||
|
currentStream *beep.Ctrl // Added for playback control
|
||||||
|
textBuffer strings.Builder
|
||||||
|
}
|
||||||
|
|
||||||
|
func stoproutine(orator Orator) {
|
||||||
|
select {
|
||||||
|
case <-TTSDoneChan:
|
||||||
|
orator.GetLogger().Info("orator got done signal")
|
||||||
|
orator.Stop()
|
||||||
|
// close(TTSTextChan)
|
||||||
|
// TTSTextChan = make(chan string, 10000)
|
||||||
|
// drain the channel
|
||||||
|
for len(TTSTextChan) > 0 {
|
||||||
|
<-TTSTextChan
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func readroutine(orator Orator) {
|
func readroutine(orator Orator) {
|
||||||
@@ -70,84 +92,44 @@ func readroutine(orator Orator) {
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// INFO: if there is a lot of text, synthesizing all of it with tts at once will take some time
|
||||||
|
// to avoid this pause, it might be better to keep splitting on sentences
|
||||||
|
// but keeping in mind that the remainder could be omitted by the tokenizer
|
||||||
// Flush remaining text
|
// Flush remaining text
|
||||||
remaining := remainder.String()
|
remaining := remainder.String()
|
||||||
orator.GetLogger().Info("flushing", "rem", remaining)
|
remainder.Reset()
|
||||||
if remaining != "" { // but nothing is here?
|
if remaining != "" {
|
||||||
orator.GetLogger().Info("flushing", "remaining", remaining)
|
// orator.GetLogger().Info("flushing", "remaining", remaining)
|
||||||
if err := orator.Speak(remaining); err != nil {
|
if err := orator.Speak(remaining); err != nil {
|
||||||
orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
|
orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case <-TTSDoneChan:
|
// case <-TTSDoneChan:
|
||||||
// Flush remaining text
|
// orator.GetLogger().Info("orator got done signal")
|
||||||
if remaining := sentenceBuf.String(); remaining != "" {
|
// orator.Stop()
|
||||||
if err := orator.Speak(remaining); err != nil {
|
// // is that the best way to empty the channel?
|
||||||
orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
|
// close(TTSTextChan)
|
||||||
}
|
// TTSTextChan = make(chan string, 10000)
|
||||||
}
|
// return
|
||||||
return
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func InitOrator(log *slog.Logger, URL string) Orator {
|
func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
|
||||||
orator := &KokoroOrator{
|
orator := &KokoroOrator{
|
||||||
logger: log,
|
logger: log,
|
||||||
URL: URL,
|
URL: cfg.TTS_URL,
|
||||||
Format: models.AFMP3,
|
Format: models.AFMP3,
|
||||||
Stream: false,
|
Stream: false,
|
||||||
Speed: 1,
|
Speed: cfg.TTS_SPEED,
|
||||||
Language: "a",
|
Language: "a",
|
||||||
|
Voice: "af_bella(1)+af_sky(1)",
|
||||||
}
|
}
|
||||||
go readroutine(orator)
|
go readroutine(orator)
|
||||||
|
go stoproutine(orator)
|
||||||
return orator
|
return orator
|
||||||
}
|
}
|
||||||
|
|
||||||
// type AudioStream struct {
|
|
||||||
// TextChan chan string // Send text chunks here
|
|
||||||
// DoneChan chan bool // Close when streaming ends
|
|
||||||
// }
|
|
||||||
|
|
||||||
// func RunOrator(orator Orator) *AudioStream {
|
|
||||||
// stream := &AudioStream{
|
|
||||||
// TextChan: make(chan string, 1000),
|
|
||||||
// DoneChan: make(chan bool),
|
|
||||||
// }
|
|
||||||
// go func() {
|
|
||||||
// tokenizer, _ := english.NewSentenceTokenizer(nil)
|
|
||||||
// var sentenceBuf bytes.Buffer
|
|
||||||
// for {
|
|
||||||
// select {
|
|
||||||
// case chunk := <-stream.TextChan:
|
|
||||||
// sentenceBuf.WriteString(chunk)
|
|
||||||
// text := sentenceBuf.String()
|
|
||||||
// sentences := tokenizer.Tokenize(text)
|
|
||||||
// for i, sentence := range sentences {
|
|
||||||
// if i == len(sentences)-1 {
|
|
||||||
// sentenceBuf.Reset()
|
|
||||||
// sentenceBuf.WriteString(sentence.Text)
|
|
||||||
// continue
|
|
||||||
// }
|
|
||||||
// // Send complete sentence to TTS
|
|
||||||
// if err := orator.Speak(sentence.Text); err != nil {
|
|
||||||
// orator.GetLogger().Error("tts failed", "sentence", sentence.Text, "error", err)
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// case <-stream.DoneChan:
|
|
||||||
// // Flush remaining text
|
|
||||||
// if remaining := sentenceBuf.String(); remaining != "" {
|
|
||||||
// if err := orator.Speak(remaining); err != nil {
|
|
||||||
// orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// return
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }()
|
|
||||||
// return stream
|
|
||||||
// }
|
|
||||||
|
|
||||||
func (o *KokoroOrator) GetLogger() *slog.Logger {
|
func (o *KokoroOrator) GetLogger() *slog.Logger {
|
||||||
return o.logger
|
return o.logger
|
||||||
}
|
}
|
||||||
@@ -155,12 +137,12 @@ func (o *KokoroOrator) GetLogger() *slog.Logger {
|
|||||||
func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
|
func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
|
||||||
payload := map[string]interface{}{
|
payload := map[string]interface{}{
|
||||||
"input": text,
|
"input": text,
|
||||||
"voice": "af_bella(1)+af_sky(1)",
|
"voice": o.Voice,
|
||||||
"response_format": "mp3",
|
"response_format": o.Format,
|
||||||
"download_format": "mp3",
|
"download_format": o.Format,
|
||||||
"stream": o.Stream,
|
"stream": o.Stream,
|
||||||
"speed": o.Speed,
|
"speed": o.Speed,
|
||||||
"return_download_link": true,
|
// "return_download_link": true,
|
||||||
"lang_code": o.Language,
|
"lang_code": o.Language,
|
||||||
}
|
}
|
||||||
payloadBytes, err := json.Marshal(payload)
|
payloadBytes, err := json.Marshal(payload)
|
||||||
@@ -185,6 +167,7 @@ func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (o *KokoroOrator) Speak(text string) error {
|
func (o *KokoroOrator) Speak(text string) error {
|
||||||
|
o.logger.Info("fn: Speak is called", "text-len", len(text))
|
||||||
body, err := o.requestSound(text)
|
body, err := o.requestSound(text)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
o.logger.Error("request failed", "error", err)
|
o.logger.Error("request failed", "error", err)
|
||||||
@@ -198,11 +181,33 @@ func (o *KokoroOrator) Speak(text string) error {
|
|||||||
return fmt.Errorf("mp3 decode failed: %w", err)
|
return fmt.Errorf("mp3 decode failed: %w", err)
|
||||||
}
|
}
|
||||||
defer streamer.Close()
|
defer streamer.Close()
|
||||||
speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10))
|
// this spams errors saying the speaker cannot be initialized more than once; but how would we handle many audio records then?
|
||||||
|
if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
|
||||||
|
o.logger.Debug("failed to init speaker", "error", err)
|
||||||
|
}
|
||||||
done := make(chan bool)
|
done := make(chan bool)
|
||||||
speaker.Play(beep.Seq(streamer, beep.Callback(func() {
|
// Create controllable stream and store reference
|
||||||
|
o.currentStream = &beep.Ctrl{Streamer: beep.Seq(streamer, beep.Callback(func() {
|
||||||
close(done)
|
close(done)
|
||||||
})))
|
o.currentStream = nil
|
||||||
<-done
|
})), Paused: false}
|
||||||
|
speaker.Play(o.currentStream)
|
||||||
|
<-done // we hang in this routine;
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: stop works; but new stream does not start afterwards
|
||||||
|
func (o *KokoroOrator) Stop() {
|
||||||
|
// speaker.Clear()
|
||||||
|
o.logger.Info("attempted to stop orator", "orator", o)
|
||||||
|
speaker.Lock()
|
||||||
|
defer speaker.Unlock()
|
||||||
|
if o.currentStream != nil {
|
||||||
|
o.currentStream.Paused = true
|
||||||
|
o.currentStream.Streamer = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *KokoroOrator) GetSBuilder() strings.Builder {
|
||||||
|
return o.textBuffer
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,6 +3,6 @@ package models
|
|||||||
type AudioFormat string
|
type AudioFormat string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
AFOPUS AudioFormat = "opus"
|
AFWav AudioFormat = "wav"
|
||||||
AFMP3 AudioFormat = "mp3"
|
AFMP3 AudioFormat = "mp3"
|
||||||
)
|
)
|
||||||
|
|||||||
9
tui.go
9
tui.go
@@ -1,6 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"elefant/extra"
|
||||||
"elefant/models"
|
"elefant/models"
|
||||||
"elefant/pngmeta"
|
"elefant/pngmeta"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -708,6 +709,14 @@ func init() {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// I need keybind for tts to shut up
|
||||||
|
if event.Key() == tcell.KeyCtrlA {
|
||||||
|
textArea.SetText("pressed ctrl+A", true)
|
||||||
|
if cfg.TTS_ENABLED {
|
||||||
|
// audioStream.TextChan <- chunk
|
||||||
|
extra.TTSDoneChan <- true
|
||||||
|
}
|
||||||
|
}
|
||||||
if event.Key() == tcell.KeyCtrlW {
|
if event.Key() == tcell.KeyCtrlW {
|
||||||
// INFO: continue bot/text message
|
// INFO: continue bot/text message
|
||||||
// without new role
|
// without new role
|
||||||
|
|||||||
Reference in New Issue
Block a user