Feat: google-translate-tts support
This commit is contained in:
@@ -29,6 +29,8 @@ RAGDir = "ragimport"
|
|||||||
TTS_ENABLED = false
|
TTS_ENABLED = false
|
||||||
TTS_URL = "http://localhost:8880/v1/audio/speech"
|
TTS_URL = "http://localhost:8880/v1/audio/speech"
|
||||||
TTS_SPEED = 1.2
|
TTS_SPEED = 1.2
|
||||||
|
TTS_PROVIDER = "kokoro"
|
||||||
|
TTS_LANGUAGE = "en"
|
||||||
# extra stt
|
# extra stt
|
||||||
STT_ENABLED = false
|
STT_ENABLED = false
|
||||||
STT_TYPE = "WHISPER_SERVER" # WHISPER_SERVER or WHISPER_BINARY
|
STT_TYPE = "WHISPER_SERVER" # WHISPER_SERVER or WHISPER_BINARY
|
||||||
|
|||||||
@@ -48,9 +48,11 @@ type Config struct {
|
|||||||
OpenRouterToken string `toml:"OpenRouterToken"`
|
OpenRouterToken string `toml:"OpenRouterToken"`
|
||||||
OpenRouterModel string `toml:"OpenRouterModel"`
|
OpenRouterModel string `toml:"OpenRouterModel"`
|
||||||
// TTS
|
// TTS
|
||||||
TTS_URL string `toml:"TTS_URL"`
|
TTS_URL string `toml:"TTS_URL"`
|
||||||
TTS_ENABLED bool `toml:"TTS_ENABLED"`
|
TTS_ENABLED bool `toml:"TTS_ENABLED"`
|
||||||
TTS_SPEED float32 `toml:"TTS_SPEED"`
|
TTS_SPEED float32 `toml:"TTS_SPEED"`
|
||||||
|
TTS_PROVIDER string `toml:"TTS_PROVIDER"`
|
||||||
|
TTS_LANGUAGE string `toml:"TTS_LANGUAGE"`
|
||||||
// STT
|
// STT
|
||||||
STT_TYPE string `toml:"STT_TYPE"` // WHISPER_SERVER, WHISPER_BINARY
|
STT_TYPE string `toml:"STT_TYPE"` // WHISPER_SERVER, WHISPER_BINARY
|
||||||
STT_URL string `toml:"STT_URL"`
|
STT_URL string `toml:"STT_URL"`
|
||||||
|
|||||||
@@ -96,11 +96,21 @@ This document explains how to set up and configure the application using the `co
|
|||||||
- Enable or disable text-to-speech functionality.
|
- Enable or disable text-to-speech functionality.
|
||||||
|
|
||||||
#### TTS_URL (`"http://localhost:8880/v1/audio/speech"`)
|
#### TTS_URL (`"http://localhost:8880/v1/audio/speech"`)
|
||||||
- The endpoint for TTS API.
|
- The endpoint for the TTS API (used with the `kokoro` provider).
|
||||||
|
|
||||||
#### TTS_SPEED (`1.2`)
|
#### TTS_SPEED (`1.2`)
|
||||||
- Playback speed for speech output (1.0 is normal speed).
|
- Playback speed for speech output (1.0 is normal speed).
|
||||||
|
|
||||||
|
#### TTS_PROVIDER (`"kokoro"`)
|
||||||
|
- TTS provider to use. Options: `"kokoro"` or `"google"`.
|
||||||
|
- `"kokoro"`: Uses Kokoro FastAPI TTS server (requires TTS_URL to be set). Provides high-quality voice synthesis but requires a running Kokoro server.
|
||||||
|
- `"google"`: Uses Google Translate TTS with gopxl/beep for local playback. Requires an internet connection to reach Google's public TTS API; the returned audio is played locally via gopxl/beep. Supports multiple languages via the TTS_LANGUAGE setting.
|
||||||
|
|
||||||
|
#### TTS_LANGUAGE (`"en"`)
|
||||||
|
- Language code for TTS (used with `google` provider).
|
||||||
|
- Examples: `"en"` (English), `"es"` (Spanish), `"fr"` (French)
|
||||||
|
- See Google Translate TTS documentation for supported languages.
|
||||||
|
|
||||||
### Speech-to-Text (STT) Settings
|
### Speech-to-Text (STT) Settings
|
||||||
|
|
||||||
#### STT_ENABLED (`false`)
|
#### STT_ENABLED (`false`)
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
after [installing](https://github.com/GrailFinder/gf-lt/tree/master?tab=readme-ov-file#how-to-install)
|
### RP case example
|
||||||
[set up your config](config.md)
|
|
||||||
|
check the [installation instructions](https://github.com/GrailFinder/gf-lt/tree/master?tab=readme-ov-file#how-to-install) and
|
||||||
|
[setting up your config](config.md)
|
||||||
|
|
||||||
To roleplay, we would need to create a character card or get one from the web.
|
To roleplay, we would need to create a character card or get one from the web.
|
||||||
For this tutorial, we are going to use the default character Seraphina from [SillyTavern (ST)](https://github.com/SillyTavern/SillyTavern/blob/release/default/content/default_Seraphina.png).
|
For this tutorial, we are going to use the default character Seraphina from [SillyTavern (ST)](https://github.com/SillyTavern/SillyTavern/blob/release/default/content/default_Seraphina.png).
|
||||||
|
|||||||
191
extra/tts.go
191
extra/tts.go
@@ -12,9 +12,12 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
google_translate_tts "github.com/GrailFinder/google-translate-tts"
|
||||||
|
"github.com/GrailFinder/google-translate-tts/handlers"
|
||||||
"github.com/gopxl/beep/v2"
|
"github.com/gopxl/beep/v2"
|
||||||
"github.com/gopxl/beep/v2/mp3"
|
"github.com/gopxl/beep/v2/mp3"
|
||||||
"github.com/gopxl/beep/v2/speaker"
|
"github.com/gopxl/beep/v2/speaker"
|
||||||
@@ -49,6 +52,14 @@ type KokoroOrator struct {
|
|||||||
// textBuffer bytes.Buffer
|
// textBuffer bytes.Buffer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Google Translate TTS implementation
|
||||||
|
type GoogleTranslateOrator struct {
|
||||||
|
logger *slog.Logger
|
||||||
|
speech *google_translate_tts.Speech
|
||||||
|
currentStream *beep.Ctrl
|
||||||
|
textBuffer strings.Builder
|
||||||
|
}
|
||||||
|
|
||||||
func (o *KokoroOrator) stoproutine() {
|
func (o *KokoroOrator) stoproutine() {
|
||||||
<-TTSDoneChan
|
<-TTSDoneChan
|
||||||
o.logger.Debug("orator got done signal")
|
o.logger.Debug("orator got done signal")
|
||||||
@@ -123,18 +134,47 @@ func (o *KokoroOrator) readroutine() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
|
func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
|
||||||
orator := &KokoroOrator{
|
provider := cfg.TTS_PROVIDER
|
||||||
logger: log,
|
if provider == "" {
|
||||||
URL: cfg.TTS_URL,
|
provider = "kokoro"
|
||||||
Format: models.AFMP3,
|
}
|
||||||
Stream: false,
|
|
||||||
Speed: cfg.TTS_SPEED,
|
switch strings.ToLower(provider) {
|
||||||
Language: "a",
|
case "google", "google-translate", "google_translate":
|
||||||
Voice: "af_bella(1)+af_sky(1)",
|
language := cfg.TTS_LANGUAGE
|
||||||
|
if language == "" {
|
||||||
|
language = "en"
|
||||||
|
}
|
||||||
|
|
||||||
|
speech := &google_translate_tts.Speech{
|
||||||
|
Folder: os.TempDir() + "/gf-lt-tts", // Temporary directory for caching
|
||||||
|
Language: language,
|
||||||
|
Proxy: "", // Proxy not supported
|
||||||
|
Speed: cfg.TTS_SPEED,
|
||||||
|
Handler: &handlers.Beep{},
|
||||||
|
}
|
||||||
|
|
||||||
|
orator := &GoogleTranslateOrator{
|
||||||
|
logger: log,
|
||||||
|
speech: speech,
|
||||||
|
}
|
||||||
|
go orator.readroutine()
|
||||||
|
go orator.stoproutine()
|
||||||
|
return orator
|
||||||
|
default: // kokoro
|
||||||
|
orator := &KokoroOrator{
|
||||||
|
logger: log,
|
||||||
|
URL: cfg.TTS_URL,
|
||||||
|
Format: models.AFMP3,
|
||||||
|
Stream: false,
|
||||||
|
Speed: cfg.TTS_SPEED,
|
||||||
|
Language: "a",
|
||||||
|
Voice: "af_bella(1)+af_sky(1)",
|
||||||
|
}
|
||||||
|
go orator.readroutine()
|
||||||
|
go orator.stoproutine()
|
||||||
|
return orator
|
||||||
}
|
}
|
||||||
go orator.readroutine()
|
|
||||||
go orator.stoproutine()
|
|
||||||
return orator
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *KokoroOrator) GetLogger() *slog.Logger {
|
func (o *KokoroOrator) GetLogger() *slog.Logger {
|
||||||
@@ -213,3 +253,132 @@ func (o *KokoroOrator) Stop() {
|
|||||||
o.currentStream.Streamer = nil
|
o.currentStream.Streamer = nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (o *GoogleTranslateOrator) stoproutine() {
|
||||||
|
<-TTSDoneChan
|
||||||
|
o.logger.Debug("orator got done signal")
|
||||||
|
o.Stop()
|
||||||
|
// drain the channel
|
||||||
|
for len(TTSTextChan) > 0 {
|
||||||
|
<-TTSTextChan
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *GoogleTranslateOrator) readroutine() {
|
||||||
|
tokenizer, _ := english.NewSentenceTokenizer(nil)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case chunk := <-TTSTextChan:
|
||||||
|
_, err := o.textBuffer.WriteString(chunk)
|
||||||
|
if err != nil {
|
||||||
|
o.logger.Warn("failed to write to stringbuilder", "error", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
text := o.textBuffer.String()
|
||||||
|
sentences := tokenizer.Tokenize(text)
|
||||||
|
o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
|
||||||
|
for i, sentence := range sentences {
|
||||||
|
if i == len(sentences)-1 { // last sentence
|
||||||
|
o.textBuffer.Reset()
|
||||||
|
_, err := o.textBuffer.WriteString(sentence.Text)
|
||||||
|
if err != nil {
|
||||||
|
o.logger.Warn("failed to write to stringbuilder", "error", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
continue // if only one (often incomplete) sentence; wait for next chunk
|
||||||
|
}
|
||||||
|
o.logger.Debug("calling Speak with sentence", "sent", sentence.Text)
|
||||||
|
if err := o.Speak(sentence.Text); err != nil {
|
||||||
|
o.logger.Error("tts failed", "sentence", sentence.Text, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case <-TTSFlushChan:
|
||||||
|
o.logger.Debug("got flushchan signal start")
|
||||||
|
// lln is done get the whole message out
|
||||||
|
if len(TTSTextChan) > 0 { // otherwise might get stuck
|
||||||
|
for chunk := range TTSTextChan {
|
||||||
|
_, err := o.textBuffer.WriteString(chunk)
|
||||||
|
if err != nil {
|
||||||
|
o.logger.Warn("failed to write to stringbuilder", "error", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(TTSTextChan) == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// INFO: if there is a lot of text it will take some time to make with tts at once
|
||||||
|
// to avoid this pause, it might be better to keep splitting on sentences
|
||||||
|
// but keepinig in mind that remainder could be ommited by tokenizer
|
||||||
|
// Flush remaining text
|
||||||
|
remaining := o.textBuffer.String()
|
||||||
|
o.textBuffer.Reset()
|
||||||
|
if remaining != "" {
|
||||||
|
o.logger.Debug("calling Speak with remainder", "rem", remaining)
|
||||||
|
if err := o.Speak(remaining); err != nil {
|
||||||
|
o.logger.Error("tts failed", "sentence", remaining, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *GoogleTranslateOrator) GetLogger() *slog.Logger {
|
||||||
|
return o.logger
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *GoogleTranslateOrator) Speak(text string) error {
|
||||||
|
o.logger.Debug("fn: Speak is called", "text-len", len(text))
|
||||||
|
|
||||||
|
// Generate MP3 data using google-translate-tts
|
||||||
|
reader, err := o.speech.GenerateSpeech(text)
|
||||||
|
if err != nil {
|
||||||
|
o.logger.Error("generate speech failed", "error", err)
|
||||||
|
return fmt.Errorf("generate speech failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the mp3 audio from reader (wrap with NopCloser for io.ReadCloser)
|
||||||
|
streamer, format, err := mp3.Decode(io.NopCloser(reader))
|
||||||
|
if err != nil {
|
||||||
|
o.logger.Error("mp3 decode failed", "error", err)
|
||||||
|
return fmt.Errorf("mp3 decode failed: %w", err)
|
||||||
|
}
|
||||||
|
defer streamer.Close()
|
||||||
|
|
||||||
|
playbackStreamer := beep.Streamer(streamer)
|
||||||
|
speed := o.speech.Speed
|
||||||
|
if speed <= 0 {
|
||||||
|
speed = 1.0
|
||||||
|
}
|
||||||
|
if speed != 1.0 {
|
||||||
|
playbackStreamer = beep.ResampleRatio(3, float64(speed), streamer)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize speaker with the format's sample rate
|
||||||
|
if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
|
||||||
|
o.logger.Debug("failed to init speaker", "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
done := make(chan bool)
|
||||||
|
// Create controllable stream and store reference
|
||||||
|
o.currentStream = &beep.Ctrl{Streamer: beep.Seq(playbackStreamer, beep.Callback(func() {
|
||||||
|
close(done)
|
||||||
|
o.currentStream = nil
|
||||||
|
})), Paused: false}
|
||||||
|
speaker.Play(o.currentStream)
|
||||||
|
<-done // wait for playback to complete
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *GoogleTranslateOrator) Stop() {
|
||||||
|
o.logger.Debug("attempted to stop google translate orator")
|
||||||
|
speaker.Lock()
|
||||||
|
defer speaker.Unlock()
|
||||||
|
if o.currentStream != nil {
|
||||||
|
o.currentStream.Streamer = nil
|
||||||
|
}
|
||||||
|
// Also stop the speech handler if possible
|
||||||
|
if o.speech != nil {
|
||||||
|
_ = o.speech.Stop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
4
go.mod
4
go.mod
@@ -4,6 +4,7 @@ go 1.25.1
|
|||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/BurntSushi/toml v1.5.0
|
github.com/BurntSushi/toml v1.5.0
|
||||||
|
github.com/GrailFinder/google-translate-tts v0.0.0-00010101000000-000000000000
|
||||||
github.com/GrailFinder/searchagent v0.2.0
|
github.com/GrailFinder/searchagent v0.2.0
|
||||||
github.com/gdamore/tcell/v2 v2.13.2
|
github.com/gdamore/tcell/v2 v2.13.2
|
||||||
github.com/glebarez/go-sqlite v1.22.0
|
github.com/glebarez/go-sqlite v1.22.0
|
||||||
@@ -23,6 +24,7 @@ require (
|
|||||||
github.com/gdamore/encoding v1.0.1 // indirect
|
github.com/gdamore/encoding v1.0.1 // indirect
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/hajimehoshi/go-mp3 v0.3.4 // indirect
|
github.com/hajimehoshi/go-mp3 v0.3.4 // indirect
|
||||||
|
github.com/hajimehoshi/oto/v2 v2.3.1 // indirect
|
||||||
github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
|
github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
@@ -39,3 +41,5 @@ require (
|
|||||||
modernc.org/memory v1.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
modernc.org/sqlite v1.40.1 // indirect
|
modernc.org/sqlite v1.40.1 // indirect
|
||||||
)
|
)
|
||||||
|
|
||||||
|
replace github.com/GrailFinder/google-translate-tts => /home/grail/projects/plays/goplays/google-translate-tts
|
||||||
|
|||||||
1
go.sum
1
go.sum
@@ -35,6 +35,7 @@ github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b h1:WEuQWBxel
|
|||||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco=
|
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco=
|
||||||
github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
|
github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
|
||||||
github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
|
github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
|
||||||
|
github.com/hajimehoshi/oto/v2 v2.3.1 h1:qrLKpNus2UfD674oxckKjNJmesp9hMh7u7QCrStB3Rc=
|
||||||
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
|
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
|
||||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
|
|||||||
Reference in New Issue
Block a user