Enha: clean text before sending to tts

2026-01-10 10:16:10 +03:00
parent 505477b8e3
commit 8474b87c43
2 changed files with 95 additions and 13 deletions
--- a/extra/tts.go
+++ b/extra/tts.go
@@ -13,6 +13,7 @@ import (
 	"log/slog"
 	"net/http"
 	"os"
 	"regexp"
 	"strings"
 	"time"
@@ -31,6 +32,44 @@ var (
 	// endsWithPunctuation = regexp.MustCompile(`[;.!?]$`)
 )
 // cleanText removes markdown and special characters that are not suitable for TTS
 func cleanText(text string) string {
 	// Remove markdown-like characters that might interfere with TTS
 	text = strings.ReplaceAll(text, "*", "") // Bold/italic markers
 	text = strings.ReplaceAll(text, "#", "") // Headers
 	text = strings.ReplaceAll(text, "_", "") // Underline/italic markers
 	text = strings.ReplaceAll(text, "~", "") // Strikethrough markers
 	text = strings.ReplaceAll(text, "`", "") // Code markers
 	text = strings.ReplaceAll(text, "[", "") // Link brackets
 	text = strings.ReplaceAll(text, "]", "") // Link brackets
 	text = strings.ReplaceAll(text, "!", "") // Exclamation marks (if not punctuation)
 	// Remove HTML tags using regex
 	htmlTagRegex := regexp.MustCompile(`<[^>]*>`)
 	text = htmlTagRegex.ReplaceAllString(text, "")
 	// Split text into lines to handle table separators
 	lines := strings.Split(text, "\n")
 	var filteredLines []string
 	for _, line := range lines {
 		// Check if the line looks like a table separator (e.g., |----|, |===|, | - - - |)
 		// A table separator typically contains only |, -, =, and spaces
 		isTableSeparator := regexp.MustCompile(`^\s*\|\s*[-=\s]+\|\s*$`).MatchString(strings.TrimSpace(line))
 		if !isTableSeparator {
 			// If it's not a table separator, remove vertical bars but keep the content
 			processedLine := strings.ReplaceAll(line, "|", "")
 			filteredLines = append(filteredLines, processedLine)
 		}
 		// If it is a table separator, skip it (don't add to filteredLines)
 	}
 	text = strings.Join(filteredLines, "\n")
 	text = strings.TrimSpace(text) // Remove leading/trailing whitespace
 	return text
 }
 type Orator interface {
 	Speak(text string) error
 	Stop()
@@ -97,9 +136,13 @@ func (o *KokoroOrator) readroutine() {
 					}
 					continue // if only one (often incomplete) sentence; wait for next chunk
 				}
-				o.logger.Debug("calling Speak with sentence", "sent", sentence.Text)
+				cleanedText := cleanText(sentence.Text)
-				if err := o.Speak(sentence.Text); err != nil {
+				if cleanedText == "" {
-					o.logger.Error("tts failed", "sentence", sentence.Text, "error", err)
+					continue // Skip empty text after cleaning
 				}
 				o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
 				if err := o.Speak(cleanedText); err != nil {
 					o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
 				}
 			}
 		case <-TTSFlushChan:
@@ -122,6 +165,7 @@ func (o *KokoroOrator) readroutine() {
 			// but keepinig in mind that remainder could be ommited by tokenizer
 			// Flush remaining text
 			remaining := o.textBuffer.String()
 			remaining = cleanText(remaining)
 			o.textBuffer.Reset()
 			if remaining != "" {
 				o.logger.Debug("calling Speak with remainder", "rem", remaining)
@@ -138,14 +182,12 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
 	if provider == "" {
 		provider = "kokoro"
 	}
 	switch strings.ToLower(provider) {
 	case "google", "google-translate", "google_translate":
 		language := cfg.TTS_LANGUAGE
 		if language == "" {
 			language = "en"
 		}
 		speech := &google_translate_tts.Speech{
 			Folder:   os.TempDir() + "/gf-lt-tts", // Temporary directory for caching
 			Language: language,
@@ -153,7 +195,6 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
 			Speed:    cfg.TTS_SPEED,
 			Handler:  &handlers.Beep{},
 		}
 		orator := &GoogleTranslateOrator{
 			logger: log,
 			speech: speech,
@@ -287,9 +328,13 @@ func (o *GoogleTranslateOrator) readroutine() {
 					}
 					continue // if only one (often incomplete) sentence; wait for next chunk
 				}
-				o.logger.Debug("calling Speak with sentence", "sent", sentence.Text)
+				cleanedText := cleanText(sentence.Text)
-				if err := o.Speak(sentence.Text); err != nil {
+				if cleanedText == "" {
-					o.logger.Error("tts failed", "sentence", sentence.Text, "error", err)
+					continue // Skip empty text after cleaning
 				}
 				o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
 				if err := o.Speak(cleanedText); err != nil {
 					o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
 				}
 			}
 		case <-TTSFlushChan:
@@ -307,11 +352,8 @@ func (o *GoogleTranslateOrator) readroutine() {
 					}
 				}
 			}
 			// INFO: if there is a lot of text it will take some time to make with tts at once
 			// to avoid this pause, it might be better to keep splitting on sentences
 			// but keepinig in mind that remainder could be ommited by tokenizer
 			// Flush remaining text
 			remaining := o.textBuffer.String()
 			remaining = cleanText(remaining)
 			o.textBuffer.Reset()
 			if remaining != "" {
 				o.logger.Debug("calling Speak with remainder", "rem", remaining)
--- a/extra/tts_test.go
+++ b/extra/tts_test.go
@@ -0,0 +1,40 @@
 //go:build extra
 // +build extra
 package extra
 import (
 	"testing"
 )
 func TestCleanText(t *testing.T) {
 	tests := []struct {
 		input    string
 		expected string
 	}{
 		{"Hello world", "Hello world"},
 		{"**Bold text**", "Bold text"},
 		{"*Italic text*", "Italic text"},
 		{"# Header", "Header"},
 		{"_Underlined text_", "Underlined text"},
 		{"~Strikethrough text~", "Strikethrough text"},
 		{"`Code text`", "Code text"},
 		{"[Link text](url)", "Link text(url)"},
 		{"Mixed *markdown* and #headers#!", "Mixed markdown and headers"},
 		{"<html>tags</html>", "tags"},
 		{"|---|", ""}, // Table separator
 		{"|====|", ""}, // Table separator with equals
 		{"| - - - |", ""}, // Table separator with spaced dashes
 		{"| cell1 | cell2 |", "cell1  cell2"}, // Table row with content
 		{"  Trailing spaces  ", "Trailing spaces"},
 		{"", ""},
 		{"***", ""},
 	}
 	for _, test := range tests {
 		result := cleanText(test.input)
 		if result != test.expected {
 			t.Errorf("cleanText(%q) = %q; expected %q", test.input, result, test.expected)
 		}
 	}
 }