diff --git a/extra/kokoro_onnx.go b/extra/kokoro_onnx.go index 442c511..fe5abfb 100644 --- a/extra/kokoro_onnx.go +++ b/extra/kokoro_onnx.go @@ -50,9 +50,7 @@ var kokoroPhonemeMap = map[string]int{ } func (o *KokoroONNXOrator) ensureInitialized(modelPath string) error { - o.logger.Debug("ensureInitialized called", "modelPath", modelPath) if o.modelLoaded { - o.logger.Debug("model already loaded") return nil } o.mu.Lock() @@ -143,7 +141,6 @@ func (o *KokoroONNXOrator) ensureInitialized(modelPath string) error { } func (o *KokoroONNXOrator) textToPhonemes(text string) (string, error) { - o.logger.Debug("converting text to phonemes", "text", text) cmd := exec.Command(o.espeakCmd, "-x", "-q", text) output, err := cmd.Output() if err != nil { @@ -152,18 +149,14 @@ func (o *KokoroONNXOrator) textToPhonemes(text string) (string, error) { } phonemeStr := strings.TrimSpace(string(output)) - o.logger.Debug("phonemes generated", "phonemes", phonemeStr) return phonemeStr, nil } func (o *KokoroONNXOrator) phonemesToTokens(phonemeStr string) ([]int, error) { - o.logger.Debug("converting phonemes to tokens", "phonemes", phonemeStr) - if phonemeStr == "" { o.logger.Error("empty phoneme string") return nil, fmt.Errorf("empty phoneme string") } - // Iterate over each character in the phoneme string tokens := make([]int, 0) for _, ch := range phonemeStr { @@ -172,18 +165,14 @@ func (o *KokoroONNXOrator) phonemesToTokens(phonemeStr string) ([]int, error) { tokens = append(tokens, tokenID) } } - if len(tokens) == 0 { o.logger.Error("no phonemes mapped to tokens", "phonemeStr", phonemeStr) return nil, fmt.Errorf("no valid phonemes mapped to tokens") } - o.logger.Debug("tokens generated", "count", len(tokens), "tokens", tokens) return tokens, nil } func (o *KokoroONNXOrator) generateAudio(text string) ([]float32, error) { - - o.logger.Debug("generateAudio called", "text", text, "speed", o.speed) if err := o.ensureInitialized(o.modelPath); err != nil { o.logger.Error("ensureInitialized failed", "error", err) return nil, err @@ -203,7 +192,6 @@ func (o *KokoroONNXOrator) generateAudio(text string) ([]float32, error) { } tokens = append([]int{0}, tokens...) tokens = append(tokens, 0) - o.logger.Debug("tokens prepared", "count", len(tokens)) inputIDs := make([]int64, len(tokens)) for i, t := range tokens { inputIDs[i] = int64(t) @@ -217,7 +205,6 @@ func (o *KokoroONNXOrator) generateAudio(text string) ([]float32, error) { return nil, fmt.Errorf("failed to create input tensor: %w", err) } defer func() { _ = inputTensor.Destroy() }() - o.logger.Debug("input tensor created", "shape", fmt.Sprintf("[1,%d]", len(inputIDs))) styleTensor, err := onnxruntime_go.NewTensor[float32]( onnxruntime_go.NewShape(1, 256), o.styleVector, @@ -236,7 +223,6 @@ func (o *KokoroONNXOrator) generateAudio(text string) ([]float32, error) { return nil, fmt.Errorf("failed to create speed tensor: %w", err) } defer func() { _ = speedTensor.Destroy() }() - o.logger.Debug("speed tensor created", "speed", o.speed) outputTensor, err := onnxruntime_go.NewEmptyTensor[float32]( onnxruntime_go.NewShape(1, 512), ) @@ -245,8 +231,6 @@ func (o *KokoroONNXOrator) generateAudio(text string) ([]float32, error) { return nil, fmt.Errorf("failed to create output tensor: %w", err) } defer func() { _ = outputTensor.Destroy() }() - o.logger.Debug("output tensor created", "shape", "[1,512]") - o.logger.Info("running ONNX inference", "input_len", len(inputIDs)) err = o.session.Run( []onnxruntime_go.Value{inputTensor, styleTensor, speedTensor}, []onnxruntime_go.Value{outputTensor}, @@ -255,26 +239,22 @@ func (o *KokoroONNXOrator) generateAudio(text string) ([]float32, error) { o.logger.Error("ONNX inference failed", "error", err) return nil, fmt.Errorf("ONNX inference failed: %w", err) } - o.logger.Debug("ONNX inference completed") audioData := outputTensor.GetData() if len(audioData) == 0 { o.logger.Error("empty audio output from ONNX") return nil, fmt.Errorf("empty audio output") } - o.logger.Debug("audio generated", "samples", len(audioData)) audio := make([]float32, len(audioData)) copy(audio, audioData) return audio, nil } func (o *KokoroONNXOrator) Speak(text string) error { - o.logger.Debug("KokoroONNX Speak called", "text_len", len(text)) audio, err := o.generateAudio(text) if err != nil { o.logger.Error("audio generation failed", "error", err) return fmt.Errorf("audio generation failed: %w", err) } - o.logger.Debug("audio ready for playback", "samples", len(audio)) // Create streamer for encoding encodeStreamer := beep.StreamerFunc(func(samples [][2]float64) (n int, ok bool) { for i := range samples { @@ -296,14 +276,12 @@ func (o *KokoroONNXOrator) Speak(text string) error { o.logger.Error("wav encoding failed", "error", err) return fmt.Errorf("wav encoding failed: %w", err) } - o.logger.Debug("wav encoded", "size", buf.Len()) decodedStreamer, format, err := wav.Decode(bytes.NewReader(buf.Bytes())) if err != nil { o.logger.Error("wav decode failed", "error", err) return fmt.Errorf("wav decode failed: %w", err) } defer decodedStreamer.Close() - o.logger.Debug("wav decoded", "format", format) if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil { o.logger.Error("speaker init failed", "error", err) return fmt.Errorf("speaker init failed: %w", err) @@ -313,7 +291,6 @@ func (o *KokoroONNXOrator) Speak(text string) error { o.mu.Lock() o.currentDone = done o.currentStream = &beep.Ctrl{Streamer: beep.Seq(decodedStreamer, beep.Callback(func() { - o.logger.Debug("playback finished") o.mu.Lock() close(done) o.currentStream = nil @@ -323,12 +300,10 @@ func (o *KokoroONNXOrator) Speak(text string) error { o.mu.Unlock() speaker.Play(o.currentStream) <-done - o.logger.Debug("Speak completed") return nil } func (o *KokoroONNXOrator) Stop() { - o.logger.Debug("stopping KokoroONNX orator") speaker.Lock() defer speaker.Unlock() o.mu.Lock() @@ -343,10 +318,8 @@ func (o *KokoroONNXOrator) GetLogger() *slog.Logger { } func (o *KokoroONNXOrator) stoproutine() { - o.logger.Debug("KokoroONNX stoproutine started") for { <-TTSDoneChan - o.logger.Debug("KokoroONNX got done signal") o.Stop() for len(TTSTextChan) > 0 { <-TTSTextChan @@ -361,17 +334,14 @@ func (o *KokoroONNXOrator) stoproutine() { } o.interrupt = true o.mu.Unlock() - o.logger.Debug("KokoroONNX stoproutine finished") } } func (o *KokoroONNXOrator) readroutine() { - o.logger.Debug("KokoroONNX readroutine started") tokenizer, _ := english.NewSentenceTokenizer(nil) for { select { case chunk := <-TTSTextChan: - o.logger.Debug("KokoroONNX received chunk", "chunk_len", len(chunk)) o.mu.Lock() o.interrupt = false _, err := o.textBuffer.WriteString(chunk) @@ -382,9 +352,7 @@ func (o *KokoroONNXOrator) readroutine() { } text := o.textBuffer.String() sentences := tokenizer.Tokenize(text) - o.logger.Debug("KokoroONNX tokenized", "total_sentences", len(sentences), "buffer", text) if len(sentences) <= 1 { - o.logger.Debug("KokoroONNX not enough sentences, waiting") o.mu.Unlock() continue } @@ -392,14 +360,12 @@ func (o *KokoroONNXOrator) readroutine() { remaining := sentences[len(sentences)-1].Text o.textBuffer.Reset() o.textBuffer.WriteString(remaining) - o.logger.Debug("KokoroONNX processing sentences", "count", len(completeSentences)) o.mu.Unlock() for _, sentence := range completeSentences { o.mu.Lock() interrupted := o.interrupt o.mu.Unlock() if interrupted { - o.logger.Debug("KokoroONNX interrupted, exiting") return } cleanedText := models.CleanText(sentence.Text) @@ -412,7 +378,6 @@ func (o *KokoroONNXOrator) readroutine() { } } case <-TTSFlushChan: - o.logger.Debug("KokoroONNX flush signal") if len(TTSTextChan) > 0 { for chunk := range TTSTextChan { o.mu.Lock()