Enha: binary whisper stt impl

This commit is contained in:
Grail Finder
2025-11-09 12:59:50 +03:00
parent 4a581f6c12
commit 48f32ba36a
3 changed files with 234 additions and 0 deletions

View File

@@ -31,6 +31,7 @@ type StreamCloser interface {
func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
switch cfg.STT_TYPE {
case "WHISPER_BINARY":
return NewWhisperBinary(logger, cfg)
case "WHISPER_SERVER":
return NewWhisperServer(logger, cfg)
}

View File

@@ -1,13 +1,21 @@
package extra
import (
"bytes"
"context"
"errors"
"fmt"
"gf-lt/config"
"log/slog"
"os"
"os/exec"
"sync"
"github.com/gordonklaus/portaudio"
)
type WhisperBinary struct {
whisperPath string
modelPath string
@@ -17,6 +25,7 @@ type WhisperBinary struct {
mu sync.Mutex
running bool
cmd *exec.Cmd
audioBuffer []int16
}
func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
@@ -29,3 +38,221 @@ func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
cancel: cancel,
}
}
// StartRecording opens the default input device (1 channel, 16 kHz, int16
// samples) and starts a background goroutine that accumulates the captured
// samples in w.audioBuffer.
//
// It returns an error if a recording is already in progress, if PortAudio
// cannot be initialized, or if the input stream cannot be opened.
func (w *WhisperBinary) StartRecording() error {
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.running {
		return errors.New("recording is already in progress")
	}
	if err := portaudio.Initialize(); err != nil {
		return fmt.Errorf("portaudio init failed: %w", err)
	}
	in := make([]int16, 1024) // frames filled by each stream.Read call
	stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in)
	if err != nil {
		if paErr := portaudio.Terminate(); paErr != nil {
			return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
		}
		return fmt.Errorf("failed to open microphone: %w", err)
	}
	// StopRecording cancels w.ctx, and a cancelled context never becomes live
	// again. Without a fresh context the capture goroutine of every recording
	// after the first one would exit before reading a single frame.
	if w.cancel != nil {
		w.cancel() // release the previous context's resources
	}
	w.ctx, w.cancel = context.WithCancel(context.Background())
	// Start from an empty buffer so earlier recordings do not leak into this one.
	w.audioBuffer = make([]int16, 0)
	// Placeholder command bound to the recording context; it is never started
	// in this function.
	w.cmd = exec.CommandContext(w.ctx, "sh", "-c", "echo 'dummy command'")
	// Mark the recorder as running BEFORE launching the goroutine: recordAudio
	// exits as soon as it observes running == false.
	w.running = true
	go w.recordAudio(stream, in)
	return nil
}
// recordAudio is the capture loop run on its own goroutine by StartRecording.
// It starts the stream and repeatedly reads one buffer of samples into in,
// appending them to w.audioBuffer, until w.ctx is cancelled, w.running becomes
// false, or a stream operation fails.
//
// On exit it stops and closes the stream and terminates PortAudio. Errors from
// the stream are not reported; the loop simply ends.
func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) {
	// Deferred calls run last-in-first-out: Stop (if started), Close, Terminate.
	defer func() {
		_ = portaudio.Terminate() // ignoring error as we're shutting down
	}()
	defer func() {
		_ = stream.Close() // release the device on every exit path
	}()
	if err := stream.Start(); err != nil {
		return
	}
	defer func() {
		_ = stream.Stop() // ignoring error as we're shutting down
	}()
	for {
		select {
		case <-w.ctx.Done():
			return
		default:
		}
		// running is written under w.mu by Start/StopRecording, so it must be
		// read under the same lock to avoid a data race.
		w.mu.Lock()
		active := w.running
		w.mu.Unlock()
		if !active {
			return
		}
		if err := stream.Read(); err != nil {
			return
		}
		// append copies the samples out of in, so the next Read may safely
		// overwrite it; append also handles a nil destination slice.
		w.mu.Lock()
		w.audioBuffer = append(w.audioBuffer, in...)
		w.mu.Unlock()
	}
}
// StopRecording ends the current recording, writes the captured samples to a
// temporary WAV file, runs the whisper binary on that file and returns the
// binary's standard output.
//
// It returns an error if no recording is in progress, if the audio cannot be
// saved, or if the whisper binary exits unsuccessfully (its stderr is included
// in the error). The temporary file is removed and the audio buffer released
// before returning, whether or not transcription succeeded.
func (w *WhisperBinary) StopRecording() (string, error) {
	w.mu.Lock()
	if !w.running {
		w.mu.Unlock()
		return "", errors.New("not currently recording")
	}
	w.running = false
	w.cancel() // This will stop the recording goroutine
	w.mu.Unlock()
	// Release the captured samples on every return path, not only on success.
	defer func() {
		w.mu.Lock()
		w.audioBuffer = nil
		w.mu.Unlock()
	}()
	// Save the recorded audio to a temporary file
	tempFile, err := w.saveAudioToTempFile()
	if err != nil {
		return "", fmt.Errorf("failed to save audio to temp file: %w", err)
	}
	defer os.Remove(tempFile) // Clean up the temp file
	// w.ctx was cancelled a few lines above. A command bound to an
	// already-cancelled context refuses to start, so the whisper process must
	// not be tied to w.ctx or every transcription fails with "context canceled".
	cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile)
	var outBuf, errBuf bytes.Buffer
	cmd.Stdout = &outBuf
	cmd.Stderr = &errBuf
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("whisper binary failed: %w, stderr: %s", err, errBuf.String())
	}
	return outBuf.String(), nil
}
// saveAudioToTempFile writes the recorded audio to a newly created temporary
// WAV file and returns its path. The caller is responsible for removing the
// file. If the file cannot be written it is removed before the error is
// returned, so failures leave nothing behind.
func (w *WhisperBinary) saveAudioToTempFile() (string, error) {
	tempFile, err := os.CreateTemp("", "recording_*.wav")
	if err != nil {
		return "", fmt.Errorf("failed to create temp file: %w", err)
	}
	name := tempFile.Name()
	// Only the unique name is needed: writeWAVFile reopens the file by path,
	// so close this handle first instead of keeping two descriptors open.
	if err := tempFile.Close(); err != nil {
		_ = os.Remove(name) // best-effort cleanup; the close error is what matters
		return "", fmt.Errorf("failed to close temp file: %w", err)
	}
	if err := w.writeWAVFile(name); err != nil {
		_ = os.Remove(name) // best-effort cleanup; the write error is what matters
		return "", fmt.Errorf("failed to write WAV file: %w", err)
	}
	return name, nil
}
// writeWAVFile writes the recorded samples to filename as a 16 kHz, mono,
// 16-bit PCM WAV file, creating or truncating the file.
//
// It returns an error if there is no recorded audio (in which case the file is
// not touched) or if the file cannot be written.
func (w *WhisperBinary) writeWAVFile(filename string) error {
	// Encode under the lock so the capture goroutine cannot append while the
	// buffer is being read.
	w.mu.Lock()
	sampleCount := len(w.audioBuffer)
	if sampleCount == 0 {
		w.mu.Unlock()
		return errors.New("no audio data to write")
	}
	dataSize := sampleCount * 2 // 2 bytes per int16 sample
	wav := make([]byte, 0, 44+dataSize)
	wav = append(wav, w.createWAVHeader(16000, 1, 16, dataSize)...)
	for _, sample := range w.audioBuffer {
		// Little-endian 16-bit sample.
		wav = append(wav, byte(sample), byte(sample>>8))
	}
	w.mu.Unlock()
	// A single write replaces one syscall per sample, and os.WriteFile also
	// reports errors from closing the file.
	return os.WriteFile(filename, wav, 0o666)
}
// createWAVHeader builds the 44-byte header of a PCM WAV file.
//
// sampleRate is in samples per second, channels is the channel count,
// bitsPerSample is the width of one sample, and dataSize is the length in
// bytes of the audio data that will follow the header. All multi-byte fields
// are written little-endian.
func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte {
	header := make([]byte, 44)
	// put16 and put32 store the low 16 / 32 bits of v at off, little-endian.
	put16 := func(off, v int) {
		header[off] = byte(v)
		header[off+1] = byte(v >> 8)
	}
	put32 := func(off, v int) {
		header[off] = byte(v)
		header[off+1] = byte(v >> 8)
		header[off+2] = byte(v >> 16)
		header[off+3] = byte(v >> 24)
	}

	copy(header[0:4], "RIFF")
	// RIFF chunk size: everything after this field, i.e. the remaining
	// 36 header bytes plus the audio data.
	put32(4, 36+dataSize)
	copy(header[8:12], "WAVE")
	copy(header[12:16], "fmt ")
	put32(16, 16)                                  // fmt chunk size (16 for PCM)
	put16(20, 1)                                   // audio format (1 = PCM)
	put16(22, channels)                            // number of channels
	put32(24, sampleRate)                          // sample rate
	put32(28, sampleRate*channels*bitsPerSample/8) // byte rate
	put16(32, channels*bitsPerSample/8)            // block align
	put16(34, bitsPerSample)                       // bits per sample
	copy(header[36:40], "data")
	put32(40, dataSize) // size of the audio data that follows
	return header
}
// IsRecording reports whether a recording is currently in progress. The flag
// is read while holding w.mu.
func (w *WhisperBinary) IsRecording() bool {
	w.mu.Lock()
	active := w.running
	w.mu.Unlock()
	return active
}