Enha: binary whisper stt impl
This commit is contained in:
@@ -1,13 +1,21 @@
|
||||
package extra
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"gf-lt/config"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sync"
|
||||
|
||||
"github.com/gordonklaus/portaudio"
|
||||
)
|
||||
|
||||
|
||||
|
||||
type WhisperBinary struct {
|
||||
whisperPath string
|
||||
modelPath string
|
||||
@@ -17,6 +25,7 @@ type WhisperBinary struct {
|
||||
mu sync.Mutex
|
||||
running bool
|
||||
cmd *exec.Cmd
|
||||
audioBuffer []int16
|
||||
}
|
||||
|
||||
func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
||||
@@ -29,3 +38,221 @@ func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
||||
cancel: cancel,
|
||||
}
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) StartRecording() error {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
if w.running {
|
||||
return errors.New("recording is already in progress")
|
||||
}
|
||||
|
||||
if err := portaudio.Initialize(); err != nil {
|
||||
return fmt.Errorf("portaudio init failed: %w", err)
|
||||
}
|
||||
|
||||
// Initialize audio buffer
|
||||
w.audioBuffer = make([]int16, 0)
|
||||
|
||||
in := make([]int16, 1024) // buffer size
|
||||
stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in)
|
||||
if err != nil {
|
||||
if paErr := portaudio.Terminate(); paErr != nil {
|
||||
return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
|
||||
}
|
||||
return fmt.Errorf("failed to open microphone: %w", err)
|
||||
}
|
||||
|
||||
// Create a dummy command just for context management
|
||||
w.cmd = exec.CommandContext(w.ctx, "sh", "-c", "echo 'dummy command'")
|
||||
|
||||
go w.recordAudio(stream, in)
|
||||
w.running = true
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) {
|
||||
defer func() {
|
||||
_ = portaudio.Terminate() // ignoring error as we're shutting down
|
||||
}()
|
||||
|
||||
if err := stream.Start(); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-w.ctx.Done():
|
||||
return
|
||||
default:
|
||||
if !w.running {
|
||||
return
|
||||
}
|
||||
if err := stream.Read(); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Append samples to buffer
|
||||
w.mu.Lock()
|
||||
if w.audioBuffer == nil {
|
||||
w.audioBuffer = make([]int16, 0)
|
||||
}
|
||||
// Make a copy of the input buffer to avoid overwriting
|
||||
tempBuffer := make([]int16, len(in))
|
||||
copy(tempBuffer, in)
|
||||
w.audioBuffer = append(w.audioBuffer, tempBuffer...)
|
||||
w.mu.Unlock()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) StopRecording() (string, error) {
|
||||
w.mu.Lock()
|
||||
if !w.running {
|
||||
w.mu.Unlock()
|
||||
return "", errors.New("not currently recording")
|
||||
}
|
||||
|
||||
w.running = false
|
||||
w.cancel() // This will stop the recording goroutine
|
||||
w.mu.Unlock()
|
||||
|
||||
// Save the recorded audio to a temporary file
|
||||
tempFile, err := w.saveAudioToTempFile()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to save audio to temp file: %w", err)
|
||||
}
|
||||
defer os.Remove(tempFile) // Clean up the temp file
|
||||
|
||||
// Run the whisper binary
|
||||
cmd := exec.CommandContext(w.ctx, w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile)
|
||||
|
||||
var outBuf, errBuf bytes.Buffer
|
||||
cmd.Stdout = &outBuf
|
||||
cmd.Stderr = &errBuf
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
return "", fmt.Errorf("whisper binary failed: %w, stderr: %s", err, errBuf.String())
|
||||
}
|
||||
|
||||
result := outBuf.String()
|
||||
|
||||
// Clean up audio buffer
|
||||
w.mu.Lock()
|
||||
w.audioBuffer = nil
|
||||
w.mu.Unlock()
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// saveAudioToTempFile saves the recorded audio data to a temporary WAV file
|
||||
func (w *WhisperBinary) saveAudioToTempFile() (string, error) {
|
||||
// Create temporary WAV file
|
||||
tempFile, err := os.CreateTemp("", "recording_*.wav")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to create temp file: %w", err)
|
||||
}
|
||||
defer tempFile.Close()
|
||||
|
||||
// Write WAV header and data
|
||||
err = w.writeWAVFile(tempFile.Name())
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to write WAV file: %w", err)
|
||||
}
|
||||
|
||||
return tempFile.Name(), nil
|
||||
}
|
||||
|
||||
// writeWAVFile creates a WAV file from the recorded audio data
|
||||
func (w *WhisperBinary) writeWAVFile(filename string) error {
|
||||
// Open file for writing
|
||||
file, err := os.Create(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
w.mu.Lock()
|
||||
audioData := make([]int16, len(w.audioBuffer))
|
||||
copy(audioData, w.audioBuffer)
|
||||
w.mu.Unlock()
|
||||
|
||||
if len(audioData) == 0 {
|
||||
return errors.New("no audio data to write")
|
||||
}
|
||||
|
||||
// Calculate data size (number of samples * size of int16)
|
||||
dataSize := len(audioData) * 2 // 2 bytes per int16 sample
|
||||
|
||||
// Write WAV header with the correct data size
|
||||
header := w.createWAVHeader(16000, 1, 16, dataSize)
|
||||
_, err = file.Write(header)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Write audio data
|
||||
for _, sample := range audioData {
|
||||
// Write little-endian 16-bit sample
|
||||
_, err := file.Write([]byte{byte(sample), byte(sample >> 8)})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// createWAVHeader creates a WAV file header
|
||||
func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte {
|
||||
header := make([]byte, 44)
|
||||
copy(header[0:4], "RIFF")
|
||||
// Total file size will be updated later
|
||||
copy(header[8:12], "WAVE")
|
||||
copy(header[12:16], "fmt ")
|
||||
// fmt chunk size (16 for PCM)
|
||||
header[16] = 16
|
||||
header[17] = 0
|
||||
header[18] = 0
|
||||
header[19] = 0
|
||||
// Audio format (1 = PCM)
|
||||
header[20] = 1
|
||||
header[21] = 0
|
||||
// Number of channels
|
||||
header[22] = byte(channels)
|
||||
header[23] = 0
|
||||
// Sample rate
|
||||
header[24] = byte(sampleRate)
|
||||
header[25] = byte(sampleRate >> 8)
|
||||
header[26] = byte(sampleRate >> 16)
|
||||
header[27] = byte(sampleRate >> 24)
|
||||
// Byte rate
|
||||
byteRate := sampleRate * channels * bitsPerSample / 8
|
||||
header[28] = byte(byteRate)
|
||||
header[29] = byte(byteRate >> 8)
|
||||
header[30] = byte(byteRate >> 16)
|
||||
header[31] = byte(byteRate >> 24)
|
||||
// Block align
|
||||
blockAlign := channels * bitsPerSample / 8
|
||||
header[32] = byte(blockAlign)
|
||||
header[33] = 0
|
||||
// Bits per sample
|
||||
header[34] = byte(bitsPerSample)
|
||||
header[35] = 0
|
||||
// "data" subchunk
|
||||
copy(header[36:40], "data")
|
||||
// Data size
|
||||
header[40] = byte(dataSize)
|
||||
header[41] = byte(dataSize >> 8)
|
||||
header[42] = byte(dataSize >> 16)
|
||||
header[43] = byte(dataSize >> 24)
|
||||
|
||||
return header
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) IsRecording() bool {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.running
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user