Enha: binary whisper stt impl
This commit is contained in:
@@ -20,8 +20,14 @@ TTS_URL = "http://localhost:8880/v1/audio/speech"
|
|||||||
TTS_SPEED = 1.0
|
TTS_SPEED = 1.0
|
||||||
# extra stt
|
# extra stt
|
||||||
STT_ENABLED = false
|
STT_ENABLED = false
|
||||||
|
STT_TYPE = "WHISPER_SERVER" # WHISPER_SERVER or WHISPER_BINARY
|
||||||
STT_URL = "http://localhost:8081/inference"
|
STT_URL = "http://localhost:8081/inference"
|
||||||
|
WhisperBinaryPath = "./whisper-cli" # Path to whisper binary (for WHISPER_BINARY mode)
|
||||||
|
WhisperModelPath = "./ggml-model.bin" # Path to whisper model file (for WHISPER_BINARY mode)
|
||||||
|
STT_LANG = "en" # Language for speech recognition (for WHISPER_BINARY mode)
|
||||||
|
STT_SR = 16000 # Sample rate for audio recording
|
||||||
DBPATH = "gflt.db"
|
DBPATH = "gflt.db"
|
||||||
|
#
|
||||||
FetchModelNameAPI = "http://localhost:8080/v1/models"
|
FetchModelNameAPI = "http://localhost:8080/v1/models"
|
||||||
# external search tool
|
# external search tool
|
||||||
SearchAPI = "" # url to call the tool by
|
SearchAPI = "" # url to call the tool by
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ type StreamCloser interface {
|
|||||||
func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
|
func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
|
||||||
switch cfg.STT_TYPE {
|
switch cfg.STT_TYPE {
|
||||||
case "WHISPER_BINARY":
|
case "WHISPER_BINARY":
|
||||||
|
return NewWhisperBinary(logger, cfg)
|
||||||
case "WHISPER_SERVER":
|
case "WHISPER_SERVER":
|
||||||
return NewWhisperServer(logger, cfg)
|
return NewWhisperServer(logger, cfg)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,21 @@
|
|||||||
package extra
|
package extra
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
"gf-lt/config"
|
"gf-lt/config"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
"github.com/gordonklaus/portaudio"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
type WhisperBinary struct {
|
type WhisperBinary struct {
|
||||||
whisperPath string
|
whisperPath string
|
||||||
modelPath string
|
modelPath string
|
||||||
@@ -17,6 +25,7 @@ type WhisperBinary struct {
|
|||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
running bool
|
running bool
|
||||||
cmd *exec.Cmd
|
cmd *exec.Cmd
|
||||||
|
audioBuffer []int16
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
||||||
@@ -29,3 +38,221 @@ func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
|||||||
cancel: cancel,
|
cancel: cancel,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (w *WhisperBinary) StartRecording() error {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
|
||||||
|
if w.running {
|
||||||
|
return errors.New("recording is already in progress")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := portaudio.Initialize(); err != nil {
|
||||||
|
return fmt.Errorf("portaudio init failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize audio buffer
|
||||||
|
w.audioBuffer = make([]int16, 0)
|
||||||
|
|
||||||
|
in := make([]int16, 1024) // buffer size
|
||||||
|
stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in)
|
||||||
|
if err != nil {
|
||||||
|
if paErr := portaudio.Terminate(); paErr != nil {
|
||||||
|
return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("failed to open microphone: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a dummy command just for context management
|
||||||
|
w.cmd = exec.CommandContext(w.ctx, "sh", "-c", "echo 'dummy command'")
|
||||||
|
|
||||||
|
go w.recordAudio(stream, in)
|
||||||
|
w.running = true
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) {
|
||||||
|
defer func() {
|
||||||
|
_ = portaudio.Terminate() // ignoring error as we're shutting down
|
||||||
|
}()
|
||||||
|
|
||||||
|
if err := stream.Start(); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-w.ctx.Done():
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
if !w.running {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := stream.Read(); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append samples to buffer
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.audioBuffer == nil {
|
||||||
|
w.audioBuffer = make([]int16, 0)
|
||||||
|
}
|
||||||
|
// Make a copy of the input buffer to avoid overwriting
|
||||||
|
tempBuffer := make([]int16, len(in))
|
||||||
|
copy(tempBuffer, in)
|
||||||
|
w.audioBuffer = append(w.audioBuffer, tempBuffer...)
|
||||||
|
w.mu.Unlock()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WhisperBinary) StopRecording() (string, error) {
|
||||||
|
w.mu.Lock()
|
||||||
|
if !w.running {
|
||||||
|
w.mu.Unlock()
|
||||||
|
return "", errors.New("not currently recording")
|
||||||
|
}
|
||||||
|
|
||||||
|
w.running = false
|
||||||
|
w.cancel() // This will stop the recording goroutine
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
// Save the recorded audio to a temporary file
|
||||||
|
tempFile, err := w.saveAudioToTempFile()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to save audio to temp file: %w", err)
|
||||||
|
}
|
||||||
|
defer os.Remove(tempFile) // Clean up the temp file
|
||||||
|
|
||||||
|
// Run the whisper binary
|
||||||
|
cmd := exec.CommandContext(w.ctx, w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile)
|
||||||
|
|
||||||
|
var outBuf, errBuf bytes.Buffer
|
||||||
|
cmd.Stdout = &outBuf
|
||||||
|
cmd.Stderr = &errBuf
|
||||||
|
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
return "", fmt.Errorf("whisper binary failed: %w, stderr: %s", err, errBuf.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
result := outBuf.String()
|
||||||
|
|
||||||
|
// Clean up audio buffer
|
||||||
|
w.mu.Lock()
|
||||||
|
w.audioBuffer = nil
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// saveAudioToTempFile saves the recorded audio data to a temporary WAV file
|
||||||
|
func (w *WhisperBinary) saveAudioToTempFile() (string, error) {
|
||||||
|
// Create temporary WAV file
|
||||||
|
tempFile, err := os.CreateTemp("", "recording_*.wav")
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to create temp file: %w", err)
|
||||||
|
}
|
||||||
|
defer tempFile.Close()
|
||||||
|
|
||||||
|
// Write WAV header and data
|
||||||
|
err = w.writeWAVFile(tempFile.Name())
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to write WAV file: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return tempFile.Name(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeWAVFile creates a WAV file from the recorded audio data
|
||||||
|
func (w *WhisperBinary) writeWAVFile(filename string) error {
|
||||||
|
// Open file for writing
|
||||||
|
file, err := os.Create(filename)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
w.mu.Lock()
|
||||||
|
audioData := make([]int16, len(w.audioBuffer))
|
||||||
|
copy(audioData, w.audioBuffer)
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
if len(audioData) == 0 {
|
||||||
|
return errors.New("no audio data to write")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate data size (number of samples * size of int16)
|
||||||
|
dataSize := len(audioData) * 2 // 2 bytes per int16 sample
|
||||||
|
|
||||||
|
// Write WAV header with the correct data size
|
||||||
|
header := w.createWAVHeader(16000, 1, 16, dataSize)
|
||||||
|
_, err = file.Write(header)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write audio data
|
||||||
|
for _, sample := range audioData {
|
||||||
|
// Write little-endian 16-bit sample
|
||||||
|
_, err := file.Write([]byte{byte(sample), byte(sample >> 8)})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// createWAVHeader creates a WAV file header
|
||||||
|
func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte {
|
||||||
|
header := make([]byte, 44)
|
||||||
|
copy(header[0:4], "RIFF")
|
||||||
|
// Total file size will be updated later
|
||||||
|
copy(header[8:12], "WAVE")
|
||||||
|
copy(header[12:16], "fmt ")
|
||||||
|
// fmt chunk size (16 for PCM)
|
||||||
|
header[16] = 16
|
||||||
|
header[17] = 0
|
||||||
|
header[18] = 0
|
||||||
|
header[19] = 0
|
||||||
|
// Audio format (1 = PCM)
|
||||||
|
header[20] = 1
|
||||||
|
header[21] = 0
|
||||||
|
// Number of channels
|
||||||
|
header[22] = byte(channels)
|
||||||
|
header[23] = 0
|
||||||
|
// Sample rate
|
||||||
|
header[24] = byte(sampleRate)
|
||||||
|
header[25] = byte(sampleRate >> 8)
|
||||||
|
header[26] = byte(sampleRate >> 16)
|
||||||
|
header[27] = byte(sampleRate >> 24)
|
||||||
|
// Byte rate
|
||||||
|
byteRate := sampleRate * channels * bitsPerSample / 8
|
||||||
|
header[28] = byte(byteRate)
|
||||||
|
header[29] = byte(byteRate >> 8)
|
||||||
|
header[30] = byte(byteRate >> 16)
|
||||||
|
header[31] = byte(byteRate >> 24)
|
||||||
|
// Block align
|
||||||
|
blockAlign := channels * bitsPerSample / 8
|
||||||
|
header[32] = byte(blockAlign)
|
||||||
|
header[33] = 0
|
||||||
|
// Bits per sample
|
||||||
|
header[34] = byte(bitsPerSample)
|
||||||
|
header[35] = 0
|
||||||
|
// "data" subchunk
|
||||||
|
copy(header[36:40], "data")
|
||||||
|
// Data size
|
||||||
|
header[40] = byte(dataSize)
|
||||||
|
header[41] = byte(dataSize >> 8)
|
||||||
|
header[42] = byte(dataSize >> 16)
|
||||||
|
header[43] = byte(dataSize >> 24)
|
||||||
|
|
||||||
|
return header
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WhisperBinary) IsRecording() bool {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.running
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user