Dep (stt): use ffmpeg instead of portaudio
This commit is contained in:
7
Makefile
7
Makefile
@@ -143,11 +143,10 @@ build-whisper: ## Build whisper.cpp from source in batteries directory
|
||||
|
||||
download-whisper-model: ## Download Whisper model for STT in batteries directory
|
||||
@echo "Downloading Whisper model for STT..."
|
||||
@if [ ! -d "batteries/whisper.cpp" ]; then \
|
||||
echo "Please run 'make setup-whisper' first to clone the repository."; \
|
||||
exit 1; \
|
||||
# Create the models directory when it is missing. The command before `fi`
# must end with `;` — without it the shell treats `fi` as an argument to
# mkdir and the `if` is never closed.
@if [ ! -d "batteries/whisper.cpp/models" ]; then \
	mkdir -p "batteries/whisper.cpp/models"; \
fi
|
||||
@cd batteries/whisper.cpp && bash ./models/download-ggml-model.sh large-v3-turbo-q5_0
|
||||
curl -o batteries/whisper.cpp/models/ggml-large-v3-turbo-q5_0.bin -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin?download=true"
|
||||
@echo "Whisper model downloaded successfully!"
|
||||
|
||||
# Docker targets for STT/TTS services (in batteries directory)
|
||||
|
||||
@@ -6,19 +6,27 @@ services:
|
||||
ports:
|
||||
- "8081:8081"
|
||||
volumes:
|
||||
- whisper_models:/app/models
|
||||
- ./whisper.cpp/models/ggml-large-v3-turbo-q5_0.bin:/app/models/ggml-large-v3-turbo-q5_0.bin
|
||||
working_dir: /app
|
||||
entrypoint: ""
|
||||
command: >
|
||||
sh -c "
|
||||
if [ ! -f /app/models/ggml-large-v3-turbo.bin ]; then
|
||||
echo 'Downloading ggml-large-v3-turbo model...'
|
||||
./download-ggml-model.sh large-v3-turbo /app/models
|
||||
if [ ! -f /app/models/ggml-large-v3-turbo-q5_0.bin ]; then
|
||||
echo 'Downloading ggml-large-v3-turboq5_0 model...'
|
||||
curl -o /app/models/ggml-large-v3-turbo-q5_0.bin -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin?download=true"
|
||||
fi &&
|
||||
./build/bin/whisper-server -m /app/models/ggml-large-v3-turbo.bin -t 4 -p 1 --port 8081 --host 0.0.0.0
|
||||
./build/bin/whisper-server -m /app/models/ggml-large-v3-turbo-q5_0.bin -t 4 -p 1 --port 8081 --host 0.0.0.0
|
||||
"
|
||||
environment:
|
||||
- WHISPER_LOG_LEVEL=3
|
||||
# For GPU support, uncomment the following lines:
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
# Restart policy in case the service fails
|
||||
restart: unless-stopped
|
||||
|
||||
@@ -45,7 +53,5 @@ services:
|
||||
volumes:
|
||||
models:
|
||||
driver: local
|
||||
audio:
|
||||
driver: local
|
||||
whisper_models:
|
||||
driver: local
|
||||
|
||||
132
extra/stt.go
132
extra/stt.go
@@ -6,18 +6,10 @@ package extra
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"gf-lt/config"
|
||||
"io"
|
||||
"log/slog"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
"github.com/gordonklaus/portaudio"
|
||||
)
|
||||
|
||||
// specialRE matches any bracketed span (shortest match), e.g. "[_BEG_]".
// The StopRecording methods use it to strip special tokens from the
// transcription text.
var specialRE = regexp.MustCompile(`\[.*?\]`)
|
||||
@@ -44,14 +36,6 @@ func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
|
||||
return NewWhisperServer(logger, cfg)
|
||||
}
|
||||
|
||||
// WhisperServer is the speech-to-text backend that talks to a whisper HTTP
// server: microphone samples are collected in AudioBuffer and uploaded to
// ServerURL when the recording is stopped.
type WhisperServer struct {
	// logger receives diagnostics from recording and upload.
	logger *slog.Logger
	// ServerURL is the endpoint the recording is POSTed to.
	ServerURL string
	// SampleRate is the capture rate handed to the microphone stream.
	SampleRate int
	// AudioBuffer accumulates the raw samples of the current recording.
	AudioBuffer *bytes.Buffer
	// recording is true while a capture is in progress.
	recording bool
}
|
||||
|
||||
func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
|
||||
return &WhisperServer{
|
||||
logger: logger,
|
||||
@@ -61,69 +45,6 @@ func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
|
||||
}
|
||||
}
|
||||
|
||||
func (stt *WhisperServer) StartRecording() error {
|
||||
if err := stt.microphoneStream(stt.SampleRate); err != nil {
|
||||
return fmt.Errorf("failed to init microphone: %w", err)
|
||||
}
|
||||
stt.recording = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (stt *WhisperServer) StopRecording() (string, error) {
|
||||
stt.recording = false
|
||||
// wait loop to finish?
|
||||
if stt.AudioBuffer == nil {
|
||||
err := errors.New("unexpected nil AudioBuffer")
|
||||
stt.logger.Error(err.Error())
|
||||
return "", err
|
||||
}
|
||||
// Create WAV header first
|
||||
body := &bytes.Buffer{}
|
||||
writer := multipart.NewWriter(body)
|
||||
// Add audio file part
|
||||
part, err := writer.CreateFormFile("file", "recording.wav")
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
// Stream directly to multipart writer: header + raw data
|
||||
dataSize := stt.AudioBuffer.Len()
|
||||
stt.writeWavHeader(part, dataSize)
|
||||
if _, err := io.Copy(part, stt.AudioBuffer); err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
// Reset buffer for next recording
|
||||
stt.AudioBuffer.Reset()
|
||||
// Add response format field
|
||||
err = writer.WriteField("response_format", "text")
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
if writer.Close() != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
// Send request
|
||||
resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
// Read and print response
|
||||
responseTextBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
resptext := strings.TrimRight(string(responseTextBytes), "\n")
|
||||
// in case there are special tokens like [_BEG_]
|
||||
resptext = specialRE.ReplaceAllString(resptext, "")
|
||||
return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
|
||||
}
|
||||
|
||||
func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
|
||||
header := make([]byte, 44)
|
||||
copy(header[0:4], "RIFF")
|
||||
@@ -147,56 +68,3 @@ func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
|
||||
func (stt *WhisperServer) IsRecording() bool {
|
||||
return stt.recording
|
||||
}
|
||||
|
||||
func (stt *WhisperServer) microphoneStream(sampleRate int) error {
|
||||
// Temporarily redirect stderr to suppress ALSA warnings during PortAudio init
|
||||
origStderr, errDup := syscall.Dup(syscall.Stderr)
|
||||
if errDup != nil {
|
||||
return fmt.Errorf("failed to dup stderr: %w", errDup)
|
||||
}
|
||||
nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
|
||||
if err != nil {
|
||||
_ = syscall.Close(origStderr) // Close the dup'd fd if open fails
|
||||
return fmt.Errorf("failed to open /dev/null: %w", err)
|
||||
}
|
||||
// redirect stderr
|
||||
_ = syscall.Dup2(nullFD, syscall.Stderr)
|
||||
// Initialize PortAudio (this is where ALSA warnings occur)
|
||||
defer func() {
|
||||
// Restore stderr
|
||||
_ = syscall.Dup2(origStderr, syscall.Stderr)
|
||||
_ = syscall.Close(origStderr)
|
||||
_ = syscall.Close(nullFD)
|
||||
}()
|
||||
if err := portaudio.Initialize(); err != nil {
|
||||
return fmt.Errorf("portaudio init failed: %w", err)
|
||||
}
|
||||
in := make([]int16, 64)
|
||||
stream, err := portaudio.OpenDefaultStream(1, 0, float64(sampleRate), len(in), in)
|
||||
if err != nil {
|
||||
if paErr := portaudio.Terminate(); paErr != nil {
|
||||
return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
|
||||
}
|
||||
return fmt.Errorf("failed to open microphone: %w", err)
|
||||
}
|
||||
go func(stream *portaudio.Stream) {
|
||||
if err := stream.Start(); err != nil {
|
||||
stt.logger.Error("microphoneStream", "error", err)
|
||||
return
|
||||
}
|
||||
for {
|
||||
if !stt.IsRecording() {
|
||||
return
|
||||
}
|
||||
if err := stream.Read(); err != nil {
|
||||
stt.logger.Error("reading stream", "error", err)
|
||||
return
|
||||
}
|
||||
if err := binary.Write(stt.AudioBuffer, binary.LittleEndian, in); err != nil {
|
||||
stt.logger.Error("writing to buffer", "error", err)
|
||||
return
|
||||
}
|
||||
}
|
||||
}(stream)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -9,15 +9,13 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"gf-lt/config"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
||||
"github.com/gordonklaus/portaudio"
|
||||
"time"
|
||||
)
|
||||
|
||||
type WhisperBinary struct {
|
||||
@@ -25,11 +23,143 @@ type WhisperBinary struct {
|
||||
whisperPath string
|
||||
modelPath string
|
||||
lang string
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
mu sync.Mutex
|
||||
recording bool
|
||||
audioBuffer []int16
|
||||
// Per-recording fields (protected by mu)
|
||||
mu sync.Mutex
|
||||
recording bool
|
||||
tempFile string
|
||||
ctx context.Context
|
||||
cancel context.CancelFunc
|
||||
cmd *exec.Cmd
|
||||
cmdMu sync.Mutex
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) StartRecording() error {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if w.recording {
|
||||
return errors.New("recording is already in progress")
|
||||
}
|
||||
// Fresh context for this recording
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
w.ctx = ctx
|
||||
w.cancel = cancel
|
||||
// Create temporary file
|
||||
tempFile, err := os.CreateTemp("", "recording_*.wav")
|
||||
if err != nil {
|
||||
cancel()
|
||||
return fmt.Errorf("failed to create temp file: %w", err)
|
||||
}
|
||||
tempFile.Close()
|
||||
w.tempFile = tempFile.Name()
|
||||
// ffmpeg command: capture from default microphone, write WAV
|
||||
args := []string{
|
||||
"-f", "alsa", // or "pulse" if preferred
|
||||
"-i", "default",
|
||||
"-acodec", "pcm_s16le",
|
||||
"-ar", "16000",
|
||||
"-ac", "1",
|
||||
"-y", // overwrite output file
|
||||
w.tempFile,
|
||||
}
|
||||
cmd := exec.CommandContext(w.ctx, "ffmpeg", args...)
|
||||
// Capture stderr for debugging (optional, but useful for diagnosing)
|
||||
stderr, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
cancel()
|
||||
os.Remove(w.tempFile)
|
||||
return fmt.Errorf("failed to create stderr pipe: %w", err)
|
||||
}
|
||||
go func() {
|
||||
buf := make([]byte, 1024)
|
||||
for {
|
||||
n, err := stderr.Read(buf)
|
||||
if n > 0 {
|
||||
w.logger.Debug("ffmpeg stderr", "output", string(buf[:n]))
|
||||
}
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
}()
|
||||
w.cmdMu.Lock()
|
||||
w.cmd = cmd
|
||||
w.cmdMu.Unlock()
|
||||
if err := cmd.Start(); err != nil {
|
||||
cancel()
|
||||
os.Remove(w.tempFile)
|
||||
return fmt.Errorf("failed to start ffmpeg: %w", err)
|
||||
}
|
||||
w.recording = true
|
||||
w.logger.Debug("Recording started", "file", w.tempFile)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) StopRecording() (string, error) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if !w.recording {
|
||||
return "", errors.New("not currently recording")
|
||||
}
|
||||
w.recording = false
|
||||
// Gracefully stop ffmpeg
|
||||
w.cmdMu.Lock()
|
||||
if w.cmd != nil && w.cmd.Process != nil {
|
||||
w.logger.Debug("Sending SIGTERM to ffmpeg")
|
||||
w.cmd.Process.Signal(syscall.SIGTERM)
|
||||
// Wait for process to exit (up to 2 seconds)
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
done <- w.cmd.Wait()
|
||||
}()
|
||||
select {
|
||||
case <-done:
|
||||
w.logger.Debug("ffmpeg exited after SIGTERM")
|
||||
case <-time.After(2 * time.Second):
|
||||
w.logger.Warn("ffmpeg did not exit, sending SIGKILL")
|
||||
w.cmd.Process.Kill()
|
||||
<-done
|
||||
}
|
||||
}
|
||||
w.cmdMu.Unlock()
|
||||
// Cancel context (already done, but for cleanliness)
|
||||
if w.cancel != nil {
|
||||
w.cancel()
|
||||
}
|
||||
// Validate temp file
|
||||
if w.tempFile == "" {
|
||||
return "", errors.New("no recording file")
|
||||
}
|
||||
defer os.Remove(w.tempFile)
|
||||
info, err := os.Stat(w.tempFile)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to stat temp file: %w", err)
|
||||
}
|
||||
if info.Size() < 44 { // WAV header is 44 bytes
|
||||
// Log ffmpeg stderr? Already captured in debug logs.
|
||||
return "", fmt.Errorf("recording file too small (%d bytes), possibly no audio captured", info.Size())
|
||||
}
|
||||
// Run whisper.cpp binary
|
||||
cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, w.tempFile)
|
||||
var outBuf, errBuf bytes.Buffer
|
||||
cmd.Stdout = &outBuf
|
||||
cmd.Stderr = &errBuf
|
||||
if err := cmd.Run(); err != nil {
|
||||
w.logger.Error("whisper binary failed",
|
||||
"error", err,
|
||||
"stderr", errBuf.String(),
|
||||
"file_size", info.Size())
|
||||
return "", fmt.Errorf("whisper binary failed: %w (stderr: %s)", err, errBuf.String())
|
||||
}
|
||||
result := strings.TrimRight(outBuf.String(), "\n")
|
||||
result = specialRE.ReplaceAllString(result, "")
|
||||
return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil
|
||||
}
|
||||
|
||||
// IsRecording returns true if a recording is in progress.
|
||||
func (w *WhisperBinary) IsRecording() bool {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.recording
|
||||
}
|
||||
|
||||
func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
||||
@@ -44,283 +174,3 @@ func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
||||
cancel: cancel,
|
||||
}
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) StartRecording() error {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if w.recording {
|
||||
return errors.New("recording is already in progress")
|
||||
}
|
||||
// If context is cancelled, create a new one for the next recording session
|
||||
if w.ctx.Err() != nil {
|
||||
w.logger.Debug("Context cancelled, creating new context")
|
||||
w.ctx, w.cancel = context.WithCancel(context.Background())
|
||||
}
|
||||
// Temporarily redirect stderr to suppress ALSA warnings during PortAudio init
|
||||
origStderr, errDup := syscall.Dup(syscall.Stderr)
|
||||
if errDup != nil {
|
||||
return fmt.Errorf("failed to dup stderr: %w", errDup)
|
||||
}
|
||||
nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
|
||||
if err != nil {
|
||||
_ = syscall.Close(origStderr) // Close the dup'd fd if open fails
|
||||
return fmt.Errorf("failed to open /dev/null: %w", err)
|
||||
}
|
||||
// redirect stderr
|
||||
_ = syscall.Dup2(nullFD, syscall.Stderr)
|
||||
// Initialize PortAudio (this is where ALSA warnings occur)
|
||||
portaudioErr := portaudio.Initialize()
|
||||
defer func() {
|
||||
// Restore stderr
|
||||
_ = syscall.Dup2(origStderr, syscall.Stderr)
|
||||
_ = syscall.Close(origStderr)
|
||||
_ = syscall.Close(nullFD)
|
||||
}()
|
||||
if portaudioErr != nil {
|
||||
return fmt.Errorf("portaudio init failed: %w", portaudioErr)
|
||||
}
|
||||
// Initialize audio buffer
|
||||
w.audioBuffer = make([]int16, 0)
|
||||
in := make([]int16, 1024) // buffer size
|
||||
stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in)
|
||||
if err != nil {
|
||||
if paErr := portaudio.Terminate(); paErr != nil {
|
||||
return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
|
||||
}
|
||||
return fmt.Errorf("failed to open microphone: %w", err)
|
||||
}
|
||||
go w.recordAudio(stream, in)
|
||||
w.recording = true
|
||||
w.logger.Debug("Recording started")
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) {
|
||||
defer func() {
|
||||
w.logger.Debug("recordAudio defer function called")
|
||||
_ = stream.Stop() // Stop the stream
|
||||
_ = portaudio.Terminate() // ignoring error as we're shutting down
|
||||
w.logger.Debug("recordAudio terminated")
|
||||
}()
|
||||
w.logger.Debug("Starting audio stream")
|
||||
if err := stream.Start(); err != nil {
|
||||
w.logger.Error("Failed to start audio stream", "error", err)
|
||||
return
|
||||
}
|
||||
w.logger.Debug("Audio stream started, entering recording loop")
|
||||
for {
|
||||
select {
|
||||
case <-w.ctx.Done():
|
||||
w.logger.Debug("Context done, exiting recording loop")
|
||||
return
|
||||
default:
|
||||
// Check recording status with minimal lock time
|
||||
w.mu.Lock()
|
||||
recording := w.recording
|
||||
w.mu.Unlock()
|
||||
|
||||
if !recording {
|
||||
w.logger.Debug("Recording flag is false, exiting recording loop")
|
||||
return
|
||||
}
|
||||
if err := stream.Read(); err != nil {
|
||||
w.logger.Error("Error reading from stream", "error", err)
|
||||
return
|
||||
}
|
||||
// Append samples to buffer - only acquire lock when necessary
|
||||
w.mu.Lock()
|
||||
if w.audioBuffer == nil {
|
||||
w.audioBuffer = make([]int16, 0)
|
||||
}
|
||||
// Make a copy of the input buffer to avoid overwriting
|
||||
tempBuffer := make([]int16, len(in))
|
||||
copy(tempBuffer, in)
|
||||
w.audioBuffer = append(w.audioBuffer, tempBuffer...)
|
||||
w.mu.Unlock()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) StopRecording() (string, error) {
|
||||
w.logger.Debug("StopRecording called")
|
||||
w.mu.Lock()
|
||||
if !w.recording {
|
||||
w.mu.Unlock()
|
||||
return "", errors.New("not currently recording")
|
||||
}
|
||||
w.logger.Debug("Setting recording to false and cancelling context")
|
||||
w.recording = false
|
||||
w.cancel() // This will stop the recording goroutine
|
||||
w.mu.Unlock()
|
||||
// // Small delay to allow the recording goroutine to react to context cancellation
|
||||
// time.Sleep(20 * time.Millisecond)
|
||||
// Save the recorded audio to a temporary file
|
||||
tempFile, err := w.saveAudioToTempFile()
|
||||
if err != nil {
|
||||
w.logger.Error("Error saving audio to temp file", "error", err)
|
||||
return "", fmt.Errorf("failed to save audio to temp file: %w", err)
|
||||
}
|
||||
w.logger.Debug("Saved audio to temp file", "file", tempFile)
|
||||
// Run the whisper binary with a separate context to avoid cancellation during transcription
|
||||
cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile, "2>/dev/null")
|
||||
var outBuf bytes.Buffer
|
||||
cmd.Stdout = &outBuf
|
||||
// Redirect stderr to suppress ALSA warnings and other stderr output
|
||||
cmd.Stderr = io.Discard // Suppress stderr output from whisper binary
|
||||
w.logger.Debug("Running whisper binary command")
|
||||
if err := cmd.Run(); err != nil {
|
||||
// Clean up audio buffer
|
||||
w.mu.Lock()
|
||||
w.audioBuffer = nil
|
||||
w.mu.Unlock()
|
||||
// Since we're suppressing stderr, we'll just log that the command failed
|
||||
w.logger.Error("Error running whisper binary", "error", err)
|
||||
return "", fmt.Errorf("whisper binary failed: %w", err)
|
||||
}
|
||||
result := outBuf.String()
|
||||
w.logger.Debug("Whisper binary completed", "result", result)
|
||||
// Clean up audio buffer
|
||||
w.mu.Lock()
|
||||
w.audioBuffer = nil
|
||||
w.mu.Unlock()
|
||||
// Clean up the temporary file after transcription
|
||||
w.logger.Debug("StopRecording completed")
|
||||
os.Remove(tempFile)
|
||||
result = strings.TrimRight(result, "\n")
|
||||
// in case there are special tokens like [_BEG_]
|
||||
result = specialRE.ReplaceAllString(result, "")
|
||||
return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil
|
||||
}
|
||||
|
||||
// saveAudioToTempFile saves the recorded audio data to a temporary WAV file
|
||||
func (w *WhisperBinary) saveAudioToTempFile() (string, error) {
|
||||
w.logger.Debug("saveAudioToTempFile called")
|
||||
// Create temporary WAV file
|
||||
tempFile, err := os.CreateTemp("", "recording_*.wav")
|
||||
if err != nil {
|
||||
w.logger.Error("Failed to create temp file", "error", err)
|
||||
return "", fmt.Errorf("failed to create temp file: %w", err)
|
||||
}
|
||||
w.logger.Debug("Created temp file", "file", tempFile.Name())
|
||||
defer tempFile.Close()
|
||||
|
||||
// Write WAV header and data
|
||||
w.logger.Debug("About to write WAV file", "file", tempFile.Name())
|
||||
err = w.writeWAVFile(tempFile.Name())
|
||||
if err != nil {
|
||||
w.logger.Error("Error writing WAV file", "error", err)
|
||||
return "", fmt.Errorf("failed to write WAV file: %w", err)
|
||||
}
|
||||
w.logger.Debug("WAV file written successfully", "file", tempFile.Name())
|
||||
|
||||
return tempFile.Name(), nil
|
||||
}
|
||||
|
||||
// writeWAVFile creates a WAV file from the recorded audio data
|
||||
func (w *WhisperBinary) writeWAVFile(filename string) error {
|
||||
w.logger.Debug("writeWAVFile called", "filename", filename)
|
||||
// Open file for writing
|
||||
file, err := os.Create(filename)
|
||||
if err != nil {
|
||||
w.logger.Error("Error creating file", "error", err)
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
w.logger.Debug("About to acquire mutex in writeWAVFile")
|
||||
w.mu.Lock()
|
||||
w.logger.Debug("Locked mutex, copying audio buffer")
|
||||
audioData := make([]int16, len(w.audioBuffer))
|
||||
copy(audioData, w.audioBuffer)
|
||||
w.mu.Unlock()
|
||||
w.logger.Debug("Unlocked mutex", "audio_data_length", len(audioData))
|
||||
|
||||
if len(audioData) == 0 {
|
||||
w.logger.Warn("No audio data to write")
|
||||
return errors.New("no audio data to write")
|
||||
}
|
||||
|
||||
// Calculate data size (number of samples * size of int16)
|
||||
dataSize := len(audioData) * 2 // 2 bytes per int16 sample
|
||||
w.logger.Debug("Calculated data size", "size", dataSize)
|
||||
|
||||
// Write WAV header with the correct data size
|
||||
header := w.createWAVHeader(16000, 1, 16, dataSize)
|
||||
_, err = file.Write(header)
|
||||
if err != nil {
|
||||
w.logger.Error("Error writing WAV header", "error", err)
|
||||
return err
|
||||
}
|
||||
w.logger.Debug("WAV header written successfully")
|
||||
|
||||
// Write audio data
|
||||
w.logger.Debug("About to write audio data samples")
|
||||
for i, sample := range audioData {
|
||||
// Write little-endian 16-bit sample
|
||||
_, err := file.Write([]byte{byte(sample), byte(sample >> 8)})
|
||||
if err != nil {
|
||||
w.logger.Error("Error writing sample", "index", i, "error", err)
|
||||
return err
|
||||
}
|
||||
// Log progress every 10000 samples to avoid too much output
|
||||
if i%10000 == 0 {
|
||||
w.logger.Debug("Written samples", "count", i)
|
||||
}
|
||||
}
|
||||
w.logger.Debug("All audio data written successfully")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// createWAVHeader creates a WAV file header
|
||||
func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte {
|
||||
header := make([]byte, 44)
|
||||
copy(header[0:4], "RIFF")
|
||||
// Total file size will be updated later
|
||||
copy(header[8:12], "WAVE")
|
||||
copy(header[12:16], "fmt ")
|
||||
// fmt chunk size (16 for PCM)
|
||||
header[16] = 16
|
||||
header[17] = 0
|
||||
header[18] = 0
|
||||
header[19] = 0
|
||||
// Audio format (1 = PCM)
|
||||
header[20] = 1
|
||||
header[21] = 0
|
||||
// Number of channels
|
||||
header[22] = byte(channels)
|
||||
header[23] = 0
|
||||
// Sample rate
|
||||
header[24] = byte(sampleRate)
|
||||
header[25] = byte(sampleRate >> 8)
|
||||
header[26] = byte(sampleRate >> 16)
|
||||
header[27] = byte(sampleRate >> 24)
|
||||
// Byte rate
|
||||
byteRate := sampleRate * channels * bitsPerSample / 8
|
||||
header[28] = byte(byteRate)
|
||||
header[29] = byte(byteRate >> 8)
|
||||
header[30] = byte(byteRate >> 16)
|
||||
header[31] = byte(byteRate >> 24)
|
||||
// Block align
|
||||
blockAlign := channels * bitsPerSample / 8
|
||||
header[32] = byte(blockAlign)
|
||||
header[33] = 0
|
||||
// Bits per sample
|
||||
header[34] = byte(bitsPerSample)
|
||||
header[35] = 0
|
||||
// "data" subchunk
|
||||
copy(header[36:40], "data")
|
||||
// Data size
|
||||
header[40] = byte(dataSize)
|
||||
header[41] = byte(dataSize >> 8)
|
||||
header[42] = byte(dataSize >> 16)
|
||||
header[43] = byte(dataSize >> 24)
|
||||
|
||||
return header
|
||||
}
|
||||
|
||||
func (w *WhisperBinary) IsRecording() bool {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.recording
|
||||
}
|
||||
|
||||
156
extra/whisper_server.go
Normal file
156
extra/whisper_server.go
Normal file
@@ -0,0 +1,156 @@
|
||||
//go:build extra
|
||||
// +build extra
|
||||
|
||||
package extra
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// WhisperServer is the speech-to-text backend that records the microphone
// through an ffmpeg child process and uploads the audio to a whisper HTTP
// server.
type WhisperServer struct {
	// logger receives diagnostics from recording and upload.
	logger *slog.Logger
	// ServerURL is the endpoint the recording is POSTed to.
	ServerURL string
	// SampleRate is the capture rate passed to ffmpeg via -ar.
	SampleRate int
	// AudioBuffer accumulates the raw PCM read from ffmpeg's stdout.
	AudioBuffer *bytes.Buffer
	// recording is true while a capture is in progress; guarded by mu.
	recording bool
	// mu guards recording and AudioBuffer.
	mu sync.Mutex
	// cmd is the ffmpeg process of the current recording; guarded by cmdMu.
	cmd *exec.Cmd
	// stopCh is closed when the current recording is stopped; guarded by
	// cmdMu.
	stopCh chan struct{}
	// cmdMu guards cmd and stopCh.
	cmdMu sync.Mutex
}
|
||||
|
||||
func (stt *WhisperServer) StartRecording() error {
|
||||
stt.mu.Lock()
|
||||
defer stt.mu.Unlock()
|
||||
if stt.recording {
|
||||
return nil
|
||||
}
|
||||
// Build ffmpeg command for microphone capture
|
||||
args := []string{
|
||||
"-f", "alsa",
|
||||
"-i", "default",
|
||||
"-acodec", "pcm_s16le",
|
||||
"-ar", fmt.Sprint(stt.SampleRate),
|
||||
"-ac", "1",
|
||||
"-f", "s16le",
|
||||
"-",
|
||||
}
|
||||
cmd := exec.Command("ffmpeg", args...)
|
||||
stdout, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get stdout pipe: %w", err)
|
||||
}
|
||||
stt.cmdMu.Lock()
|
||||
stt.cmd = cmd
|
||||
stt.stopCh = make(chan struct{})
|
||||
stt.cmdMu.Unlock()
|
||||
if err := cmd.Start(); err != nil {
|
||||
return fmt.Errorf("failed to start ffmpeg: %w", err)
|
||||
}
|
||||
stt.recording = true
|
||||
stt.AudioBuffer.Reset()
|
||||
// Read PCM data in goroutine
|
||||
go func() {
|
||||
buf := make([]byte, 4096)
|
||||
for {
|
||||
select {
|
||||
case <-stt.stopCh:
|
||||
return
|
||||
default:
|
||||
n, err := stdout.Read(buf)
|
||||
if n > 0 {
|
||||
stt.mu.Lock()
|
||||
stt.AudioBuffer.Write(buf[:n])
|
||||
stt.mu.Unlock()
|
||||
}
|
||||
if err != nil {
|
||||
if err != io.EOF {
|
||||
stt.logger.Error("recording read error", "error", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (stt *WhisperServer) StopRecording() (string, error) {
|
||||
stt.mu.Lock()
|
||||
defer stt.mu.Unlock()
|
||||
if !stt.recording {
|
||||
return "", errors.New("not recording")
|
||||
}
|
||||
stt.recording = false
|
||||
// Stop ffmpeg
|
||||
stt.cmdMu.Lock()
|
||||
if stt.cmd != nil && stt.cmd.Process != nil {
|
||||
stt.cmd.Process.Kill()
|
||||
stt.cmd.Wait()
|
||||
}
|
||||
close(stt.stopCh)
|
||||
stt.cmdMu.Unlock()
|
||||
// Rest of StopRecording unchanged (WAV header + HTTP upload)
|
||||
// ...
|
||||
stt.recording = false
|
||||
// wait loop to finish?
|
||||
if stt.AudioBuffer == nil {
|
||||
err := errors.New("unexpected nil AudioBuffer")
|
||||
stt.logger.Error(err.Error())
|
||||
return "", err
|
||||
}
|
||||
// Create WAV header first
|
||||
body := &bytes.Buffer{}
|
||||
writer := multipart.NewWriter(body)
|
||||
// Add audio file part
|
||||
part, err := writer.CreateFormFile("file", "recording.wav")
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
// Stream directly to multipart writer: header + raw data
|
||||
dataSize := stt.AudioBuffer.Len()
|
||||
stt.writeWavHeader(part, dataSize)
|
||||
if _, err := io.Copy(part, stt.AudioBuffer); err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
// Reset buffer for next recording
|
||||
stt.AudioBuffer.Reset()
|
||||
// Add response format field
|
||||
err = writer.WriteField("response_format", "text")
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
if writer.Close() != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
// Send request
|
||||
resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
// Read and print response
|
||||
responseTextBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
stt.logger.Error("fn: StopRecording", "error", err)
|
||||
return "", err
|
||||
}
|
||||
resptext := strings.TrimRight(string(responseTextBytes), "\n")
|
||||
// in case there are special tokens like [_BEG_]
|
||||
resptext = specialRE.ReplaceAllString(resptext, "")
|
||||
return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
|
||||
}
|
||||
1
go.mod
1
go.mod
@@ -9,7 +9,6 @@ require (
|
||||
github.com/PuerkitoBio/goquery v1.11.0
|
||||
github.com/gdamore/tcell/v2 v2.13.2
|
||||
github.com/glebarez/go-sqlite v1.22.0
|
||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
|
||||
github.com/jmoiron/sqlx v1.4.0
|
||||
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
|
||||
github.com/neurosnap/sentences v1.1.2
|
||||
|
||||
2
go.sum
2
go.sum
@@ -37,8 +37,6 @@ github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17k
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b h1:WEuQWBxelOGHA6z9lABqaMLMrfwVyMdN3UgRLT+YUPo=
|
||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco=
|
||||
github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
|
||||
github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
|
||||
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
|
||||
|
||||
Reference in New Issue
Block a user