From c8f00198d6f0ad66269753252f56485ee346d413 Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Sat, 7 Mar 2026 18:13:11 +0300 Subject: [PATCH] Dep (stt): use ffmpeg instead of portaudio --- Makefile | 7 +- batteries/docker-compose.yml | 20 +- extra/stt.go | 132 ----------- extra/whisper_binary.go | 426 ++++++++++++----------------------- extra/whisper_server.go | 156 +++++++++++++ go.mod | 1 - go.sum | 2 - 7 files changed, 310 insertions(+), 434 deletions(-) create mode 100644 extra/whisper_server.go diff --git a/Makefile b/Makefile index 78db940..1490074 100644 --- a/Makefile +++ b/Makefile @@ -143,11 +143,10 @@ build-whisper: ## Build whisper.cpp from source in batteries directory download-whisper-model: ## Download Whisper model for STT in batteries directory @echo "Downloading Whisper model for STT..." - @if [ ! -d "batteries/whisper.cpp" ]; then \ - echo "Please run 'make setup-whisper' first to clone the repository."; \ - exit 1; \ + @if [ ! -d "batteries/whisper.cpp/models" ]; then \ + mkdir -p "batteries/whisper.cpp/models" \ fi - @cd batteries/whisper.cpp && bash ./models/download-ggml-model.sh large-v3-turbo-q5_0 + curl -o batteries/whisper.cpp/models/ggml-large-v3-turbo-q5_0.bin -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin?download=true" @echo "Whisper model downloaded successfully!" # Docker targets for STT/TTS services (in batteries directory) diff --git a/batteries/docker-compose.yml b/batteries/docker-compose.yml index 7cf401b..84b2262 100644 --- a/batteries/docker-compose.yml +++ b/batteries/docker-compose.yml @@ -6,19 +6,27 @@ services: ports: - "8081:8081" volumes: - - whisper_models:/app/models + - ./whisper.cpp/models/ggml-large-v3-turbo-q5_0.bin:/app/models/ggml-large-v3-turbo-q5_0.bin working_dir: /app entrypoint: "" command: > sh -c " - if [ ! -f /app/models/ggml-large-v3-turbo.bin ]; then - echo 'Downloading ggml-large-v3-turbo model...' 
- ./download-ggml-model.sh large-v3-turbo /app/models + if [ ! -f /app/models/ggml-large-v3-turbo-q5_0.bin ]; then + echo 'Downloading ggml-large-v3-turboq5_0 model...' + curl -o /app/models/ggml-large-v3-turbo-q5_0.bin -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin?download=true" fi && - ./build/bin/whisper-server -m /app/models/ggml-large-v3-turbo.bin -t 4 -p 1 --port 8081 --host 0.0.0.0 + ./build/bin/whisper-server -m /app/models/ggml-large-v3-turbo-q5_0.bin -t 4 -p 1 --port 8081 --host 0.0.0.0 " environment: - WHISPER_LOG_LEVEL=3 + # GPU support (NVIDIA) is enabled by the block below; comment out the following lines to run without a GPU: + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] # Restart policy in case the service fails restart: unless-stopped @@ -45,7 +53,5 @@ services: volumes: models: driver: local - audio: - driver: local whisper_models: driver: local diff --git a/extra/stt.go b/extra/stt.go index 86fcf9c..7bbf2fd 100644 --- a/extra/stt.go +++ b/extra/stt.go @@ -6,18 +6,10 @@ package extra import ( "bytes" "encoding/binary" - "errors" - "fmt" "gf-lt/config" "io" "log/slog" - "mime/multipart" - "net/http" "regexp" - "strings" - "syscall" - - "github.com/gordonklaus/portaudio" ) var specialRE = regexp.MustCompile(`\[.*?\]`) @@ -44,14 +36,6 @@ func NewSTT(logger *slog.Logger, cfg *config.Config) STT { return NewWhisperServer(logger, cfg) } -type WhisperServer struct { - logger *slog.Logger - ServerURL string - SampleRate int - AudioBuffer *bytes.Buffer - recording bool -} - func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer { return &WhisperServer{ logger: logger, @@ -61,69 +45,6 @@ func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer { } } -func (stt *WhisperServer) StartRecording() error { - if err := stt.microphoneStream(stt.SampleRate); err != nil { - return fmt.Errorf("failed to init microphone: %w", err) - } - stt.recording = true - return nil -} - -func
(stt *WhisperServer) StopRecording() (string, error) { - stt.recording = false - // wait loop to finish? - if stt.AudioBuffer == nil { - err := errors.New("unexpected nil AudioBuffer") - stt.logger.Error(err.Error()) - return "", err - } - // Create WAV header first - body := &bytes.Buffer{} - writer := multipart.NewWriter(body) - // Add audio file part - part, err := writer.CreateFormFile("file", "recording.wav") - if err != nil { - stt.logger.Error("fn: StopRecording", "error", err) - return "", err - } - // Stream directly to multipart writer: header + raw data - dataSize := stt.AudioBuffer.Len() - stt.writeWavHeader(part, dataSize) - if _, err := io.Copy(part, stt.AudioBuffer); err != nil { - stt.logger.Error("fn: StopRecording", "error", err) - return "", err - } - // Reset buffer for next recording - stt.AudioBuffer.Reset() - // Add response format field - err = writer.WriteField("response_format", "text") - if err != nil { - stt.logger.Error("fn: StopRecording", "error", err) - return "", err - } - if writer.Close() != nil { - stt.logger.Error("fn: StopRecording", "error", err) - return "", err - } - // Send request - resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx - if err != nil { - stt.logger.Error("fn: StopRecording", "error", err) - return "", err - } - defer resp.Body.Close() - // Read and print response - responseTextBytes, err := io.ReadAll(resp.Body) - if err != nil { - stt.logger.Error("fn: StopRecording", "error", err) - return "", err - } - resptext := strings.TrimRight(string(responseTextBytes), "\n") - // in case there are special tokens like [_BEG_] - resptext = specialRE.ReplaceAllString(resptext, "") - return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil -} - func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) { header := make([]byte, 44) copy(header[0:4], "RIFF") @@ -147,56 +68,3 @@ func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) { func (stt 
*WhisperServer) IsRecording() bool { return stt.recording } - -func (stt *WhisperServer) microphoneStream(sampleRate int) error { - // Temporarily redirect stderr to suppress ALSA warnings during PortAudio init - origStderr, errDup := syscall.Dup(syscall.Stderr) - if errDup != nil { - return fmt.Errorf("failed to dup stderr: %w", errDup) - } - nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0) - if err != nil { - _ = syscall.Close(origStderr) // Close the dup'd fd if open fails - return fmt.Errorf("failed to open /dev/null: %w", err) - } - // redirect stderr - _ = syscall.Dup2(nullFD, syscall.Stderr) - // Initialize PortAudio (this is where ALSA warnings occur) - defer func() { - // Restore stderr - _ = syscall.Dup2(origStderr, syscall.Stderr) - _ = syscall.Close(origStderr) - _ = syscall.Close(nullFD) - }() - if err := portaudio.Initialize(); err != nil { - return fmt.Errorf("portaudio init failed: %w", err) - } - in := make([]int16, 64) - stream, err := portaudio.OpenDefaultStream(1, 0, float64(sampleRate), len(in), in) - if err != nil { - if paErr := portaudio.Terminate(); paErr != nil { - return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr) - } - return fmt.Errorf("failed to open microphone: %w", err) - } - go func(stream *portaudio.Stream) { - if err := stream.Start(); err != nil { - stt.logger.Error("microphoneStream", "error", err) - return - } - for { - if !stt.IsRecording() { - return - } - if err := stream.Read(); err != nil { - stt.logger.Error("reading stream", "error", err) - return - } - if err := binary.Write(stt.AudioBuffer, binary.LittleEndian, in); err != nil { - stt.logger.Error("writing to buffer", "error", err) - return - } - } - }(stream) - return nil -} diff --git a/extra/whisper_binary.go b/extra/whisper_binary.go index 6b7ddc8..1c35952 100644 --- a/extra/whisper_binary.go +++ b/extra/whisper_binary.go @@ -9,15 +9,13 @@ import ( "errors" "fmt" "gf-lt/config" - "io" "log/slog" "os" "os/exec" "strings" 
"sync" "syscall" - - "github.com/gordonklaus/portaudio" + "time" ) type WhisperBinary struct { @@ -25,11 +23,143 @@ type WhisperBinary struct { whisperPath string modelPath string lang string - ctx context.Context - cancel context.CancelFunc - mu sync.Mutex - recording bool - audioBuffer []int16 + // Per-recording fields (protected by mu) + mu sync.Mutex + recording bool + tempFile string + ctx context.Context + cancel context.CancelFunc + cmd *exec.Cmd + cmdMu sync.Mutex +} + +func (w *WhisperBinary) StartRecording() error { + w.mu.Lock() + defer w.mu.Unlock() + if w.recording { + return errors.New("recording is already in progress") + } + // Fresh context for this recording + ctx, cancel := context.WithCancel(context.Background()) + w.ctx = ctx + w.cancel = cancel + // Create temporary file + tempFile, err := os.CreateTemp("", "recording_*.wav") + if err != nil { + cancel() + return fmt.Errorf("failed to create temp file: %w", err) + } + tempFile.Close() + w.tempFile = tempFile.Name() + // ffmpeg command: capture from default microphone, write WAV + args := []string{ + "-f", "alsa", // or "pulse" if preferred + "-i", "default", + "-acodec", "pcm_s16le", + "-ar", "16000", + "-ac", "1", + "-y", // overwrite output file + w.tempFile, + } + cmd := exec.CommandContext(w.ctx, "ffmpeg", args...) 
+ // Capture stderr for debugging (optional, but useful for diagnosing) + stderr, err := cmd.StderrPipe() + if err != nil { + cancel() + os.Remove(w.tempFile) + return fmt.Errorf("failed to create stderr pipe: %w", err) + } + go func() { + buf := make([]byte, 1024) + for { + n, err := stderr.Read(buf) + if n > 0 { + w.logger.Debug("ffmpeg stderr", "output", string(buf[:n])) + } + if err != nil { + break + } + } + }() + w.cmdMu.Lock() + w.cmd = cmd + w.cmdMu.Unlock() + if err := cmd.Start(); err != nil { + cancel() + os.Remove(w.tempFile) + return fmt.Errorf("failed to start ffmpeg: %w", err) + } + w.recording = true + w.logger.Debug("Recording started", "file", w.tempFile) + return nil +} + +func (w *WhisperBinary) StopRecording() (string, error) { + w.mu.Lock() + defer w.mu.Unlock() + if !w.recording { + return "", errors.New("not currently recording") + } + w.recording = false + // Gracefully stop ffmpeg + w.cmdMu.Lock() + if w.cmd != nil && w.cmd.Process != nil { + w.logger.Debug("Sending SIGTERM to ffmpeg") + w.cmd.Process.Signal(syscall.SIGTERM) + // Wait for process to exit (up to 2 seconds) + done := make(chan error, 1) + go func() { + done <- w.cmd.Wait() + }() + select { + case <-done: + w.logger.Debug("ffmpeg exited after SIGTERM") + case <-time.After(2 * time.Second): + w.logger.Warn("ffmpeg did not exit, sending SIGKILL") + w.cmd.Process.Kill() + <-done + } + } + w.cmdMu.Unlock() + // Cancel the per-recording context to release its resources (it is not cancelled earlier in this function) + if w.cancel != nil { + w.cancel() + } + // Validate temp file + if w.tempFile == "" { + return "", errors.New("no recording file") + } + defer os.Remove(w.tempFile) + info, err := os.Stat(w.tempFile) + if err != nil { + return "", fmt.Errorf("failed to stat temp file: %w", err) + } + if info.Size() < 44 { // WAV header is 44 bytes + // Log ffmpeg stderr? Already captured in debug logs.
+ return "", fmt.Errorf("recording file too small (%d bytes), possibly no audio captured", info.Size()) + } + // Run whisper.cpp binary + cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, w.tempFile) + var outBuf, errBuf bytes.Buffer + cmd.Stdout = &outBuf + cmd.Stderr = &errBuf + if err := cmd.Run(); err != nil { + w.logger.Error("whisper binary failed", + "error", err, + "stderr", errBuf.String(), + "file_size", info.Size()) + return "", fmt.Errorf("whisper binary failed: %w (stderr: %s)", err, errBuf.String()) + } + result := strings.TrimRight(outBuf.String(), "\n") + result = specialRE.ReplaceAllString(result, "") + return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil +} + +// IsRecording returns true if a recording is in progress. +func (w *WhisperBinary) IsRecording() bool { + w.mu.Lock() + defer w.mu.Unlock() + return w.recording } func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary { @@ -44,283 +174,3 @@ func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary { cancel: cancel, } } - -func (w *WhisperBinary) StartRecording() error { - w.mu.Lock() - defer w.mu.Unlock() - if w.recording { - return errors.New("recording is already in progress") - } - // If context is cancelled, create a new one for the next recording session - if w.ctx.Err() != nil { - w.logger.Debug("Context cancelled, creating new context") - w.ctx, w.cancel = context.WithCancel(context.Background()) - } - // Temporarily redirect stderr to suppress ALSA warnings during PortAudio init - origStderr, errDup := syscall.Dup(syscall.Stderr) - if errDup != nil { - return fmt.Errorf("failed to dup stderr: %w", errDup) - } - nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0) - if err != nil { - _ = syscall.Close(origStderr) // Close the dup'd fd if open fails - return fmt.Errorf("failed to open /dev/null: %w", err) - } - // redirect stderr - _ = syscall.Dup2(nullFD, syscall.Stderr) - // Initialize PortAudio 
(this is where ALSA warnings occur) - portaudioErr := portaudio.Initialize() - defer func() { - // Restore stderr - _ = syscall.Dup2(origStderr, syscall.Stderr) - _ = syscall.Close(origStderr) - _ = syscall.Close(nullFD) - }() - if portaudioErr != nil { - return fmt.Errorf("portaudio init failed: %w", portaudioErr) - } - // Initialize audio buffer - w.audioBuffer = make([]int16, 0) - in := make([]int16, 1024) // buffer size - stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in) - if err != nil { - if paErr := portaudio.Terminate(); paErr != nil { - return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr) - } - return fmt.Errorf("failed to open microphone: %w", err) - } - go w.recordAudio(stream, in) - w.recording = true - w.logger.Debug("Recording started") - return nil -} - -func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) { - defer func() { - w.logger.Debug("recordAudio defer function called") - _ = stream.Stop() // Stop the stream - _ = portaudio.Terminate() // ignoring error as we're shutting down - w.logger.Debug("recordAudio terminated") - }() - w.logger.Debug("Starting audio stream") - if err := stream.Start(); err != nil { - w.logger.Error("Failed to start audio stream", "error", err) - return - } - w.logger.Debug("Audio stream started, entering recording loop") - for { - select { - case <-w.ctx.Done(): - w.logger.Debug("Context done, exiting recording loop") - return - default: - // Check recording status with minimal lock time - w.mu.Lock() - recording := w.recording - w.mu.Unlock() - - if !recording { - w.logger.Debug("Recording flag is false, exiting recording loop") - return - } - if err := stream.Read(); err != nil { - w.logger.Error("Error reading from stream", "error", err) - return - } - // Append samples to buffer - only acquire lock when necessary - w.mu.Lock() - if w.audioBuffer == nil { - w.audioBuffer = make([]int16, 0) - } - // Make a copy of the input buffer to avoid 
overwriting - tempBuffer := make([]int16, len(in)) - copy(tempBuffer, in) - w.audioBuffer = append(w.audioBuffer, tempBuffer...) - w.mu.Unlock() - } - } -} - -func (w *WhisperBinary) StopRecording() (string, error) { - w.logger.Debug("StopRecording called") - w.mu.Lock() - if !w.recording { - w.mu.Unlock() - return "", errors.New("not currently recording") - } - w.logger.Debug("Setting recording to false and cancelling context") - w.recording = false - w.cancel() // This will stop the recording goroutine - w.mu.Unlock() - // // Small delay to allow the recording goroutine to react to context cancellation - // time.Sleep(20 * time.Millisecond) - // Save the recorded audio to a temporary file - tempFile, err := w.saveAudioToTempFile() - if err != nil { - w.logger.Error("Error saving audio to temp file", "error", err) - return "", fmt.Errorf("failed to save audio to temp file: %w", err) - } - w.logger.Debug("Saved audio to temp file", "file", tempFile) - // Run the whisper binary with a separate context to avoid cancellation during transcription - cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile, "2>/dev/null") - var outBuf bytes.Buffer - cmd.Stdout = &outBuf - // Redirect stderr to suppress ALSA warnings and other stderr output - cmd.Stderr = io.Discard // Suppress stderr output from whisper binary - w.logger.Debug("Running whisper binary command") - if err := cmd.Run(); err != nil { - // Clean up audio buffer - w.mu.Lock() - w.audioBuffer = nil - w.mu.Unlock() - // Since we're suppressing stderr, we'll just log that the command failed - w.logger.Error("Error running whisper binary", "error", err) - return "", fmt.Errorf("whisper binary failed: %w", err) - } - result := outBuf.String() - w.logger.Debug("Whisper binary completed", "result", result) - // Clean up audio buffer - w.mu.Lock() - w.audioBuffer = nil - w.mu.Unlock() - // Clean up the temporary file after transcription - w.logger.Debug("StopRecording completed") - 
os.Remove(tempFile) - result = strings.TrimRight(result, "\n") - // in case there are special tokens like [_BEG_] - result = specialRE.ReplaceAllString(result, "") - return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil -} - -// saveAudioToTempFile saves the recorded audio data to a temporary WAV file -func (w *WhisperBinary) saveAudioToTempFile() (string, error) { - w.logger.Debug("saveAudioToTempFile called") - // Create temporary WAV file - tempFile, err := os.CreateTemp("", "recording_*.wav") - if err != nil { - w.logger.Error("Failed to create temp file", "error", err) - return "", fmt.Errorf("failed to create temp file: %w", err) - } - w.logger.Debug("Created temp file", "file", tempFile.Name()) - defer tempFile.Close() - - // Write WAV header and data - w.logger.Debug("About to write WAV file", "file", tempFile.Name()) - err = w.writeWAVFile(tempFile.Name()) - if err != nil { - w.logger.Error("Error writing WAV file", "error", err) - return "", fmt.Errorf("failed to write WAV file: %w", err) - } - w.logger.Debug("WAV file written successfully", "file", tempFile.Name()) - - return tempFile.Name(), nil -} - -// writeWAVFile creates a WAV file from the recorded audio data -func (w *WhisperBinary) writeWAVFile(filename string) error { - w.logger.Debug("writeWAVFile called", "filename", filename) - // Open file for writing - file, err := os.Create(filename) - if err != nil { - w.logger.Error("Error creating file", "error", err) - return err - } - defer file.Close() - - w.logger.Debug("About to acquire mutex in writeWAVFile") - w.mu.Lock() - w.logger.Debug("Locked mutex, copying audio buffer") - audioData := make([]int16, len(w.audioBuffer)) - copy(audioData, w.audioBuffer) - w.mu.Unlock() - w.logger.Debug("Unlocked mutex", "audio_data_length", len(audioData)) - - if len(audioData) == 0 { - w.logger.Warn("No audio data to write") - return errors.New("no audio data to write") - } - - // Calculate data size (number of samples * size of int16) - 
dataSize := len(audioData) * 2 // 2 bytes per int16 sample - w.logger.Debug("Calculated data size", "size", dataSize) - - // Write WAV header with the correct data size - header := w.createWAVHeader(16000, 1, 16, dataSize) - _, err = file.Write(header) - if err != nil { - w.logger.Error("Error writing WAV header", "error", err) - return err - } - w.logger.Debug("WAV header written successfully") - - // Write audio data - w.logger.Debug("About to write audio data samples") - for i, sample := range audioData { - // Write little-endian 16-bit sample - _, err := file.Write([]byte{byte(sample), byte(sample >> 8)}) - if err != nil { - w.logger.Error("Error writing sample", "index", i, "error", err) - return err - } - // Log progress every 10000 samples to avoid too much output - if i%10000 == 0 { - w.logger.Debug("Written samples", "count", i) - } - } - w.logger.Debug("All audio data written successfully") - - return nil -} - -// createWAVHeader creates a WAV file header -func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte { - header := make([]byte, 44) - copy(header[0:4], "RIFF") - // Total file size will be updated later - copy(header[8:12], "WAVE") - copy(header[12:16], "fmt ") - // fmt chunk size (16 for PCM) - header[16] = 16 - header[17] = 0 - header[18] = 0 - header[19] = 0 - // Audio format (1 = PCM) - header[20] = 1 - header[21] = 0 - // Number of channels - header[22] = byte(channels) - header[23] = 0 - // Sample rate - header[24] = byte(sampleRate) - header[25] = byte(sampleRate >> 8) - header[26] = byte(sampleRate >> 16) - header[27] = byte(sampleRate >> 24) - // Byte rate - byteRate := sampleRate * channels * bitsPerSample / 8 - header[28] = byte(byteRate) - header[29] = byte(byteRate >> 8) - header[30] = byte(byteRate >> 16) - header[31] = byte(byteRate >> 24) - // Block align - blockAlign := channels * bitsPerSample / 8 - header[32] = byte(blockAlign) - header[33] = 0 - // Bits per sample - header[34] = 
byte(bitsPerSample) - header[35] = 0 - // "data" subchunk - copy(header[36:40], "data") - // Data size - header[40] = byte(dataSize) - header[41] = byte(dataSize >> 8) - header[42] = byte(dataSize >> 16) - header[43] = byte(dataSize >> 24) - - return header -} - -func (w *WhisperBinary) IsRecording() bool { - w.mu.Lock() - defer w.mu.Unlock() - return w.recording -} diff --git a/extra/whisper_server.go b/extra/whisper_server.go new file mode 100644 index 0000000..7532f4a --- /dev/null +++ b/extra/whisper_server.go @@ -0,0 +1,156 @@ +//go:build extra +// +build extra + +package extra + +import ( + "bytes" + "errors" + "fmt" + "io" + "log/slog" + "mime/multipart" + "net/http" + "os/exec" + "strings" + "sync" +) + +type WhisperServer struct { + logger *slog.Logger + ServerURL string + SampleRate int + AudioBuffer *bytes.Buffer + recording bool // protected by mu + mu sync.Mutex // protects recording & AudioBuffer + cmd *exec.Cmd // protected by cmdMu + stopCh chan struct{} // protected by cmdMu + cmdMu sync.Mutex // protects cmd and stopCh +} + +func (stt *WhisperServer) StartRecording() error { + stt.mu.Lock() + defer stt.mu.Unlock() + if stt.recording { + return nil + } + // Build ffmpeg command for microphone capture + args := []string{ + "-f", "alsa", + "-i", "default", + "-acodec", "pcm_s16le", + "-ar", fmt.Sprint(stt.SampleRate), + "-ac", "1", + "-f", "s16le", + "-", + } + cmd := exec.Command("ffmpeg", args...) 
+ stdout, err := cmd.StdoutPipe() + if err != nil { + return fmt.Errorf("failed to get stdout pipe: %w", err) + } + stt.cmdMu.Lock() + stt.cmd = cmd + stt.stopCh = make(chan struct{}) + stt.cmdMu.Unlock() + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start ffmpeg: %w", err) + } + stt.recording = true + stt.AudioBuffer.Reset() + // Read PCM data in goroutine + go func() { + buf := make([]byte, 4096) + for { + select { + case <-stt.stopCh: + return + default: + n, err := stdout.Read(buf) + if n > 0 { + stt.mu.Lock() + stt.AudioBuffer.Write(buf[:n]) + stt.mu.Unlock() + } + if err != nil { + if err != io.EOF { + stt.logger.Error("recording read error", "error", err) + } + return + } + } + } + }() + return nil +} + +func (stt *WhisperServer) StopRecording() (string, error) { + stt.mu.Lock() + defer stt.mu.Unlock() + if !stt.recording { + return "", errors.New("not recording") + } + stt.recording = false + // Stop ffmpeg + stt.cmdMu.Lock() + if stt.cmd != nil && stt.cmd.Process != nil { + stt.cmd.Process.Kill() + stt.cmd.Wait() + } + close(stt.stopCh) + stt.cmdMu.Unlock() + // Package the captured PCM as a WAV file in a multipart form + // and upload it to the whisper server for transcription. + stt.recording = false + // wait loop to finish?
+ if stt.AudioBuffer == nil { + err := errors.New("unexpected nil AudioBuffer") + stt.logger.Error(err.Error()) + return "", err + } + // Create WAV header first + body := &bytes.Buffer{} + writer := multipart.NewWriter(body) + // Add audio file part + part, err := writer.CreateFormFile("file", "recording.wav") + if err != nil { + stt.logger.Error("fn: StopRecording", "error", err) + return "", err + } + // Stream directly to multipart writer: header + raw data + dataSize := stt.AudioBuffer.Len() + stt.writeWavHeader(part, dataSize) + if _, err := io.Copy(part, stt.AudioBuffer); err != nil { + stt.logger.Error("fn: StopRecording", "error", err) + return "", err + } + // Reset buffer for next recording + stt.AudioBuffer.Reset() + // Add response format field + err = writer.WriteField("response_format", "text") + if err != nil { + stt.logger.Error("fn: StopRecording", "error", err) + return "", err + } + if writer.Close() != nil { + stt.logger.Error("fn: StopRecording", "error", err) + return "", err + } + // Send request + resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx + if err != nil { + stt.logger.Error("fn: StopRecording", "error", err) + return "", err + } + defer resp.Body.Close() + // Read and print response + responseTextBytes, err := io.ReadAll(resp.Body) + if err != nil { + stt.logger.Error("fn: StopRecording", "error", err) + return "", err + } + resptext := strings.TrimRight(string(responseTextBytes), "\n") + // in case there are special tokens like [_BEG_] + resptext = specialRE.ReplaceAllString(resptext, "") + return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil +} diff --git a/go.mod b/go.mod index 17609a4..615390f 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/PuerkitoBio/goquery v1.11.0 github.com/gdamore/tcell/v2 v2.13.2 github.com/glebarez/go-sqlite v1.22.0 - github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b github.com/jmoiron/sqlx v1.4.0 
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 github.com/neurosnap/sentences v1.1.2 diff --git a/go.sum b/go.sum index 565947e..6c36a06 100644 --- a/go.sum +++ b/go.sum @@ -37,8 +37,6 @@ github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17k github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b h1:WEuQWBxelOGHA6z9lABqaMLMrfwVyMdN3UgRLT+YUPo= -github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco= github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68= github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo= github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=