Dep (stt): use ffmpeg instead of portaudio
This commit is contained in:
7
Makefile
7
Makefile
@@ -143,11 +143,10 @@ build-whisper: ## Build whisper.cpp from source in batteries directory
|
|||||||
|
|
||||||
download-whisper-model: ## Download Whisper model for STT in batteries directory
|
download-whisper-model: ## Download Whisper model for STT in batteries directory
|
||||||
@echo "Downloading Whisper model for STT..."
|
@echo "Downloading Whisper model for STT..."
|
||||||
@if [ ! -d "batteries/whisper.cpp" ]; then \
|
@if [ ! -d "batteries/whisper.cpp/models" ]; then \
|
||||||
echo "Please run 'make setup-whisper' first to clone the repository."; \
|
mkdir -p "batteries/whisper.cpp/models"; \
|
||||||
exit 1; \
|
|
||||||
fi
|
fi
|
||||||
@cd batteries/whisper.cpp && bash ./models/download-ggml-model.sh large-v3-turbo-q5_0
|
curl -o batteries/whisper.cpp/models/ggml-large-v3-turbo-q5_0.bin -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin?download=true"
|
||||||
@echo "Whisper model downloaded successfully!"
|
@echo "Whisper model downloaded successfully!"
|
||||||
|
|
||||||
# Docker targets for STT/TTS services (in batteries directory)
|
# Docker targets for STT/TTS services (in batteries directory)
|
||||||
|
|||||||
@@ -6,19 +6,27 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "8081:8081"
|
- "8081:8081"
|
||||||
volumes:
|
volumes:
|
||||||
- whisper_models:/app/models
|
- ./whisper.cpp/models/ggml-large-v3-turbo-q5_0.bin:/app/models/ggml-large-v3-turbo-q5_0.bin
|
||||||
working_dir: /app
|
working_dir: /app
|
||||||
entrypoint: ""
|
entrypoint: ""
|
||||||
command: >
|
command: >
|
||||||
sh -c "
|
sh -c "
|
||||||
if [ ! -f /app/models/ggml-large-v3-turbo.bin ]; then
|
if [ ! -f /app/models/ggml-large-v3-turbo-q5_0.bin ]; then
|
||||||
echo 'Downloading ggml-large-v3-turbo model...'
|
echo 'Downloading ggml-large-v3-turbo-q5_0 model...'
|
||||||
./download-ggml-model.sh large-v3-turbo /app/models
|
curl -o /app/models/ggml-large-v3-turbo-q5_0.bin -L "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin?download=true"
|
||||||
fi &&
|
fi &&
|
||||||
./build/bin/whisper-server -m /app/models/ggml-large-v3-turbo.bin -t 4 -p 1 --port 8081 --host 0.0.0.0
|
./build/bin/whisper-server -m /app/models/ggml-large-v3-turbo-q5_0.bin -t 4 -p 1 --port 8081 --host 0.0.0.0
|
||||||
"
|
"
|
||||||
environment:
|
environment:
|
||||||
- WHISPER_LOG_LEVEL=3
|
- WHISPER_LOG_LEVEL=3
|
||||||
|
# GPU support: the deploy block below reserves one NVIDIA GPU for this service; comment it out on hosts without one.
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: 1
|
||||||
|
capabilities: [gpu]
|
||||||
# Restart policy in case the service fails
|
# Restart policy in case the service fails
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
@@ -45,7 +53,5 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
models:
|
models:
|
||||||
driver: local
|
driver: local
|
||||||
audio:
|
|
||||||
driver: local
|
|
||||||
whisper_models:
|
whisper_models:
|
||||||
driver: local
|
driver: local
|
||||||
|
|||||||
132
extra/stt.go
132
extra/stt.go
@@ -6,18 +6,10 @@ package extra
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"gf-lt/config"
|
"gf-lt/config"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"mime/multipart"
|
|
||||||
"net/http"
|
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
|
||||||
"syscall"
|
|
||||||
|
|
||||||
"github.com/gordonklaus/portaudio"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var specialRE = regexp.MustCompile(`\[.*?\]`)
|
var specialRE = regexp.MustCompile(`\[.*?\]`)
|
||||||
@@ -44,14 +36,6 @@ func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
|
|||||||
return NewWhisperServer(logger, cfg)
|
return NewWhisperServer(logger, cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
type WhisperServer struct {
|
|
||||||
logger *slog.Logger
|
|
||||||
ServerURL string
|
|
||||||
SampleRate int
|
|
||||||
AudioBuffer *bytes.Buffer
|
|
||||||
recording bool
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
|
func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
|
||||||
return &WhisperServer{
|
return &WhisperServer{
|
||||||
logger: logger,
|
logger: logger,
|
||||||
@@ -61,69 +45,6 @@ func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (stt *WhisperServer) StartRecording() error {
|
|
||||||
if err := stt.microphoneStream(stt.SampleRate); err != nil {
|
|
||||||
return fmt.Errorf("failed to init microphone: %w", err)
|
|
||||||
}
|
|
||||||
stt.recording = true
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (stt *WhisperServer) StopRecording() (string, error) {
|
|
||||||
stt.recording = false
|
|
||||||
// wait loop to finish?
|
|
||||||
if stt.AudioBuffer == nil {
|
|
||||||
err := errors.New("unexpected nil AudioBuffer")
|
|
||||||
stt.logger.Error(err.Error())
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
// Create WAV header first
|
|
||||||
body := &bytes.Buffer{}
|
|
||||||
writer := multipart.NewWriter(body)
|
|
||||||
// Add audio file part
|
|
||||||
part, err := writer.CreateFormFile("file", "recording.wav")
|
|
||||||
if err != nil {
|
|
||||||
stt.logger.Error("fn: StopRecording", "error", err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
// Stream directly to multipart writer: header + raw data
|
|
||||||
dataSize := stt.AudioBuffer.Len()
|
|
||||||
stt.writeWavHeader(part, dataSize)
|
|
||||||
if _, err := io.Copy(part, stt.AudioBuffer); err != nil {
|
|
||||||
stt.logger.Error("fn: StopRecording", "error", err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
// Reset buffer for next recording
|
|
||||||
stt.AudioBuffer.Reset()
|
|
||||||
// Add response format field
|
|
||||||
err = writer.WriteField("response_format", "text")
|
|
||||||
if err != nil {
|
|
||||||
stt.logger.Error("fn: StopRecording", "error", err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if writer.Close() != nil {
|
|
||||||
stt.logger.Error("fn: StopRecording", "error", err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
// Send request
|
|
||||||
resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx
|
|
||||||
if err != nil {
|
|
||||||
stt.logger.Error("fn: StopRecording", "error", err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
// Read and print response
|
|
||||||
responseTextBytes, err := io.ReadAll(resp.Body)
|
|
||||||
if err != nil {
|
|
||||||
stt.logger.Error("fn: StopRecording", "error", err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
resptext := strings.TrimRight(string(responseTextBytes), "\n")
|
|
||||||
// in case there are special tokens like [_BEG_]
|
|
||||||
resptext = specialRE.ReplaceAllString(resptext, "")
|
|
||||||
return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
|
func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
|
||||||
header := make([]byte, 44)
|
header := make([]byte, 44)
|
||||||
copy(header[0:4], "RIFF")
|
copy(header[0:4], "RIFF")
|
||||||
@@ -147,56 +68,3 @@ func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
|
|||||||
func (stt *WhisperServer) IsRecording() bool {
|
func (stt *WhisperServer) IsRecording() bool {
|
||||||
return stt.recording
|
return stt.recording
|
||||||
}
|
}
|
||||||
|
|
||||||
func (stt *WhisperServer) microphoneStream(sampleRate int) error {
|
|
||||||
// Temporarily redirect stderr to suppress ALSA warnings during PortAudio init
|
|
||||||
origStderr, errDup := syscall.Dup(syscall.Stderr)
|
|
||||||
if errDup != nil {
|
|
||||||
return fmt.Errorf("failed to dup stderr: %w", errDup)
|
|
||||||
}
|
|
||||||
nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
|
|
||||||
if err != nil {
|
|
||||||
_ = syscall.Close(origStderr) // Close the dup'd fd if open fails
|
|
||||||
return fmt.Errorf("failed to open /dev/null: %w", err)
|
|
||||||
}
|
|
||||||
// redirect stderr
|
|
||||||
_ = syscall.Dup2(nullFD, syscall.Stderr)
|
|
||||||
// Initialize PortAudio (this is where ALSA warnings occur)
|
|
||||||
defer func() {
|
|
||||||
// Restore stderr
|
|
||||||
_ = syscall.Dup2(origStderr, syscall.Stderr)
|
|
||||||
_ = syscall.Close(origStderr)
|
|
||||||
_ = syscall.Close(nullFD)
|
|
||||||
}()
|
|
||||||
if err := portaudio.Initialize(); err != nil {
|
|
||||||
return fmt.Errorf("portaudio init failed: %w", err)
|
|
||||||
}
|
|
||||||
in := make([]int16, 64)
|
|
||||||
stream, err := portaudio.OpenDefaultStream(1, 0, float64(sampleRate), len(in), in)
|
|
||||||
if err != nil {
|
|
||||||
if paErr := portaudio.Terminate(); paErr != nil {
|
|
||||||
return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
|
|
||||||
}
|
|
||||||
return fmt.Errorf("failed to open microphone: %w", err)
|
|
||||||
}
|
|
||||||
go func(stream *portaudio.Stream) {
|
|
||||||
if err := stream.Start(); err != nil {
|
|
||||||
stt.logger.Error("microphoneStream", "error", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
for {
|
|
||||||
if !stt.IsRecording() {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if err := stream.Read(); err != nil {
|
|
||||||
stt.logger.Error("reading stream", "error", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if err := binary.Write(stt.AudioBuffer, binary.LittleEndian, in); err != nil {
|
|
||||||
stt.logger.Error("writing to buffer", "error", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}(stream)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -9,15 +9,13 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"gf-lt/config"
|
"gf-lt/config"
|
||||||
"io"
|
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
"github.com/gordonklaus/portaudio"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type WhisperBinary struct {
|
type WhisperBinary struct {
|
||||||
@@ -25,11 +23,143 @@ type WhisperBinary struct {
|
|||||||
whisperPath string
|
whisperPath string
|
||||||
modelPath string
|
modelPath string
|
||||||
lang string
|
lang string
|
||||||
ctx context.Context
|
// Per-recording fields (protected by mu)
|
||||||
cancel context.CancelFunc
|
mu sync.Mutex
|
||||||
mu sync.Mutex
|
recording bool
|
||||||
recording bool
|
tempFile string
|
||||||
audioBuffer []int16
|
ctx context.Context
|
||||||
|
cancel context.CancelFunc
|
||||||
|
cmd *exec.Cmd
|
||||||
|
cmdMu sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WhisperBinary) StartRecording() error {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
if w.recording {
|
||||||
|
return errors.New("recording is already in progress")
|
||||||
|
}
|
||||||
|
// Fresh context for this recording
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
w.ctx = ctx
|
||||||
|
w.cancel = cancel
|
||||||
|
// Create temporary file
|
||||||
|
tempFile, err := os.CreateTemp("", "recording_*.wav")
|
||||||
|
if err != nil {
|
||||||
|
cancel()
|
||||||
|
return fmt.Errorf("failed to create temp file: %w", err)
|
||||||
|
}
|
||||||
|
tempFile.Close()
|
||||||
|
w.tempFile = tempFile.Name()
|
||||||
|
// ffmpeg command: capture from default microphone, write WAV
|
||||||
|
args := []string{
|
||||||
|
"-f", "alsa", // or "pulse" if preferred
|
||||||
|
"-i", "default",
|
||||||
|
"-acodec", "pcm_s16le",
|
||||||
|
"-ar", "16000",
|
||||||
|
"-ac", "1",
|
||||||
|
"-y", // overwrite output file
|
||||||
|
w.tempFile,
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(w.ctx, "ffmpeg", args...)
|
||||||
|
// Capture stderr for debugging (optional, but useful for diagnosing)
|
||||||
|
stderr, err := cmd.StderrPipe()
|
||||||
|
if err != nil {
|
||||||
|
cancel()
|
||||||
|
os.Remove(w.tempFile)
|
||||||
|
return fmt.Errorf("failed to create stderr pipe: %w", err)
|
||||||
|
}
|
||||||
|
go func() {
|
||||||
|
buf := make([]byte, 1024)
|
||||||
|
for {
|
||||||
|
n, err := stderr.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
w.logger.Debug("ffmpeg stderr", "output", string(buf[:n]))
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
w.cmdMu.Lock()
|
||||||
|
w.cmd = cmd
|
||||||
|
w.cmdMu.Unlock()
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
cancel()
|
||||||
|
os.Remove(w.tempFile)
|
||||||
|
return fmt.Errorf("failed to start ffmpeg: %w", err)
|
||||||
|
}
|
||||||
|
w.recording = true
|
||||||
|
w.logger.Debug("Recording started", "file", w.tempFile)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WhisperBinary) StopRecording() (string, error) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
if !w.recording {
|
||||||
|
return "", errors.New("not currently recording")
|
||||||
|
}
|
||||||
|
w.recording = false
|
||||||
|
// Gracefully stop ffmpeg
|
||||||
|
w.cmdMu.Lock()
|
||||||
|
if w.cmd != nil && w.cmd.Process != nil {
|
||||||
|
w.logger.Debug("Sending SIGTERM to ffmpeg")
|
||||||
|
w.cmd.Process.Signal(syscall.SIGTERM)
|
||||||
|
// Wait for process to exit (up to 2 seconds)
|
||||||
|
done := make(chan error, 1)
|
||||||
|
go func() {
|
||||||
|
done <- w.cmd.Wait()
|
||||||
|
}()
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
w.logger.Debug("ffmpeg exited after SIGTERM")
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
w.logger.Warn("ffmpeg did not exit, sending SIGKILL")
|
||||||
|
w.cmd.Process.Kill()
|
||||||
|
<-done
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.cmdMu.Unlock()
|
||||||
|
// Cancel context (already done, but for cleanliness)
|
||||||
|
if w.cancel != nil {
|
||||||
|
w.cancel()
|
||||||
|
}
|
||||||
|
// Validate temp file
|
||||||
|
if w.tempFile == "" {
|
||||||
|
return "", errors.New("no recording file")
|
||||||
|
}
|
||||||
|
defer os.Remove(w.tempFile)
|
||||||
|
info, err := os.Stat(w.tempFile)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to stat temp file: %w", err)
|
||||||
|
}
|
||||||
|
if info.Size() < 44 { // WAV header is 44 bytes
|
||||||
|
// Log ffmpeg stderr? Already captured in debug logs.
|
||||||
|
return "", fmt.Errorf("recording file too small (%d bytes), possibly no audio captured", info.Size())
|
||||||
|
}
|
||||||
|
// Run whisper.cpp binary
|
||||||
|
cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, w.tempFile)
|
||||||
|
var outBuf, errBuf bytes.Buffer
|
||||||
|
cmd.Stdout = &outBuf
|
||||||
|
cmd.Stderr = &errBuf
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
w.logger.Error("whisper binary failed",
|
||||||
|
"error", err,
|
||||||
|
"stderr", errBuf.String(),
|
||||||
|
"file_size", info.Size())
|
||||||
|
return "", fmt.Errorf("whisper binary failed: %w (stderr: %s)", err, errBuf.String())
|
||||||
|
}
|
||||||
|
result := strings.TrimRight(outBuf.String(), "\n")
|
||||||
|
result = specialRE.ReplaceAllString(result, "")
|
||||||
|
return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsRecording returns true if a recording is in progress.
|
||||||
|
func (w *WhisperBinary) IsRecording() bool {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.recording
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
||||||
@@ -44,283 +174,3 @@ func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
|
|||||||
cancel: cancel,
|
cancel: cancel,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *WhisperBinary) StartRecording() error {
|
|
||||||
w.mu.Lock()
|
|
||||||
defer w.mu.Unlock()
|
|
||||||
if w.recording {
|
|
||||||
return errors.New("recording is already in progress")
|
|
||||||
}
|
|
||||||
// If context is cancelled, create a new one for the next recording session
|
|
||||||
if w.ctx.Err() != nil {
|
|
||||||
w.logger.Debug("Context cancelled, creating new context")
|
|
||||||
w.ctx, w.cancel = context.WithCancel(context.Background())
|
|
||||||
}
|
|
||||||
// Temporarily redirect stderr to suppress ALSA warnings during PortAudio init
|
|
||||||
origStderr, errDup := syscall.Dup(syscall.Stderr)
|
|
||||||
if errDup != nil {
|
|
||||||
return fmt.Errorf("failed to dup stderr: %w", errDup)
|
|
||||||
}
|
|
||||||
nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
|
|
||||||
if err != nil {
|
|
||||||
_ = syscall.Close(origStderr) // Close the dup'd fd if open fails
|
|
||||||
return fmt.Errorf("failed to open /dev/null: %w", err)
|
|
||||||
}
|
|
||||||
// redirect stderr
|
|
||||||
_ = syscall.Dup2(nullFD, syscall.Stderr)
|
|
||||||
// Initialize PortAudio (this is where ALSA warnings occur)
|
|
||||||
portaudioErr := portaudio.Initialize()
|
|
||||||
defer func() {
|
|
||||||
// Restore stderr
|
|
||||||
_ = syscall.Dup2(origStderr, syscall.Stderr)
|
|
||||||
_ = syscall.Close(origStderr)
|
|
||||||
_ = syscall.Close(nullFD)
|
|
||||||
}()
|
|
||||||
if portaudioErr != nil {
|
|
||||||
return fmt.Errorf("portaudio init failed: %w", portaudioErr)
|
|
||||||
}
|
|
||||||
// Initialize audio buffer
|
|
||||||
w.audioBuffer = make([]int16, 0)
|
|
||||||
in := make([]int16, 1024) // buffer size
|
|
||||||
stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in)
|
|
||||||
if err != nil {
|
|
||||||
if paErr := portaudio.Terminate(); paErr != nil {
|
|
||||||
return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
|
|
||||||
}
|
|
||||||
return fmt.Errorf("failed to open microphone: %w", err)
|
|
||||||
}
|
|
||||||
go w.recordAudio(stream, in)
|
|
||||||
w.recording = true
|
|
||||||
w.logger.Debug("Recording started")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) {
|
|
||||||
defer func() {
|
|
||||||
w.logger.Debug("recordAudio defer function called")
|
|
||||||
_ = stream.Stop() // Stop the stream
|
|
||||||
_ = portaudio.Terminate() // ignoring error as we're shutting down
|
|
||||||
w.logger.Debug("recordAudio terminated")
|
|
||||||
}()
|
|
||||||
w.logger.Debug("Starting audio stream")
|
|
||||||
if err := stream.Start(); err != nil {
|
|
||||||
w.logger.Error("Failed to start audio stream", "error", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
w.logger.Debug("Audio stream started, entering recording loop")
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-w.ctx.Done():
|
|
||||||
w.logger.Debug("Context done, exiting recording loop")
|
|
||||||
return
|
|
||||||
default:
|
|
||||||
// Check recording status with minimal lock time
|
|
||||||
w.mu.Lock()
|
|
||||||
recording := w.recording
|
|
||||||
w.mu.Unlock()
|
|
||||||
|
|
||||||
if !recording {
|
|
||||||
w.logger.Debug("Recording flag is false, exiting recording loop")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if err := stream.Read(); err != nil {
|
|
||||||
w.logger.Error("Error reading from stream", "error", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Append samples to buffer - only acquire lock when necessary
|
|
||||||
w.mu.Lock()
|
|
||||||
if w.audioBuffer == nil {
|
|
||||||
w.audioBuffer = make([]int16, 0)
|
|
||||||
}
|
|
||||||
// Make a copy of the input buffer to avoid overwriting
|
|
||||||
tempBuffer := make([]int16, len(in))
|
|
||||||
copy(tempBuffer, in)
|
|
||||||
w.audioBuffer = append(w.audioBuffer, tempBuffer...)
|
|
||||||
w.mu.Unlock()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (w *WhisperBinary) StopRecording() (string, error) {
|
|
||||||
w.logger.Debug("StopRecording called")
|
|
||||||
w.mu.Lock()
|
|
||||||
if !w.recording {
|
|
||||||
w.mu.Unlock()
|
|
||||||
return "", errors.New("not currently recording")
|
|
||||||
}
|
|
||||||
w.logger.Debug("Setting recording to false and cancelling context")
|
|
||||||
w.recording = false
|
|
||||||
w.cancel() // This will stop the recording goroutine
|
|
||||||
w.mu.Unlock()
|
|
||||||
// // Small delay to allow the recording goroutine to react to context cancellation
|
|
||||||
// time.Sleep(20 * time.Millisecond)
|
|
||||||
// Save the recorded audio to a temporary file
|
|
||||||
tempFile, err := w.saveAudioToTempFile()
|
|
||||||
if err != nil {
|
|
||||||
w.logger.Error("Error saving audio to temp file", "error", err)
|
|
||||||
return "", fmt.Errorf("failed to save audio to temp file: %w", err)
|
|
||||||
}
|
|
||||||
w.logger.Debug("Saved audio to temp file", "file", tempFile)
|
|
||||||
// Run the whisper binary with a separate context to avoid cancellation during transcription
|
|
||||||
cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile, "2>/dev/null")
|
|
||||||
var outBuf bytes.Buffer
|
|
||||||
cmd.Stdout = &outBuf
|
|
||||||
// Redirect stderr to suppress ALSA warnings and other stderr output
|
|
||||||
cmd.Stderr = io.Discard // Suppress stderr output from whisper binary
|
|
||||||
w.logger.Debug("Running whisper binary command")
|
|
||||||
if err := cmd.Run(); err != nil {
|
|
||||||
// Clean up audio buffer
|
|
||||||
w.mu.Lock()
|
|
||||||
w.audioBuffer = nil
|
|
||||||
w.mu.Unlock()
|
|
||||||
// Since we're suppressing stderr, we'll just log that the command failed
|
|
||||||
w.logger.Error("Error running whisper binary", "error", err)
|
|
||||||
return "", fmt.Errorf("whisper binary failed: %w", err)
|
|
||||||
}
|
|
||||||
result := outBuf.String()
|
|
||||||
w.logger.Debug("Whisper binary completed", "result", result)
|
|
||||||
// Clean up audio buffer
|
|
||||||
w.mu.Lock()
|
|
||||||
w.audioBuffer = nil
|
|
||||||
w.mu.Unlock()
|
|
||||||
// Clean up the temporary file after transcription
|
|
||||||
w.logger.Debug("StopRecording completed")
|
|
||||||
os.Remove(tempFile)
|
|
||||||
result = strings.TrimRight(result, "\n")
|
|
||||||
// in case there are special tokens like [_BEG_]
|
|
||||||
result = specialRE.ReplaceAllString(result, "")
|
|
||||||
return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// saveAudioToTempFile saves the recorded audio data to a temporary WAV file
|
|
||||||
func (w *WhisperBinary) saveAudioToTempFile() (string, error) {
|
|
||||||
w.logger.Debug("saveAudioToTempFile called")
|
|
||||||
// Create temporary WAV file
|
|
||||||
tempFile, err := os.CreateTemp("", "recording_*.wav")
|
|
||||||
if err != nil {
|
|
||||||
w.logger.Error("Failed to create temp file", "error", err)
|
|
||||||
return "", fmt.Errorf("failed to create temp file: %w", err)
|
|
||||||
}
|
|
||||||
w.logger.Debug("Created temp file", "file", tempFile.Name())
|
|
||||||
defer tempFile.Close()
|
|
||||||
|
|
||||||
// Write WAV header and data
|
|
||||||
w.logger.Debug("About to write WAV file", "file", tempFile.Name())
|
|
||||||
err = w.writeWAVFile(tempFile.Name())
|
|
||||||
if err != nil {
|
|
||||||
w.logger.Error("Error writing WAV file", "error", err)
|
|
||||||
return "", fmt.Errorf("failed to write WAV file: %w", err)
|
|
||||||
}
|
|
||||||
w.logger.Debug("WAV file written successfully", "file", tempFile.Name())
|
|
||||||
|
|
||||||
return tempFile.Name(), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// writeWAVFile creates a WAV file from the recorded audio data
|
|
||||||
func (w *WhisperBinary) writeWAVFile(filename string) error {
|
|
||||||
w.logger.Debug("writeWAVFile called", "filename", filename)
|
|
||||||
// Open file for writing
|
|
||||||
file, err := os.Create(filename)
|
|
||||||
if err != nil {
|
|
||||||
w.logger.Error("Error creating file", "error", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
w.logger.Debug("About to acquire mutex in writeWAVFile")
|
|
||||||
w.mu.Lock()
|
|
||||||
w.logger.Debug("Locked mutex, copying audio buffer")
|
|
||||||
audioData := make([]int16, len(w.audioBuffer))
|
|
||||||
copy(audioData, w.audioBuffer)
|
|
||||||
w.mu.Unlock()
|
|
||||||
w.logger.Debug("Unlocked mutex", "audio_data_length", len(audioData))
|
|
||||||
|
|
||||||
if len(audioData) == 0 {
|
|
||||||
w.logger.Warn("No audio data to write")
|
|
||||||
return errors.New("no audio data to write")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate data size (number of samples * size of int16)
|
|
||||||
dataSize := len(audioData) * 2 // 2 bytes per int16 sample
|
|
||||||
w.logger.Debug("Calculated data size", "size", dataSize)
|
|
||||||
|
|
||||||
// Write WAV header with the correct data size
|
|
||||||
header := w.createWAVHeader(16000, 1, 16, dataSize)
|
|
||||||
_, err = file.Write(header)
|
|
||||||
if err != nil {
|
|
||||||
w.logger.Error("Error writing WAV header", "error", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
w.logger.Debug("WAV header written successfully")
|
|
||||||
|
|
||||||
// Write audio data
|
|
||||||
w.logger.Debug("About to write audio data samples")
|
|
||||||
for i, sample := range audioData {
|
|
||||||
// Write little-endian 16-bit sample
|
|
||||||
_, err := file.Write([]byte{byte(sample), byte(sample >> 8)})
|
|
||||||
if err != nil {
|
|
||||||
w.logger.Error("Error writing sample", "index", i, "error", err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// Log progress every 10000 samples to avoid too much output
|
|
||||||
if i%10000 == 0 {
|
|
||||||
w.logger.Debug("Written samples", "count", i)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
w.logger.Debug("All audio data written successfully")
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// createWAVHeader creates a WAV file header
|
|
||||||
func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte {
|
|
||||||
header := make([]byte, 44)
|
|
||||||
copy(header[0:4], "RIFF")
|
|
||||||
// Total file size will be updated later
|
|
||||||
copy(header[8:12], "WAVE")
|
|
||||||
copy(header[12:16], "fmt ")
|
|
||||||
// fmt chunk size (16 for PCM)
|
|
||||||
header[16] = 16
|
|
||||||
header[17] = 0
|
|
||||||
header[18] = 0
|
|
||||||
header[19] = 0
|
|
||||||
// Audio format (1 = PCM)
|
|
||||||
header[20] = 1
|
|
||||||
header[21] = 0
|
|
||||||
// Number of channels
|
|
||||||
header[22] = byte(channels)
|
|
||||||
header[23] = 0
|
|
||||||
// Sample rate
|
|
||||||
header[24] = byte(sampleRate)
|
|
||||||
header[25] = byte(sampleRate >> 8)
|
|
||||||
header[26] = byte(sampleRate >> 16)
|
|
||||||
header[27] = byte(sampleRate >> 24)
|
|
||||||
// Byte rate
|
|
||||||
byteRate := sampleRate * channels * bitsPerSample / 8
|
|
||||||
header[28] = byte(byteRate)
|
|
||||||
header[29] = byte(byteRate >> 8)
|
|
||||||
header[30] = byte(byteRate >> 16)
|
|
||||||
header[31] = byte(byteRate >> 24)
|
|
||||||
// Block align
|
|
||||||
blockAlign := channels * bitsPerSample / 8
|
|
||||||
header[32] = byte(blockAlign)
|
|
||||||
header[33] = 0
|
|
||||||
// Bits per sample
|
|
||||||
header[34] = byte(bitsPerSample)
|
|
||||||
header[35] = 0
|
|
||||||
// "data" subchunk
|
|
||||||
copy(header[36:40], "data")
|
|
||||||
// Data size
|
|
||||||
header[40] = byte(dataSize)
|
|
||||||
header[41] = byte(dataSize >> 8)
|
|
||||||
header[42] = byte(dataSize >> 16)
|
|
||||||
header[43] = byte(dataSize >> 24)
|
|
||||||
|
|
||||||
return header
|
|
||||||
}
|
|
||||||
|
|
||||||
func (w *WhisperBinary) IsRecording() bool {
|
|
||||||
w.mu.Lock()
|
|
||||||
defer w.mu.Unlock()
|
|
||||||
return w.recording
|
|
||||||
}
|
|
||||||
|
|||||||
156
extra/whisper_server.go
Normal file
156
extra/whisper_server.go
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
//go:build extra
|
||||||
|
// +build extra
|
||||||
|
|
||||||
|
package extra
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"mime/multipart"
|
||||||
|
"net/http"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
)
|
||||||
|
|
||||||
|
// WhisperServer records microphone audio through an ffmpeg subprocess and
// transcribes it by posting the capture to the HTTP endpoint at ServerURL.
type WhisperServer struct {
	logger      *slog.Logger
	ServerURL   string
	SampleRate  int
	AudioBuffer *bytes.Buffer
	// recording is protected by mu.
	recording bool
	// mu protects recording & AudioBuffer.
	mu sync.Mutex
	// cmd is protected by cmdMu.
	cmd *exec.Cmd
	// stopCh is protected by cmdMu.
	stopCh chan struct{}
	// cmdMu protects cmd and stopCh.
	cmdMu sync.Mutex
}
|
||||||
|
|
||||||
|
func (stt *WhisperServer) StartRecording() error {
|
||||||
|
stt.mu.Lock()
|
||||||
|
defer stt.mu.Unlock()
|
||||||
|
if stt.recording {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Build ffmpeg command for microphone capture
|
||||||
|
args := []string{
|
||||||
|
"-f", "alsa",
|
||||||
|
"-i", "default",
|
||||||
|
"-acodec", "pcm_s16le",
|
||||||
|
"-ar", fmt.Sprint(stt.SampleRate),
|
||||||
|
"-ac", "1",
|
||||||
|
"-f", "s16le",
|
||||||
|
"-",
|
||||||
|
}
|
||||||
|
cmd := exec.Command("ffmpeg", args...)
|
||||||
|
stdout, err := cmd.StdoutPipe()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to get stdout pipe: %w", err)
|
||||||
|
}
|
||||||
|
stt.cmdMu.Lock()
|
||||||
|
stt.cmd = cmd
|
||||||
|
stt.stopCh = make(chan struct{})
|
||||||
|
stt.cmdMu.Unlock()
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return fmt.Errorf("failed to start ffmpeg: %w", err)
|
||||||
|
}
|
||||||
|
stt.recording = true
|
||||||
|
stt.AudioBuffer.Reset()
|
||||||
|
// Read PCM data in goroutine
|
||||||
|
go func() {
|
||||||
|
buf := make([]byte, 4096)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stt.stopCh:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
n, err := stdout.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
stt.mu.Lock()
|
||||||
|
stt.AudioBuffer.Write(buf[:n])
|
||||||
|
stt.mu.Unlock()
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
if err != io.EOF {
|
||||||
|
stt.logger.Error("recording read error", "error", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (stt *WhisperServer) StopRecording() (string, error) {
|
||||||
|
stt.mu.Lock()
|
||||||
|
defer stt.mu.Unlock()
|
||||||
|
if !stt.recording {
|
||||||
|
return "", errors.New("not recording")
|
||||||
|
}
|
||||||
|
stt.recording = false
|
||||||
|
// Stop ffmpeg
|
||||||
|
stt.cmdMu.Lock()
|
||||||
|
if stt.cmd != nil && stt.cmd.Process != nil {
|
||||||
|
stt.cmd.Process.Kill()
|
||||||
|
stt.cmd.Wait()
|
||||||
|
}
|
||||||
|
close(stt.stopCh)
|
||||||
|
stt.cmdMu.Unlock()
|
||||||
|
// Rest of StopRecording unchanged (WAV header + HTTP upload)
|
||||||
|
// ...
|
||||||
|
stt.recording = false
|
||||||
|
// wait loop to finish?
|
||||||
|
if stt.AudioBuffer == nil {
|
||||||
|
err := errors.New("unexpected nil AudioBuffer")
|
||||||
|
stt.logger.Error(err.Error())
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Create WAV header first
|
||||||
|
body := &bytes.Buffer{}
|
||||||
|
writer := multipart.NewWriter(body)
|
||||||
|
// Add audio file part
|
||||||
|
part, err := writer.CreateFormFile("file", "recording.wav")
|
||||||
|
if err != nil {
|
||||||
|
stt.logger.Error("fn: StopRecording", "error", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Stream directly to multipart writer: header + raw data
|
||||||
|
dataSize := stt.AudioBuffer.Len()
|
||||||
|
stt.writeWavHeader(part, dataSize)
|
||||||
|
if _, err := io.Copy(part, stt.AudioBuffer); err != nil {
|
||||||
|
stt.logger.Error("fn: StopRecording", "error", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Reset buffer for next recording
|
||||||
|
stt.AudioBuffer.Reset()
|
||||||
|
// Add response format field
|
||||||
|
err = writer.WriteField("response_format", "text")
|
||||||
|
if err != nil {
|
||||||
|
stt.logger.Error("fn: StopRecording", "error", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if writer.Close() != nil {
|
||||||
|
stt.logger.Error("fn: StopRecording", "error", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Send request
|
||||||
|
resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx
|
||||||
|
if err != nil {
|
||||||
|
stt.logger.Error("fn: StopRecording", "error", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
// Read and print response
|
||||||
|
responseTextBytes, err := io.ReadAll(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
stt.logger.Error("fn: StopRecording", "error", err)
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
resptext := strings.TrimRight(string(responseTextBytes), "\n")
|
||||||
|
// in case there are special tokens like [_BEG_]
|
||||||
|
resptext = specialRE.ReplaceAllString(resptext, "")
|
||||||
|
return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
|
||||||
|
}
|
||||||
1
go.mod
1
go.mod
@@ -9,7 +9,6 @@ require (
|
|||||||
github.com/PuerkitoBio/goquery v1.11.0
|
github.com/PuerkitoBio/goquery v1.11.0
|
||||||
github.com/gdamore/tcell/v2 v2.13.2
|
github.com/gdamore/tcell/v2 v2.13.2
|
||||||
github.com/glebarez/go-sqlite v1.22.0
|
github.com/glebarez/go-sqlite v1.22.0
|
||||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
|
|
||||||
github.com/jmoiron/sqlx v1.4.0
|
github.com/jmoiron/sqlx v1.4.0
|
||||||
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
|
||||||
github.com/neurosnap/sentences v1.1.2
|
github.com/neurosnap/sentences v1.1.2
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -37,8 +37,6 @@ github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17k
|
|||||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b h1:WEuQWBxelOGHA6z9lABqaMLMrfwVyMdN3UgRLT+YUPo=
|
|
||||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco=
|
|
||||||
github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
|
github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
|
||||||
github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
|
github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
|
||||||
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
|
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
|
||||||
|
|||||||
Reference in New Issue
Block a user