Feat: stt sketch [WIP]

This commit is contained in:
Grail Finder
2025-05-17 21:23:51 +03:00
parent f7d1fbf73c
commit 2d56806cfa
4 changed files with 207 additions and 11 deletions

188
extra/stt.go Normal file
View File

@@ -0,0 +1,188 @@
package extra
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"os/signal"
"github.com/MarkKremer/microphone/v2"
"github.com/gopxl/beep/v2"
"github.com/gopxl/beep/v2/wav"
)
// STT is the speech-to-text recorder contract used by this package.
type STT interface {
	// StartRecording begins an audio capture; a non-nil error means the
	// capture could not be started.
	StartRecording() error

	// StopRecording ends the current capture and returns a string result
	// (WhisperSTT returns the transcribed text) or an error.
	StopRecording() (string, error)

	// IsRecording reports whether a capture is currently in progress.
	IsRecording() bool
}
// WhisperSTT is an STT implementation that records from the microphone and
// posts the WAV-encoded audio to an HTTP transcription server.
type WhisperSTT struct {
	// logger receives errors raised while encoding the captured audio.
	logger *slog.Logger
	// ServerURL is the endpoint StopRecording POSTs the audio to.
	ServerURL string
	// SampleRate is used both to open the microphone and for the WAV format.
	SampleRate beep.SampleRate
	// Buffer holds the WAV-encoded recording; it is sent as the request body.
	Buffer *bytes.Buffer
	// streamer is the input stream set by StartRecording and closed by
	// StopRecording.
	streamer beep.StreamCloser
	// recording is true between StartRecording and StopRecording.
	recording bool
}
// writeseeker is a minimal in-memory io.WriteSeeker. capture uses it as the
// destination for wav.Encode, which needs a seekable writer.
type writeseeker struct {
	buf []byte // everything written so far
	pos int    // offset at which the next Write lands
}

// Compile-time check that *writeseeker satisfies io.WriteSeeker.
var _ io.WriteSeeker = (*writeseeker)(nil)

// Write copies p into the buffer at the current position, overwriting any
// existing bytes and extending the buffer when the write runs past its end.
// A gap left by seeking beyond the end is zero-filled. It always returns
// len(p) and a nil error.
func (ws *writeseeker) Write(p []byte) (int, error) {
	end := ws.pos + len(p)
	if end > len(ws.buf) {
		// append gives amortized growth instead of reallocating by a
		// fixed amount on every write.
		ws.buf = append(ws.buf, make([]byte, end-len(ws.buf))...)
	}
	copy(ws.buf[ws.pos:], p)
	ws.pos = end
	return len(p), nil
}

// Seek sets the position for the next Write, interpreting offset according
// to whence (io.SeekStart, io.SeekCurrent or io.SeekEnd), and returns the new
// absolute position. An unknown whence or a position that is negative or does
// not fit in an int yields an error and leaves the current position untouched.
func (ws *writeseeker) Seek(offset int64, whence int) (int64, error) {
	var base int64
	switch whence {
	case io.SeekStart:
		base = 0
	case io.SeekCurrent:
		base = int64(ws.pos)
	case io.SeekEnd:
		base = int64(len(ws.buf))
	default:
		return 0, errors.New("invalid whence")
	}
	newPos := base + offset
	if newPos < 0 {
		return 0, errors.New("negative result pos")
	}
	// Guard against silent truncation where int is narrower than int64.
	if int64(int(newPos)) != newPos {
		return 0, errors.New("result pos out of range")
	}
	ws.pos = int(newPos)
	return newPos, nil
}

// Reader returns an io.Reader over the bytes written so far. Use it, for
// example, with io.Copy to copy the content of the writeseeker to an
// io.Writer. The reader starts at offset 0 regardless of the write position.
func (ws *writeseeker) Reader() io.Reader {
	return bytes.NewReader(ws.buf)
}
// NewWhisperSTT returns a WhisperSTT that logs to logger, posts recordings to
// serverURL and captures audio at sampleRate. Its Buffer starts out empty.
func NewWhisperSTT(logger *slog.Logger, serverURL string, sampleRate beep.SampleRate) *WhisperSTT {
	stt := new(WhisperSTT)
	stt.logger = logger
	stt.ServerURL = serverURL
	stt.SampleRate = sampleRate
	stt.Buffer = &bytes.Buffer{}
	return stt
}
func (stt *WhisperSTT) StartRecording() error {
stream, err := microphoneStream(stt.SampleRate)
if err != nil {
return fmt.Errorf("failed to init microphone: %w", err)
}
stt.streamer = stream
stt.recording = true
go stt.capture()
return nil
}
func (stt *WhisperSTT) capture() {
sink := beep.NewBuffer(beep.Format{
SampleRate: stt.SampleRate,
NumChannels: 1,
Precision: 2,
})
// Append the streamer to the buffer and encode as WAV
sink.Append(stt.streamer)
// Encode the captured audio to WAV format using beep's WAV encoder
// var wavBuf bytes.Buffer
var wavBuf writeseeker
if err := wav.Encode(&wavBuf, sink.Streamer(0, sink.Len()), beep.Format{
SampleRate: stt.SampleRate,
NumChannels: 1,
Precision: 2,
}); err != nil {
stt.logger.Error("failed to encode WAV", "error", err)
}
r := wavBuf.Reader()
// stt.Buffer = &wavBuf
if _, err := io.Copy(stt.Buffer, r); err != nil {
stt.logger.Error("failed to encode WAV", "error", err)
}
}
func (stt *WhisperSTT) StopRecording() (string, error) {
if !stt.recording {
return "", nil
}
stt.streamer.Close()
stt.recording = false
// Send to Whisper.cpp server
req, err := http.NewRequest("POST", stt.ServerURL, stt.Buffer)
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Content-Type", "audio/wav")
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("transcription request failed: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
var result struct {
Text string `json:"text"`
}
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return "", fmt.Errorf("failed to decode response: %w", err)
}
return result.Text, nil
}
// IsRecording reports whether a recording started by StartRecording has not
// yet been ended by StopRecording.
func (stt *WhisperSTT) IsRecording() bool {
	return stt.recording
}
// microphoneStream initializes the microphone package, opens the default
// input device as a mono stream at sample rate sr, starts it and returns it.
// It also installs an interrupt handler that stops and closes the stream,
// terminates the microphone package and exits the process with status 1.
//
// NOTE(review): every call registers another signal handler goroutine that is
// never removed, even after the returned stream has been closed.
func microphoneStream(sr beep.SampleRate) (beep.StreamCloser, error) {
	if err := microphone.Init(); err != nil {
		return nil, fmt.Errorf("microphone init failed: %w", err)
	}
	stream, _, err := microphone.OpenDefaultStream(sr, 1) // 1 channel mono
	if err != nil {
		// Undo Init so a failed open does not leave the package initialized.
		microphone.Terminate()
		return nil, fmt.Errorf("failed to open microphone: %w", err)
	}

	// Handle OS signals to clean up. Only os.Interrupt is registered:
	// os.Kill (SIGKILL) can never be delivered to a handler.
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, os.Interrupt)
	go func() {
		<-sig
		stream.Stop()
		stream.Close()
		microphone.Terminate()
		os.Exit(1)
	}()

	stream.Start()
	return stream, nil
}