Feat: new rag attempt

This commit is contained in:
Grail Finder
2025-10-09 16:19:43 +03:00
parent 5d2ce7a5f5
commit 2e1b018a45
8 changed files with 893 additions and 67 deletions

179
storage/vector.go.bak Normal file
View File

@@ -0,0 +1,179 @@
package storage
import (
	"encoding/binary"
	"fmt"
	"math"
	"sort"
	"unsafe"

	"gf-lt/models"
)
// VectorRepo is the storage contract for embedding rows: writing a row,
// looking up the rows closest to a query vector, and listing or removing
// embeddings by the file they came from.
type VectorRepo interface {
	// WriteVector persists a single embedding row.
	WriteVector(row *models.VectorRow) error
	// SearchClosest returns the stored rows closest to the query vector q.
	SearchClosest(q []float32) ([]models.VectorRow, error)
	// ListFiles returns the file names that have stored embeddings.
	ListFiles() ([]string, error)
	// RemoveEmbByFileName deletes the embeddings stored for filename.
	RemoveEmbByFileName(filename string) error
}
// SerializeVector encodes vec as a binary blob. Every element is written, in
// order, as its IEEE 754 bit pattern in 4 little-endian bytes, so the result
// is len(vec)*4 bytes long. A nil or empty vec yields an empty, non-nil slice.
func SerializeVector(vec []float32) []byte {
	buf := make([]byte, len(vec)*4) // 4 bytes per float32
	for i, v := range vec {
		// math.Float32bits does not depend on the host byte order, so the
		// blob layout is little-endian on every platform.
		binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v))
	}
	return buf
}
// DeserializeVector decodes a binary blob back into a []float32. Each group
// of 4 bytes is read as a little-endian IEEE 754 bit pattern. Trailing bytes
// that do not fill a whole group are ignored. The result is a freshly
// allocated slice that does not alias data; nil or short input yields an
// empty, non-nil slice.
func DeserializeVector(data []byte) []float32 {
	vec := make([]float32, len(data)/4)
	for i := range vec {
		vec[i] = math.Float32frombits(binary.LittleEndian.Uint32(data[i*4:]))
	}
	return vec
}
// mathFloat32bits returns the IEEE 754 bit pattern of f as a uint32.
// It delegates to math.Float32bits, whose result does not depend on the host
// byte order (reinterpreting f's memory as little-endian bytes would yield a
// byte-swapped value on big-endian hosts).
func mathFloat32bits(f float32) uint32 {
	return math.Float32bits(f)
}
// mathBitsToFloat32 returns the float32 whose IEEE 754 bit pattern is b.
// It uses the standard library conversion instead of an unsafe pointer cast.
func mathBitsToFloat32(b uint32) float32 {
	return math.Float32frombits(b)
}
// Names of the embedding tables, one per supported vector length.
var (
	vecTableName5120 = "embeddings_5120"
	vecTableName384  = "embeddings_384"
)

// fetchTableName picks the embeddings table that matches the length of emb.
// Only lengths 5120 and 384 have a table; any other length (including an
// empty or nil slice) produces an error that names the offending size.
func fetchTableName(emb []float32) (string, error) {
	dim := len(emb)
	if dim == 5120 {
		return vecTableName5120, nil
	}
	if dim == 384 {
		return vecTableName384, nil
	}
	return "", fmt.Errorf("no table for the size of %d", dim)
}
// WriteVector inserts row into the embeddings table that matches the length
// of row.Embeddings. The embedding is stored as the blob produced by
// SerializeVector; slug, raw text and file name are stored as text.
//
// It returns an error when row is nil, when no table exists for the embedding
// length, or when preparing, binding or executing the statement fails.
// Prepare and bind failures are also reported through p.logger.
func (p ProviderSQL) WriteVector(row *models.VectorRow) error {
	// Guard against a nil row: dereferencing it below would panic.
	if row == nil {
		return fmt.Errorf("write vector: nil row")
	}
	tableName, err := fetchTableName(row.Embeddings)
	if err != nil {
		return err
	}
	stmt, _, err := p.s3Conn.Prepare(
		fmt.Sprintf("INSERT INTO %s(embedding, slug, raw_text, filename) VALUES (?, ?, ?, ?)", tableName))
	if err != nil {
		p.logger.Error("failed to prep a stmt", "error", err)
		return err
	}
	defer stmt.Close()
	if err := stmt.BindBlob(1, SerializeVector(row.Embeddings)); err != nil {
		p.logger.Error("failed to bind", "error", err)
		return err
	}
	// The text columns fill placeholders 2..4, in the order of the INSERT
	// column list: slug, raw_text, filename.
	for i, text := range []string{row.Slug, row.RawText, row.FileName} {
		if err := stmt.BindText(i+2, text); err != nil {
			p.logger.Error("failed to bind", "error", err)
			return err
		}
	}
	return stmt.Exec()
}
// decodeUnsafe reinterprets bs as a []float32 of len(bs)/4 elements without
// copying. Input shorter than 4 bytes (including nil) yields nil instead of
// panicking on &bs[0].
//
// The returned slice shares memory with bs, so it is only valid while bs is,
// and writes through either are visible in the other. The bytes are read in
// the host's native byte order.
// NOTE(review): bs is assumed to be suitably aligned for float32 — confirm
// for buffers that do not come from the Go allocator.
func decodeUnsafe(bs []byte) []float32 {
	n := len(bs) / 4
	if n == 0 {
		return nil
	}
	return unsafe.Slice((*float32)(unsafe.Pointer(&bs[0])), n)
}
// SearchClosest returns up to three rows from the embeddings table matching
// len(q), ordered by ascending distance to q.
//
// The query vector is bound as the same binary blob layout that WriteVector
// stores (see SerializeVector). It returns an error when no table exists for
// len(q), or when preparing, binding or stepping the statement fails. On
// success the result is a non-nil slice, empty if nothing matched.
func (p ProviderSQL) SearchClosest(q []float32) ([]models.VectorRow, error) {
	tableName, err := fetchTableName(q)
	if err != nil {
		return nil, err
	}
	stmt, _, err := p.s3Conn.Prepare(
		fmt.Sprintf(`SELECT
	distance,
	embedding,
	slug,
	raw_text,
	filename
FROM %s
WHERE embedding MATCH ?
ORDER BY distance
LIMIT 3
`, tableName))
	if err != nil {
		return nil, err
	}
	// Release the statement on every return path, including bind/step errors.
	defer stmt.Close()
	if err := stmt.BindBlob(1, SerializeVector(q)); err != nil {
		p.logger.Error("failed to bind", "error", err)
		return nil, err
	}
	resp := []models.VectorRow{}
	for stmt.Step() {
		resp = append(resp, models.VectorRow{
			Distance: float32(stmt.ColumnFloat(0)),
			// DeserializeVector copies the bytes, so the row does not keep
			// pointing into the raw column buffer after the next Step.
			// NOTE(review): the raw buffer is presumably owned by the driver
			// and invalidated by later statement calls — confirm.
			Embeddings: DeserializeVector(stmt.ColumnRawText(1)),
			Slug:       stmt.ColumnText(2),
			RawText:    stmt.ColumnText(3),
			FileName:   stmt.ColumnText(4),
		})
	}
	if err := stmt.Err(); err != nil {
		return nil, err
	}
	return resp, nil
}
// ListFiles returns the distinct file names stored in the 384-dimension
// embeddings table. The result is a non-nil slice, empty when the table has
// no rows. Errors from preparing or stepping the statement are returned as is.
func (p ProviderSQL) ListFiles() ([]string, error) {
	stmt, _, prepErr := p.s3Conn.Prepare(
		fmt.Sprintf("SELECT filename FROM %s GROUP BY filename", vecTableName384))
	if prepErr != nil {
		return nil, prepErr
	}
	defer stmt.Close()

	names := []string{}
	for stmt.Step() {
		name := stmt.ColumnText(0)
		names = append(names, name)
	}
	// Step reports failure only through Err, so check it once the loop ends.
	stepErr := stmt.Err()
	if stepErr != nil {
		return nil, stepErr
	}
	return names, nil
}
// RemoveEmbByFileName deletes every row of the 384-dimension embeddings table
// whose filename column equals filename. The file name is passed as a bound
// parameter, never spliced into the SQL text. Errors from preparing, binding
// or executing the statement are returned as is.
func (p ProviderSQL) RemoveEmbByFileName(filename string) error {
	stmt, _, prepErr := p.s3Conn.Prepare(
		fmt.Sprintf("DELETE FROM %s WHERE filename = ?", vecTableName384))
	if prepErr != nil {
		return prepErr
	}
	defer stmt.Close()

	bindErr := stmt.BindText(1, filename)
	if bindErr != nil {
		return bindErr
	}
	return stmt.Exec()
}