Enha: new tables

This commit is contained in:
Grail Finder
2025-11-22 15:31:46 +03:00
parent 50d7bfced3
commit 4eb3d609a1
2 changed files with 54 additions and 38 deletions

View File

@@ -163,9 +163,9 @@ func (r *RAG) writeVectors(vectorCh chan []models.VectorRow) error {
for batch := range vectorCh { for batch := range vectorCh {
for _, vector := range batch { for _, vector := range batch {
if err := r.storage.WriteVector(&vector); err != nil { if err := r.storage.WriteVector(&vector); err != nil {
r.logger.Error("failed to write vector", "error", err, "slug", vector.Slug) r.logger.Error("failed to write vector to DB", "error", err, "slug", vector.Slug)
LongJobStatusCh <- ErrRAGStatus LongJobStatusCh <- ErrRAGStatus
continue // a duplicate is not critical return err // Stop the entire RAG operation on DB error
} }
} }
r.logger.Debug("wrote batch to db", "size", len(batch), "vector_chan_len", len(vectorCh)) r.logger.Debug("wrote batch to db", "size", len(batch), "vector_chan_len", len(vectorCh))

View File

@@ -30,33 +30,34 @@ func NewVectorStorage(logger *slog.Logger, store storage.FullRepo) *VectorStorag
// CreateTables creates the necessary tables for vector storage // CreateTables creates the necessary tables for vector storage
func (vs *VectorStorage) CreateTables() error { func (vs *VectorStorage) CreateTables() error {
// Create tables for different embedding dimensions // Create tables for common embedding dimensions
queries := []string{ embeddingSizes := []int{384, 768, 1024, 1536, 2048, 3072, 4096, 5120}
`CREATE TABLE IF NOT EXISTS embeddings_384 ( // Pre-allocate queries slice: each embedding size needs 1 table + 3 indexes = 4 queries per size
id INTEGER PRIMARY KEY AUTOINCREMENT, queries := make([]string, 0, len(embeddingSizes)*4)
embeddings BLOB NOT NULL,
slug TEXT NOT NULL,
raw_text TEXT NOT NULL,
filename TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)`,
`CREATE TABLE IF NOT EXISTS embeddings_5120 (
id INTEGER PRIMARY KEY AUTOINCREMENT,
embeddings BLOB NOT NULL,
slug TEXT NOT NULL,
raw_text TEXT NOT NULL,
filename TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)`,
// Indexes for better performance
`CREATE INDEX IF NOT EXISTS idx_embeddings_384_filename ON embeddings_384(filename)`,
`CREATE INDEX IF NOT EXISTS idx_embeddings_5120_filename ON embeddings_5120(filename)`,
`CREATE INDEX IF NOT EXISTS idx_embeddings_384_slug ON embeddings_384(slug)`,
`CREATE INDEX IF NOT EXISTS idx_embeddings_5120_slug ON embeddings_5120(slug)`,
// Additional indexes that may help with searches // Generate table creation queries for each embedding size
`CREATE INDEX IF NOT EXISTS idx_embeddings_384_created_at ON embeddings_384(created_at)`, for _, size := range embeddingSizes {
`CREATE INDEX IF NOT EXISTS idx_embeddings_5120_created_at ON embeddings_5120(created_at)`, tableName := fmt.Sprintf("embeddings_%d", size)
queries = append(queries,
fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s (
id INTEGER PRIMARY KEY AUTOINCREMENT,
embeddings BLOB NOT NULL,
slug TEXT NOT NULL,
raw_text TEXT NOT NULL,
filename TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)`, tableName),
)
}
// Add indexes for all supported sizes
for _, size := range embeddingSizes {
tableName := fmt.Sprintf("embeddings_%d", size)
queries = append(queries,
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS idx_%s_filename ON %s(filename)`, tableName, tableName),
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS idx_%s_slug ON %s(slug)`, tableName, tableName),
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS idx_%s_created_at ON %s(created_at)`, tableName, tableName),
)
} }
for _, query := range queries { for _, query := range queries {
@@ -120,14 +121,25 @@ func (vs *VectorStorage) WriteVector(row *models.VectorRow) error {
// getTableName determines which table to use based on embedding size // getTableName determines which table to use based on embedding size
func (vs *VectorStorage) getTableName(emb []float32) (string, error) { func (vs *VectorStorage) getTableName(emb []float32) (string, error) {
switch len(emb) { size := len(emb)
case 384:
return "embeddings_384", nil // Check if we support this embedding size
case 5120: supportedSizes := map[int]bool{
return "embeddings_5120", nil 384: true,
default: 768: true,
return "", fmt.Errorf("no table for embedding size of %d", len(emb)) 1024: true,
1536: true,
2048: true,
3072: true,
4096: true,
5120: true,
} }
if supportedSizes[size] {
return fmt.Sprintf("embeddings_%d", size), nil
}
return "", fmt.Errorf("no table for embedding size of %d", size)
} }
// SearchClosest finds vectors closest to the query vector using efficient cosine similarity calculation // SearchClosest finds vectors closest to the query vector using efficient cosine similarity calculation
@@ -211,8 +223,10 @@ func (vs *VectorStorage) SearchClosest(query []float32) ([]models.VectorRow, err
func (vs *VectorStorage) ListFiles() ([]string, error) { func (vs *VectorStorage) ListFiles() ([]string, error) {
fileLists := make([][]string, 0) fileLists := make([][]string, 0)
// Query both tables and combine results // Query all supported tables and combine results
for _, table := range []string{"embeddings_384", "embeddings_5120"} { embeddingSizes := []int{384, 768, 1024, 1536, 2048, 3072, 4096, 5120}
for _, size := range embeddingSizes {
table := fmt.Sprintf("embeddings_%d", size)
query := "SELECT DISTINCT filename FROM " + table query := "SELECT DISTINCT filename FROM " + table
rows, err := vs.sqlxDB.Query(query) rows, err := vs.sqlxDB.Query(query)
if err != nil { if err != nil {
@@ -252,7 +266,9 @@ func (vs *VectorStorage) ListFiles() ([]string, error) {
func (vs *VectorStorage) RemoveEmbByFileName(filename string) error { func (vs *VectorStorage) RemoveEmbByFileName(filename string) error {
var errors []string var errors []string
for _, table := range []string{"embeddings_384", "embeddings_5120"} { embeddingSizes := []int{384, 768, 1024, 1536, 2048, 3072, 4096, 5120}
for _, size := range embeddingSizes {
table := fmt.Sprintf("embeddings_%d", size)
query := fmt.Sprintf("DELETE FROM %s WHERE filename = ?", table) query := fmt.Sprintf("DELETE FROM %s WHERE filename = ?", table)
if _, err := vs.sqlxDB.Exec(query, filename); err != nil { if _, err := vs.sqlxDB.Exec(query, filename); err != nil {
errors = append(errors, err.Error()) errors = append(errors, err.Error())