Enha: migrations with different emb tables
This commit is contained in:
@@ -43,10 +43,7 @@ func New(l *slog.Logger, s storage.FullRepo, cfg *config.Config) *RAG {
|
|||||||
storage: NewVectorStorage(l, s),
|
storage: NewVectorStorage(l, s),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the necessary tables
|
// Note: Vector tables are created via database migrations, not at runtime
|
||||||
if err := rag.storage.CreateTables(); err != nil {
|
|
||||||
l.Error("failed to create vector tables", "error", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return rag
|
return rag
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,45 +28,6 @@ func NewVectorStorage(logger *slog.Logger, store storage.FullRepo) *VectorStorag
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// CreateTables creates the necessary tables for vector storage
|
|
||||||
func (vs *VectorStorage) CreateTables() error {
|
|
||||||
// Create tables for common embedding dimensions
|
|
||||||
embeddingSizes := []int{384, 768, 1024, 1536, 2048, 3072, 4096, 5120}
|
|
||||||
// Pre-allocate queries slice: each embedding size needs 1 table + 3 indexes = 4 queries per size
|
|
||||||
queries := make([]string, 0, len(embeddingSizes)*4)
|
|
||||||
|
|
||||||
// Generate table creation queries for each embedding size
|
|
||||||
for _, size := range embeddingSizes {
|
|
||||||
tableName := fmt.Sprintf("embeddings_%d", size)
|
|
||||||
queries = append(queries,
|
|
||||||
fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s (
|
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
||||||
embeddings BLOB NOT NULL,
|
|
||||||
slug TEXT NOT NULL,
|
|
||||||
raw_text TEXT NOT NULL,
|
|
||||||
filename TEXT NOT NULL,
|
|
||||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
|
||||||
)`, tableName),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add indexes for all supported sizes
|
|
||||||
for _, size := range embeddingSizes {
|
|
||||||
tableName := fmt.Sprintf("embeddings_%d", size)
|
|
||||||
queries = append(queries,
|
|
||||||
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS idx_%s_filename ON %s(filename)`, tableName, tableName),
|
|
||||||
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS idx_%s_slug ON %s(slug)`, tableName, tableName),
|
|
||||||
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS idx_%s_created_at ON %s(created_at)`, tableName, tableName),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, query := range queries {
|
|
||||||
if _, err := vs.sqlxDB.Exec(query); err != nil {
|
|
||||||
return fmt.Errorf("failed to create table: %w", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// SerializeVector converts []float32 to binary blob
|
// SerializeVector converts []float32 to binary blob
|
||||||
func SerializeVector(vec []float32) []byte {
|
func SerializeVector(vec []float32) []byte {
|
||||||
|
|||||||
@@ -1,10 +1,34 @@
|
|||||||
-- Drop vector storage tables
|
-- Drop vector storage tables
|
||||||
DROP INDEX IF EXISTS idx_embeddings_384_filename;
|
DROP INDEX IF EXISTS idx_embeddings_384_filename;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_768_filename;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_1024_filename;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_1536_filename;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_2048_filename;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_3072_filename;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_4096_filename;
|
||||||
DROP INDEX IF EXISTS idx_embeddings_5120_filename;
|
DROP INDEX IF EXISTS idx_embeddings_5120_filename;
|
||||||
DROP INDEX IF EXISTS idx_embeddings_384_slug;
|
DROP INDEX IF EXISTS idx_embeddings_384_slug;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_768_slug;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_1024_slug;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_1536_slug;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_2048_slug;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_3072_slug;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_4096_slug;
|
||||||
DROP INDEX IF EXISTS idx_embeddings_5120_slug;
|
DROP INDEX IF EXISTS idx_embeddings_5120_slug;
|
||||||
DROP INDEX IF EXISTS idx_embeddings_384_created_at;
|
DROP INDEX IF EXISTS idx_embeddings_384_created_at;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_768_created_at;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_1024_created_at;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_1536_created_at;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_2048_created_at;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_3072_created_at;
|
||||||
|
DROP INDEX IF EXISTS idx_embeddings_4096_created_at;
|
||||||
DROP INDEX IF EXISTS idx_embeddings_5120_created_at;
|
DROP INDEX IF EXISTS idx_embeddings_5120_created_at;
|
||||||
|
|
||||||
DROP TABLE IF EXISTS embeddings_384;
|
DROP TABLE IF EXISTS embeddings_384;
|
||||||
|
DROP TABLE IF EXISTS embeddings_768;
|
||||||
|
DROP TABLE IF EXISTS embeddings_1024;
|
||||||
|
DROP TABLE IF EXISTS embeddings_1536;
|
||||||
|
DROP TABLE IF EXISTS embeddings_2048;
|
||||||
|
DROP TABLE IF EXISTS embeddings_3072;
|
||||||
|
DROP TABLE IF EXISTS embeddings_4096;
|
||||||
DROP TABLE IF EXISTS embeddings_5120;
|
DROP TABLE IF EXISTS embeddings_5120;
|
||||||
@@ -8,6 +8,60 @@ CREATE TABLE IF NOT EXISTS embeddings_384 (
|
|||||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
);
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS embeddings_768 (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
embeddings BLOB NOT NULL,
|
||||||
|
slug TEXT NOT NULL,
|
||||||
|
raw_text TEXT NOT NULL,
|
||||||
|
filename TEXT NOT NULL DEFAULT '',
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS embeddings_1024 (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
embeddings BLOB NOT NULL,
|
||||||
|
slug TEXT NOT NULL,
|
||||||
|
raw_text TEXT NOT NULL,
|
||||||
|
filename TEXT NOT NULL DEFAULT '',
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS embeddings_1536 (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
embeddings BLOB NOT NULL,
|
||||||
|
slug TEXT NOT NULL,
|
||||||
|
raw_text TEXT NOT NULL,
|
||||||
|
filename TEXT NOT NULL DEFAULT '',
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS embeddings_2048 (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
embeddings BLOB NOT NULL,
|
||||||
|
slug TEXT NOT NULL,
|
||||||
|
raw_text TEXT NOT NULL,
|
||||||
|
filename TEXT NOT NULL DEFAULT '',
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS embeddings_3072 (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
embeddings BLOB NOT NULL,
|
||||||
|
slug TEXT NOT NULL,
|
||||||
|
raw_text TEXT NOT NULL,
|
||||||
|
filename TEXT NOT NULL DEFAULT '',
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS embeddings_4096 (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
embeddings BLOB NOT NULL,
|
||||||
|
slug TEXT NOT NULL,
|
||||||
|
raw_text TEXT NOT NULL,
|
||||||
|
filename TEXT NOT NULL DEFAULT '',
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS embeddings_5120 (
|
CREATE TABLE IF NOT EXISTS embeddings_5120 (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
embeddings BLOB NOT NULL,
|
embeddings BLOB NOT NULL,
|
||||||
@@ -19,8 +73,26 @@ CREATE TABLE IF NOT EXISTS embeddings_5120 (
|
|||||||
|
|
||||||
-- Indexes for better performance
|
-- Indexes for better performance
|
||||||
CREATE INDEX IF NOT EXISTS idx_embeddings_384_filename ON embeddings_384(filename);
|
CREATE INDEX IF NOT EXISTS idx_embeddings_384_filename ON embeddings_384(filename);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_768_filename ON embeddings_768(filename);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_1024_filename ON embeddings_1024(filename);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_1536_filename ON embeddings_1536(filename);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_2048_filename ON embeddings_2048(filename);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_3072_filename ON embeddings_3072(filename);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_4096_filename ON embeddings_4096(filename);
|
||||||
CREATE INDEX IF NOT EXISTS idx_embeddings_5120_filename ON embeddings_5120(filename);
|
CREATE INDEX IF NOT EXISTS idx_embeddings_5120_filename ON embeddings_5120(filename);
|
||||||
CREATE INDEX IF NOT EXISTS idx_embeddings_384_slug ON embeddings_384(slug);
|
CREATE INDEX IF NOT EXISTS idx_embeddings_384_slug ON embeddings_384(slug);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_768_slug ON embeddings_768(slug);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_1024_slug ON embeddings_1024(slug);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_1536_slug ON embeddings_1536(slug);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_2048_slug ON embeddings_2048(slug);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_3072_slug ON embeddings_3072(slug);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_4096_slug ON embeddings_4096(slug);
|
||||||
CREATE INDEX IF NOT EXISTS idx_embeddings_5120_slug ON embeddings_5120(slug);
|
CREATE INDEX IF NOT EXISTS idx_embeddings_5120_slug ON embeddings_5120(slug);
|
||||||
CREATE INDEX IF NOT EXISTS idx_embeddings_384_created_at ON embeddings_384(created_at);
|
CREATE INDEX IF NOT EXISTS idx_embeddings_384_created_at ON embeddings_384(created_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_768_created_at ON embeddings_768(created_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_1024_created_at ON embeddings_1024(created_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_1536_created_at ON embeddings_1536(created_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_2048_created_at ON embeddings_2048(created_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_3072_created_at ON embeddings_3072(created_at);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_embeddings_4096_created_at ON embeddings_4096(created_at);
|
||||||
CREATE INDEX IF NOT EXISTS idx_embeddings_5120_created_at ON embeddings_5120(created_at);
|
CREATE INDEX IF NOT EXISTS idx_embeddings_5120_created_at ON embeddings_5120(created_at);
|
||||||
|
|||||||
@@ -45,17 +45,24 @@ func mathBitsToFloat32(b uint32) float32 {
|
|||||||
return *(*float32)(unsafe.Pointer(&b))
|
return *(*float32)(unsafe.Pointer(&b))
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
|
||||||
vecTableName5120 = "embeddings_5120"
|
|
||||||
vecTableName384 = "embeddings_384"
|
|
||||||
)
|
|
||||||
|
|
||||||
func fetchTableName(emb []float32) (string, error) {
|
func fetchTableName(emb []float32) (string, error) {
|
||||||
switch len(emb) {
|
switch len(emb) {
|
||||||
case 5120:
|
|
||||||
return vecTableName5120, nil
|
|
||||||
case 384:
|
case 384:
|
||||||
return vecTableName384, nil
|
return "embeddings_384", nil
|
||||||
|
case 768:
|
||||||
|
return "embeddings_768", nil
|
||||||
|
case 1024:
|
||||||
|
return "embeddings_1024", nil
|
||||||
|
case 1536:
|
||||||
|
return "embeddings_1536", nil
|
||||||
|
case 2048:
|
||||||
|
return "embeddings_2048", nil
|
||||||
|
case 3072:
|
||||||
|
return "embeddings_3072", nil
|
||||||
|
case 4096:
|
||||||
|
return "embeddings_4096", nil
|
||||||
|
case 5120:
|
||||||
|
return "embeddings_5120", nil
|
||||||
default:
|
default:
|
||||||
return "", fmt.Errorf("no table for the size of %d", len(emb))
|
return "", fmt.Errorf("no table for the size of %d", len(emb))
|
||||||
}
|
}
|
||||||
@@ -185,8 +192,12 @@ func sqrt(f float32) float32 {
|
|||||||
func (p ProviderSQL) ListFiles() ([]string, error) {
|
func (p ProviderSQL) ListFiles() ([]string, error) {
|
||||||
fileLists := make([][]string, 0)
|
fileLists := make([][]string, 0)
|
||||||
|
|
||||||
// Query both tables and combine results
|
// Query all supported tables and combine results
|
||||||
for _, table := range []string{vecTableName384, vecTableName5120} {
|
tableNames := []string{
|
||||||
|
"embeddings_384", "embeddings_768", "embeddings_1024", "embeddings_1536",
|
||||||
|
"embeddings_2048", "embeddings_3072", "embeddings_4096", "embeddings_5120",
|
||||||
|
}
|
||||||
|
for _, table := range tableNames {
|
||||||
query := "SELECT DISTINCT filename FROM " + table
|
query := "SELECT DISTINCT filename FROM " + table
|
||||||
rows, err := p.db.Query(query)
|
rows, err := p.db.Query(query)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -225,7 +236,11 @@ func (p ProviderSQL) ListFiles() ([]string, error) {
|
|||||||
func (p ProviderSQL) RemoveEmbByFileName(filename string) error {
|
func (p ProviderSQL) RemoveEmbByFileName(filename string) error {
|
||||||
var errors []string
|
var errors []string
|
||||||
|
|
||||||
for _, table := range []string{vecTableName384, vecTableName5120} {
|
tableNames := []string{
|
||||||
|
"embeddings_384", "embeddings_768", "embeddings_1024", "embeddings_1536",
|
||||||
|
"embeddings_2048", "embeddings_3072", "embeddings_4096", "embeddings_5120",
|
||||||
|
}
|
||||||
|
for _, table := range tableNames {
|
||||||
query := fmt.Sprintf("DELETE FROM %s WHERE filename = ?", table)
|
query := fmt.Sprintf("DELETE FROM %s WHERE filename = ?", table)
|
||||||
if _, err := p.db.Exec(query, filename); err != nil {
|
if _, err := p.db.Exec(query, filename); err != nil {
|
||||||
errors = append(errors, err.Error())
|
errors = append(errors, err.Error())
|
||||||
|
|||||||
Reference in New Issue
Block a user