Enhance: rag tuning and tests

This commit is contained in:
Grail Finder
2026-03-08 16:12:32 +03:00
parent e74ff8c03f
commit a1b5f9cdc5
5 changed files with 814 additions and 25 deletions

View File

@@ -74,6 +74,22 @@ func detectPhrases(query string) []string {
return phrases
}
// countPhraseMatches returns the number of query phrases found in text.
// Matching is case-insensitive: the text is lowered once and each detected
// phrase (already lowercase from detectPhrases) is checked with a substring
// search. A phrase contributes at most one to the count.
func countPhraseMatches(text, query string) int {
	detected := detectPhrases(query)
	if len(detected) == 0 {
		return 0
	}
	lowered := strings.ToLower(text)
	matches := 0
	for _, p := range detected {
		if strings.Contains(lowered, p) {
			matches++
		}
	}
	return matches
}
// parseSlugIndices extracts batch and chunk indices from a slug
// slug format: filename_batch_chunk (e.g., "kjv_bible.epub_1786_0")
func parseSlugIndices(slug string) (batch, chunk int, ok bool) {
@@ -120,6 +136,9 @@ func areSlugsAdjacent(slug1, slug2 string) bool {
// Check if they're in sequential batches and chunk indices suggest continuity
// This is heuristic but useful for cross-batch adjacency
if (batch1 == batch2+1 && chunk1 == 0) || (batch2 == batch1+1 && chunk2 == 0) {
return true
}
return false
}
@@ -654,6 +673,10 @@ func (r *RAG) RefineQuery(query string) string {
if len(query) <= 3 {
return original
}
// If query already contains double quotes, assume it's a phrase query and skip refinement
if strings.Contains(query, "\"") {
return original
}
query = strings.ToLower(query)
words := strings.Fields(query)
if len(words) >= 3 {
@@ -799,12 +822,13 @@ func (r *RAG) GenerateQueryVariations(query string) []string {
quotedQuery = re.ReplaceAllString(quotedQuery, quotedPhrase)
}
}
if quotedQuery != query {
variations = append(variations, quotedQuery)
}
// Disabled malformed quoted query for now
// if quotedQuery != query {
// variations = append(variations, quotedQuery)
// }
// Also add individual phrase variations for short queries
if len(phrases) <= 3 {
if len(phrases) <= 5 {
for _, phrase := range phrases {
// Create a focused query with just this phrase quoted
// Keep original context but emphasize this phrase
@@ -814,6 +838,8 @@ func (r *RAG) GenerateQueryVariations(query string) []string {
if focusedQuery != query && focusedQuery != quotedQuery {
variations = append(variations, focusedQuery)
}
// Add the phrase alone (quoted) as a separate variation
variations = append(variations, quotedPhrase)
}
}
}
@@ -822,9 +848,11 @@ func (r *RAG) GenerateQueryVariations(query string) []string {
}
func (r *RAG) RerankResults(results []models.VectorRow, query string) []models.VectorRow {
phraseCount := len(detectPhrases(query))
type scoredResult struct {
row models.VectorRow
distance float32
phraseMatches int
}
scored := make([]scoredResult, 0, len(results))
for i := range results {
@@ -850,6 +878,14 @@ func (r *RAG) RerankResults(results []models.VectorRow, query string) []models.V
score += 3
}
// Phrase match bonus: extra points for containing detected phrases
phraseMatches := countPhraseMatches(row.RawText, query)
if phraseMatches > 0 {
// Significant bonus per phrase to prioritize exact phrase matches
r.logger.Debug("phrase match bonus", "slug", row.Slug, "phraseMatches", phraseMatches, "score", score)
score += float32(phraseMatches) * 100
}
// Cross-chunk adjacency bonus: if this chunk has adjacent siblings in results,
// boost score to promote narrative continuity
adjacentCount := 0
@@ -866,17 +902,27 @@ func (r *RAG) RerankResults(results []models.VectorRow, query string) []models.V
score += float32(adjacentCount) * 4
}
distance := row.Distance - score/100
scored = append(scored, scoredResult{row: row, distance: distance})
scored = append(scored, scoredResult{row: row, distance: distance, phraseMatches: phraseMatches})
}
sort.Slice(scored, func(i, j int) bool {
return scored[i].distance < scored[j].distance
})
unique := make([]models.VectorRow, 0)
seen := make(map[string]bool)
maxPerFile := 2
if phraseCount > 0 {
maxPerFile = 10
}
fileCounts := make(map[string]int)
for i := range scored {
if !seen[scored[i].row.Slug] {
if fileCounts[scored[i].row.FileName] >= 2 {
// Allow phrase-matching chunks to bypass per-file limit (up to +5 extra)
allowed := fileCounts[scored[i].row.FileName] < maxPerFile
if !allowed && scored[i].phraseMatches > 0 {
// If chunk has phrase matches, allow extra slots (up to maxPerFile + 5)
allowed = fileCounts[scored[i].row.FileName] < maxPerFile+5
}
if !allowed {
continue
}
seen[scored[i].row.Slug] = true
@@ -884,8 +930,8 @@ func (r *RAG) RerankResults(results []models.VectorRow, query string) []models.V
unique = append(unique, scored[i].row)
}
}
if len(unique) > 10 {
unique = unique[:10]
if len(unique) > 30 {
unique = unique[:30]
}
return unique
}
@@ -954,6 +1000,7 @@ func (r *RAG) Search(query string, limit int) ([]models.VectorRow, error) {
r.resetIdleTimer()
refined := r.RefineQuery(query)
variations := r.GenerateQueryVariations(refined)
r.logger.Debug("query variations", "original", query, "refined", refined, "variations", variations)
// Collect embedding search results from all variations
var embResults []models.VectorRow
@@ -985,17 +1032,35 @@ func (r *RAG) Search(query string, limit int) ([]models.VectorRow, error) {
return embResults[i].Distance < embResults[j].Distance
})
// Perform keyword search
kwResults, err := r.searchKeyword(refined, limit*2)
// Perform keyword search on all variations
var kwResults []models.VectorRow
seenKw := make(map[string]bool)
for _, q := range variations {
results, err := r.searchKeyword(q, limit)
if err != nil {
r.logger.Warn("keyword search failed, using only embeddings", "error", err)
kwResults = nil
r.logger.Debug("keyword search failed for variation", "error", err, "query", q)
continue
}
// Sort keyword results by distance (already sorted by BM25 score)
// kwResults already sorted by distance (lower is better)
for _, row := range results {
if !seenKw[row.Slug] {
seenKw[row.Slug] = true
kwResults = append(kwResults, row)
}
}
}
// Sort keyword results by distance (lower is better)
sort.Slice(kwResults, func(i, j int) bool {
return kwResults[i].Distance < kwResults[j].Distance
})
// Combine using Reciprocal Rank Fusion (RRF)
const rrfK = 60
// Use smaller K for phrase-heavy queries to give more weight to top ranks
phraseCount := len(detectPhrases(query))
rrfK := 60.0
if phraseCount > 0 {
rrfK = 30.0
}
r.logger.Debug("RRF parameters", "phraseCount", phraseCount, "rrfK", rrfK, "query", query)
type scoredRow struct {
row models.VectorRow
score float64
@@ -1005,11 +1070,22 @@ func (r *RAG) Search(query string, limit int) ([]models.VectorRow, error) {
for rank, row := range embResults {
score := 1.0 / (float64(rank) + rrfK)
scoreMap[row.Slug] += score
if row.Slug == "kjv_bible.epub_1786_0" {
r.logger.Debug("target chunk embedding rank", "rank", rank, "score", score)
}
// Add keyword results
}
// Add keyword results with weight boost when phrases are present
kwWeight := 1.0
if phraseCount > 0 {
kwWeight = 100.0
}
r.logger.Debug("keyword weight", "kwWeight", kwWeight, "phraseCount", phraseCount)
for rank, row := range kwResults {
score := 1.0 / (float64(rank) + rrfK)
score := kwWeight * (1.0 / (float64(rank) + rrfK))
scoreMap[row.Slug] += score
if row.Slug == "kjv_bible.epub_1786_0" {
r.logger.Debug("target chunk keyword rank", "rank", rank, "score", score, "kwWeight", kwWeight, "rrfK", rrfK)
}
// Ensure row exists in combined results
if _, exists := seen[row.Slug]; !exists {
embResults = append(embResults, row)
@@ -1021,6 +1097,18 @@ func (r *RAG) Search(query string, limit int) ([]models.VectorRow, error) {
score := scoreMap[row.Slug]
scoredRows = append(scoredRows, scoredRow{row: row, score: score})
}
// Debug: log scores for target chunk and top chunks
if strings.Contains(strings.ToLower(query), "bald") || strings.Contains(strings.ToLower(query), "she bears") {
for _, sr := range scoredRows {
if sr.row.Slug == "kjv_bible.epub_1786_0" {
r.logger.Debug("target chunk score", "slug", sr.row.Slug, "score", sr.score, "distance", sr.row.Distance)
}
}
// Log top 5 scores
for i := 0; i < len(scoredRows) && i < 5; i++ {
r.logger.Debug("top scored row", "rank", i+1, "slug", scoredRows[i].row.Slug, "score", scoredRows[i].score, "distance", scoredRows[i].row.Distance)
}
}
// Sort by descending RRF score
sort.Slice(scoredRows, func(i, j int) bool {
return scoredRows[i].score > scoredRows[j].score
@@ -1099,3 +1187,11 @@ func (r *RAG) Destroy() {
}
}
}
// SetEmbedderForTesting replaces the internal embedder with a mock,
// holding r.mu so the swap does not race concurrent searches.
// NOTE(review): the original comment claimed this is only compiled under a
// "test" build tag, but no //go:build constraint is visible here and the
// integration tests call it unconditionally — confirm before relying on that.
func (r *RAG) SetEmbedderForTesting(e Embedder) {
r.mu.Lock()
defer r.mu.Unlock()
r.embedder = e
}

409
rag/rag_integration_test.go Normal file
View File

@@ -0,0 +1,409 @@
package rag
import (
	"fmt"
	"io"
	"log/slog"
	"testing"

	"gf-lt/config"
	"gf-lt/models"
	"gf-lt/storage"

	_ "github.com/glebarez/go-sqlite"
	"github.com/jmoiron/sqlx"
)
// mockEmbedder returns zero vectors of a fixed dimension.
// It makes embedding deterministic so tests exercise only the
// keyword/fusion side of the search pipeline.
type mockEmbedder struct {
	dim int
}

// Embed returns a zero vector of length dim for any input text.
func (m *mockEmbedder) Embed(text string) ([]float32, error) {
	return make([]float32, m.dim), nil
}

// EmbedSlice returns one zero vector of length dim per input text.
func (m *mockEmbedder) EmbedSlice(texts []string) ([][]float32, error) {
	out := make([][]float32, 0, len(texts))
	for range texts {
		out = append(out, make([]float32, m.dim))
	}
	return out, nil
}
// dummyStore implements storage.FullRepo with a minimal set of methods.
// Only DB() is used by VectorStorage; other methods return empty values.
// The stubs never error, so any failure in a test points at the vector path.
type dummyStore struct {
db *sqlx.DB
}
// DB returns the wrapped (in-memory) database handle — the one method
// the vector storage actually exercises in these tests.
func (d dummyStore) DB() *sqlx.DB { return d.db }
// ChatHistory methods — unused no-op stubs required by the interface.
func (d dummyStore) ListChats() ([]models.Chat, error) { return nil, nil }
func (d dummyStore) GetChatByID(id uint32) (*models.Chat, error) { return nil, nil }
func (d dummyStore) GetChatByChar(char string) ([]models.Chat, error) { return nil, nil }
func (d dummyStore) GetLastChat() (*models.Chat, error) { return nil, nil }
func (d dummyStore) GetLastChatByAgent(agent string) (*models.Chat, error) { return nil, nil }
func (d dummyStore) UpsertChat(chat *models.Chat) (*models.Chat, error) { return chat, nil }
func (d dummyStore) RemoveChat(id uint32) error { return nil }
func (d dummyStore) ChatGetMaxID() (uint32, error) { return 0, nil }
// Memories methods — unused no-op stubs.
func (d dummyStore) Memorise(m *models.Memory) (*models.Memory, error) { return m, nil }
func (d dummyStore) Recall(agent, topic string) (string, error) { return "", nil }
func (d dummyStore) RecallTopics(agent string) ([]string, error) { return nil, nil }
// VectorRepo methods (not used but required by interface).
func (d dummyStore) WriteVector(row *models.VectorRow) error { return nil }
func (d dummyStore) SearchClosest(q []float32, limit int) ([]models.VectorRow, error) {
return nil, nil
}
func (d dummyStore) ListFiles() ([]string, error) { return nil, nil }
func (d dummyStore) RemoveEmbByFileName(filename string) error { return nil }
// Compile-time check that dummyStore satisfies the full repository interface.
var _ storage.FullRepo = dummyStore{}
// setupTestRAG creates an in-memory SQLite database, creates the necessary
// tables (embeddings_768 and the fts_embeddings FTS5 virtual table), inserts
// the provided chunks, and returns a RAG instance whose embedder has been
// replaced with a deterministic zero-vector mock.
//
// Chunks are given zero-vector embeddings of the configured dimension when
// they do not already have one of the right size. The error return keeps
// callers in control of t.Fatalf formatting.
func setupTestRAG(t *testing.T, chunks []*models.VectorRow) (*RAG, error) {
	t.Helper()
	db, err := sqlx.Open("sqlite", ":memory:")
	if err != nil {
		return nil, fmt.Errorf("open inmemory db: %w", err)
	}
	// Create the required tables (embeddings_768 and fts_embeddings).
	// Use the same schema as production.
	_, err = db.Exec(`
CREATE TABLE embeddings_768 (
id INTEGER PRIMARY KEY AUTOINCREMENT,
embeddings BLOB NOT NULL,
slug TEXT NOT NULL,
raw_text TEXT NOT NULL,
filename TEXT NOT NULL DEFAULT ''
);
`)
	if err != nil {
		db.Close() // avoid leaking the connection on setup failure
		return nil, fmt.Errorf("create embeddings table: %w", err)
	}
	_, err = db.Exec(`
CREATE VIRTUAL TABLE fts_embeddings USING fts5(
slug UNINDEXED,
raw_text,
filename UNINDEXED,
embedding_size UNINDEXED,
tokenize='porter unicode61'
);
`)
	if err != nil {
		db.Close()
		return nil, fmt.Errorf("create FTS table: %w", err)
	}
	// Create a logger that discards output. Passing a nil io.Writer to
	// slog.NewTextHandler panics on the first record written at or above
	// the configured level; io.Discard is the safe "silent" writer.
	logger := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
	store := dummyStore{db: db}
	// Create config with embedding dimension 768 (matches embeddings_768).
	cfg := &config.Config{
		EmbedDims:       768,
		RAGWordLimit:    250,
		RAGOverlapWords: 25,
		RAGBatchSize:    1,
	}
	// Create a RAG instance using New; the embedder it builds from config is
	// then swapped for the mock via the dedicated test hook (no reflection).
	rag, err := New(logger, store, cfg)
	if err != nil {
		db.Close()
		return nil, fmt.Errorf("create RAG: %w", err)
	}
	// Replace the embedder with our mock.
	rag.SetEmbedderForTesting(&mockEmbedder{dim: cfg.EmbedDims})
	// Insert the provided chunks using the storage directly.
	if len(chunks) > 0 {
		// Ensure each chunk has embeddings of correct dimension (zero vector).
		for _, chunk := range chunks {
			if len(chunk.Embeddings) != cfg.EmbedDims {
				chunk.Embeddings = make([]float32, cfg.EmbedDims)
			}
		}
		if err = rag.storage.WriteVectors(chunks); err != nil {
			return nil, fmt.Errorf("write test chunks: %w", err)
		}
	}
	return rag, nil
}
// createTestChunks returns a slice of VectorRow representing the target chunk
// (kjv_bible.epub_1786_0), several bald-related noise chunks, and unrelated chunks.
// All Embeddings fields are left nil; setupTestRAG fills them with zero vectors.
func createTestChunks() []*models.VectorRow {
// Target chunk: 2 Kings 2:23-24 containing "bald head" and "two she bears".
targetRaw := `And he said, Ye shall not send.
2:17 And when they urged him till he was ashamed, he said, Send. They sent
therefore fifty men; and they sought three days, but found him not.
2:18 And when they came again to him, (for he tarried at Jericho,) he said unto
them, Did I not say unto you, Go not? 2:19 And the men of the city said unto
Elisha, Behold, I pray thee, the situation of this city is pleasant, as my lord
seeth: but the water is naught, and the ground barren.
2:20 And he said, Bring me a new cruse, and put salt therein. And they brought
it to him.
2:21 And he went forth unto the spring of the waters, and cast the salt in
there, and said, Thus saith the LORD, I have healed these waters; there shall
not be from thence any more death or barren land.
2:22 So the waters were healed unto this day, according to the saying of Elisha
which he spake.
2:23 And he went up from thence unto Bethel: and as he was going up by the way,
there came forth little children out of the city, and mocked him, and said unto
him, Go up, thou bald head; go up, thou bald head.
2:24 And he turned back, and looked on them, and cursed them in the name of the
LORD. And there came forth two she bears out of the wood, and tare forty and
two children of them.`
// Noise chunk 1: Leviticus containing "bald locust"
noise1Raw := `11:12 Whatsoever hath no fins nor scales in the waters, that shall be an
abomination unto you.
11:13 And these are they which ye shall have in abomination among the fowls;
they shall not be eaten, they are an abomination: the eagle, and the ossifrage,
and the ospray, 11:14 And the vulture, and the kite after his kind; 11:15 Every
raven after his kind; 11:16 And the owl, and the night hawk, and the cuckow,
and the hawk after his kind, 11:17 And the little owl, and the cormorant, and
the great owl, 11:18 And the swan, and the pelican, and the gier eagle, 11:19
And the stork, the heron after her kind, and the lapwing, and the bat.
11:20 All fowls that creep, going upon all four, shall be an abomination unto
you.
11:21 Yet these may ye eat of every flying creeping thing that goeth upon all
four, which have legs above their feet, to leap withal upon the earth; 11:22
Even these of them ye may eat; the locust after his kind, and the bald locust
after his kind, and the beetle after his kind, and the grasshopper after his
kind.
11:23 But all other flying creeping things, which have four feet, shall be an
abomination unto you.
11:24 And for these ye shall be unclean: whosoever toucheth the carcase of them
shall be unclean until the even.`
// Noise chunk 2: Leviticus containing "bald" (near-duplicate of noise1
// without the 11:12 verse, simulating overlapping chunking).
noise2Raw := `11:13 And these are they which ye shall have in abomination among the fowls;
they shall not be eaten, they are an abomination: the eagle, and the ossifrage,
and the ospray, 11:14 And the vulture, and the kite after his kind; 11:15 Every
raven after his kind; 11:16 And the owl, and the night hawk, and the cuckow,
and the hawk after his kind, 11:17 And the little owl, and the cormorant, and
the great owl, 11:18 And the swan, and the pelican, and the gier eagle, 11:19
And the stork, the heron after her kind, and the lapwing, and the bat.
11:20 All fowls that creep, going upon all four, shall be an abomination unto
you.
11:21 Yet these may ye eat of every flying creeping thing that goeth upon all
four, which have legs above their feet, to leap withal upon the earth; 11:22
Even these of them ye may eat; the locust after his kind, and the bald locust
after his kind, and the beetle after his kind, and the grasshopper after his
kind.
11:23 But all other flying creeping things, which have four feet, shall be an
abomination unto you.
11:24 And for these ye shall be unclean: whosoever toucheth the carcase of them
shall be unclean until the even.`
// Additional Leviticus noise chunks (simulating 28 bald-related chunks)
// Using variations of the same text with different slugs
// NOTE(review): only 14 slugs are listed below, not 28 — confirm whether
// the comment is stale or more slugs were intended.
leviticusSlugs := []string{
"kjv_bible.epub_564_0",
"kjv_bible.epub_565_0",
"kjv_bible.epub_579_0",
"kjv_bible.epub_580_0",
"kjv_bible.epub_581_0",
"kjv_bible.epub_582_0",
"kjv_bible.epub_583_0",
"kjv_bible.epub_584_0",
"kjv_bible.epub_585_0",
"kjv_bible.epub_586_0",
"kjv_bible.epub_587_0",
"kjv_bible.epub_588_0",
"kjv_bible.epub_589_0",
"kjv_bible.epub_590_0",
}
// Texts are assigned to slugs round-robin (i % len(leviticusTexts)) below.
leviticusTexts := []string{
noise1Raw,
noise2Raw,
`13:40 And the man whose hair is fallen off his head, he is bald; yet is he
clean.
13:41 And he that hath his hair fallen off from the part of his head toward his
face, he is forehead bald; yet is he clean.`,
`13:42 And if there be in the bald head, or bald forehead, a white reddish sore;
it is a leprosy sprung up in his bald head, or his bald forehead.`,
`13:43 Then the priest shall look upon it: and, behold, if the rising of the
sore be white reddish in his bald head, or in his bald forehead, as the leprosy
appearedh in the skin of the flesh;`,
`13:44 He is a leprous man, he is unclean: the priest shall pronounce him utterly
unclean; his plague is in his head.`,
`13:45 And the leper in whom the plague is, his clothes shall be rent, and his
head bare, and he shall put a covering upon his upper lip, and shall cry,
Unclean, unclean.`,
`13:46 All the days wherein the plague shall be in him he shall be defiled; he
is unclean: he shall dwell alone; without the camp shall his habitation be.`,
`13:47 The garment also that the plague of leprosy is in, whether it be a woollen
garment, or a linen garment;`,
`13:48 Whether it be in the warp, or woof; of linen, or of woollen; whether in a
skin, or in any thing made of skin;`,
`13:49 And if the plague be greenish or reddish in the garment, or in the skin,
either in the warp, or in the woof, or in any thing of skin; it is a plague of
leprosy, and shall be shewed unto the priest:`,
`13:50 And the priest shall look upon the plague, and shut up it that hath the
plague seven days:`,
`13:51 And he shall look on the plague on the seventh day: if the plague be spread
in the garment, either in the warp, or in the woof, or in a skin, or in any work
that is made of skin; the plague is a fretting leprosy; it is unclean.`,
`13:52 He shall therefore burn that garment, whether warp or woof, in woollen or
in linen, or any thing of skin, wherein the plague is: for it is a fretting
leprosy; it shall be burnt in the fire.`,
}
// Unrelated chunk 1: ghost_7.txt_777_0 (different file, no "bald"/"bears").
unrelated1Raw := `Doesnt he have any pride as a hunter?!
I didnt see what other choice I had. I would just have to grovel and be ready to flee at any given moment.
The Hidden Curse clan house was in the central region of the imperial capital. It was a high-class area with extraordinary property values that hosted the residences of people like Lord Gladis. This district was near the Imperial Castle, though “near” was a
relative term as it was still a few kilometers away.
The clan house was made of brick and conformed to an older style of architecture.`
// Unrelated chunk 2: ghost_7.txt_778_0 (overlaps with chunk 1, as real
// overlapping chunking would produce).
unrelated2Raw := `I would just have to grovel and be ready to flee at any given moment.
The Hidden Curse clan house was in the central region of the imperial capital. It was a high-class area with extraordinary property values that hosted the residences of people like Lord Gladis. This district was near the Imperial Castle, though “near” was a
relative term as it was still a few kilometers away.
The clan house was made of brick and conformed to an older style of architecture. Nearly everyone knew about this mansion and its clock tower. It stood tall over the neighboring mansions and rumor had it that you could see the whole capital from the top. It
spoke to this clans renown and history that they were able to get away with building something that dwarfed the mansions of the nobility.`
chunks := []*models.VectorRow{
{
Slug: "kjv_bible.epub_1786_0",
RawText: targetRaw,
FileName: "kjv_bible.epub",
Embeddings: nil, // will be filled with zero vector later
},
}
// Add Leviticus noise chunks
for i, slug := range leviticusSlugs {
text := leviticusTexts[i%len(leviticusTexts)]
chunks = append(chunks, &models.VectorRow{
Slug: slug,
RawText: text,
FileName: "kjv_bible.epub",
Embeddings: nil,
})
}
// Add unrelated chunks
chunks = append(chunks,
&models.VectorRow{
Slug: "ghost_7.txt_777_0",
RawText: unrelated1Raw,
FileName: "ghost_7.txt",
Embeddings: nil,
},
&models.VectorRow{
Slug: "ghost_7.txt_778_0",
RawText: unrelated2Raw,
FileName: "ghost_7.txt",
Embeddings: nil,
},
)
return chunks
}
// assertTargetInTopN reports whether the target chunk
// (kjv_bible.epub_1786_0) appears within the first topN results.
func assertTargetInTopN(t *testing.T, results []models.VectorRow, topN int) bool {
	t.Helper()
	limit := topN
	if len(results) < limit {
		limit = len(results)
	}
	for _, row := range results[:limit] {
		if row.Slug == "kjv_bible.epub_1786_0" {
			return true
		}
	}
	return false
}
func TestBiblicalQuery(t *testing.T) {
chunks := createTestChunks()
rag, err := setupTestRAG(t, chunks)
if err != nil {
t.Fatalf("setup failed: %v", err)
}
query := "bald prophet and two she bears"
results, err := rag.Search(query, 10)
if err != nil {
t.Fatalf("search failed: %v", err)
}
// The target chunk should be in the top results.
if !assertTargetInTopN(t, results, 5) {
t.Errorf("target chunk not found in top 5 results for query %q", query)
t.Logf("results slugs: %v", func() []string {
slugs := make([]string, len(results))
for i, r := range results {
slugs[i] = r.Slug
}
return slugs
}())
}
}
func TestQueryVariations(t *testing.T) {
chunks := createTestChunks()
rag, err := setupTestRAG(t, chunks)
if err != nil {
t.Fatalf("setup failed: %v", err)
}
tests := []struct {
name string
query string
topN int
}{
{"she bears", "she bears", 5},
{"bald head", "bald head", 5},
{"two she bears out of the wood", "two she bears out of the wood", 5},
{"bald prophet", "bald prophet", 10},
{"go up thou bald head", "\"go up thou bald head\"", 5},
{"two she bears", "\"two she bears\"", 5},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
results, err := rag.Search(tt.query, 10)
if err != nil {
t.Fatalf("search failed: %v", err)
}
if !assertTargetInTopN(t, results, tt.topN) {
t.Errorf("target chunk not found in top %d results for query %q", tt.topN, tt.query)
t.Logf("results slugs: %v", func() []string {
slugs := make([]string, len(results))
for i, r := range results {
slugs[i] = r.Slug
}
return slugs
}())
}
})
}
}

131
rag/rag_real_test.go Normal file
View File

@@ -0,0 +1,131 @@
package rag
import (
	"io"
	"log/slog"
	"os"
	"path/filepath"
	"testing"

	"gf-lt/config"
	"gf-lt/storage"
)
// TestRealBiblicalQuery runs the full pipeline against the real ONNX embedder
// and the on-disk database. It is skipped in -short mode and when the model,
// tokenizer directory, or database file is absent.
func TestRealBiblicalQuery(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping real embedder test in short mode")
	}
	// Check if the embedder model exists.
	modelPath := filepath.Join("..", "onnx", "embedgemma", "model_q4.onnx")
	if _, err := os.Stat(modelPath); os.IsNotExist(err) {
		t.Skipf("embedder model not found at %s; skipping real embedder test", modelPath)
	}
	tokenizerPath := filepath.Join("..", "onnx", "embedgemma", "tokenizer.json")
	dbPath := filepath.Join("..", "gflt.db")
	if _, err := os.Stat(dbPath); os.IsNotExist(err) {
		t.Skipf("database not found at %s; skipping real embedder test", dbPath)
	}
	cfg := &config.Config{
		EmbedModelPath:     modelPath,
		EmbedTokenizerPath: tokenizerPath,
		EmbedDims:          768,
		RAGWordLimit:       250,
		RAGOverlapWords:    25,
		RAGBatchSize:       1,
	}
	// Discard log output. The previous nil io.Writer would make slog panic
	// on the first record written at or above the configured level.
	logger := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
	store := storage.NewProviderSQL(dbPath, logger)
	if store == nil {
		t.Fatal("failed to create storage provider")
	}
	rag, err := New(logger, store, cfg)
	if err != nil {
		t.Fatalf("failed to create RAG instance: %v", err)
	}
	t.Cleanup(func() { rag.Destroy() })
	query := "bald prophet and two she bears"
	results, err := rag.Search(query, 30)
	if err != nil {
		t.Fatalf("search failed: %v", err)
	}
	found := false
	for i, row := range results {
		if row.Slug == "kjv_bible.epub_1786_0" {
			found = true
			t.Logf("target chunk found at rank %d", i+1)
			break
		}
	}
	if !found {
		t.Errorf("target chunk not found in search results for query %q", query)
		t.Logf("results slugs:")
		for i, r := range results {
			t.Logf("%d: %s", i+1, r.Slug)
		}
	}
}
// TestRealQueryVariations runs a table of phrase queries against the real
// embedder and database, asserting the target chunk appears in the top 10.
// Skipped in -short mode and when model or database files are absent.
func TestRealQueryVariations(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping real embedder test in short mode")
	}
	modelPath := filepath.Join("..", "onnx", "embedgemma", "model_q4.onnx")
	if _, err := os.Stat(modelPath); os.IsNotExist(err) {
		t.Skipf("embedder model not found at %s; skipping real embedder test", modelPath)
	}
	tokenizerPath := filepath.Join("..", "onnx", "embedgemma", "tokenizer.json")
	dbPath := filepath.Join("..", "gflt.db")
	if _, err := os.Stat(dbPath); os.IsNotExist(err) {
		t.Skipf("database not found at %s; skipping real embedder test", dbPath)
	}
	cfg := &config.Config{
		EmbedModelPath:     modelPath,
		EmbedTokenizerPath: tokenizerPath,
		EmbedDims:          768,
		RAGWordLimit:       250,
		RAGOverlapWords:    25,
		RAGBatchSize:       1,
	}
	// Discard log output. The previous nil io.Writer would make slog panic
	// on the first record written at or above the configured level.
	logger := slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{Level: slog.LevelError}))
	store := storage.NewProviderSQL(dbPath, logger)
	if store == nil {
		t.Fatal("failed to create storage provider")
	}
	rag, err := New(logger, store, cfg)
	if err != nil {
		t.Fatalf("failed to create RAG instance: %v", err)
	}
	t.Cleanup(func() { rag.Destroy() })
	tests := []struct {
		name  string
		query string
	}{
		{"she bears", "she bears"},
		{"bald head", "bald head"},
		{"two she bears out of the wood", "two she bears out of the wood"},
		{"bald prophet", "bald prophet"},
		{"go up thou bald head", "\"go up thou bald head\""},
		{"two she bears", "\"two she bears\""},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			results, err := rag.Search(tt.query, 10)
			if err != nil {
				t.Fatalf("search failed: %v", err)
			}
			found := false
			for _, row := range results {
				if row.Slug == "kjv_bible.epub_1786_0" {
					found = true
					break
				}
			}
			if !found {
				t.Errorf("target chunk not found for query %q", tt.query)
				for i, r := range results {
					t.Logf("%d: %s", i+1, r.Slug)
				}
			}
		})
	}
}

155
rag/rag_test.go Normal file
View File

@@ -0,0 +1,155 @@
package rag
import (
"testing"
)
// TestDetectPhrases verifies phrase extraction for representative queries,
// including stop-word-only and too-short inputs that yield no phrases.
func TestDetectPhrases(t *testing.T) {
	cases := []struct {
		query string
		want  []string
	}{
		{
			query: "bald prophet and two she bears",
			want:  []string{"bald prophet", "two she", "two she bears", "she bears"},
		},
		{
			query: "she bears",
			want:  []string{"she bears"},
		},
		{
			query: "the quick brown fox",
			want:  []string{"quick brown", "quick brown fox", "brown fox"},
		},
		{
			// "in" and "the" are stop words, so nothing is detected.
			query: "in the house",
			want:  []string{},
		},
		{
			// Too short to contain any phrase.
			query: "a",
			want:  []string{},
		},
	}
	for _, tc := range cases {
		got := detectPhrases(tc.query)
		match := len(got) == len(tc.want)
		if match {
			for i := range got {
				if got[i] != tc.want[i] {
					match = false
					break
				}
			}
		}
		if !match {
			t.Errorf("detectPhrases(%q) = %v, want %v", tc.query, got, tc.want)
		}
	}
}
// TestCountPhraseMatches verifies how many detected query phrases
// appear in a given text.
func TestCountPhraseMatches(t *testing.T) {
	cases := []struct {
		text   string
		query  string
		expect int
	}{
		{"two she bears came out of the wood", "she bears", 1},
		// Only "she bears" matches this text.
		{"bald head and she bears", "bald prophet and two she bears", 1},
		{"no match here", "she bears", 0},
		// Both "she bears" and "bald prophet" are present.
		{"she bears and bald prophet", "bald prophet she bears", 2},
	}
	for _, tc := range cases {
		if got := countPhraseMatches(tc.text, tc.query); got != tc.expect {
			t.Errorf("countPhraseMatches(%q, %q) = %d, want %d", tc.text, tc.query, got, tc.expect)
		}
	}
}
// TestAreSlugsAdjacent verifies adjacency detection between chunk slugs,
// including the cross-batch heuristic and the different-file case.
func TestAreSlugsAdjacent(t *testing.T) {
	cases := []struct {
		slug1  string
		slug2  string
		expect bool
	}{
		{"kjv_bible.epub_1786_0", "kjv_bible.epub_1787_0", true},
		{"kjv_bible.epub_1787_0", "kjv_bible.epub_1786_0", true},
		{"kjv_bible.epub_1786_0", "kjv_bible.epub_1788_0", false},
		// Different files are never adjacent.
		{"otherfile.txt_1_0", "kjv_bible.epub_1786_0", false},
		{"file_1_0", "file_1_1", true},
		// Sequential batches with chunk index 0 count as adjacent.
		{"file_1_0", "file_2_0", true},
	}
	for _, tc := range cases {
		if got := areSlugsAdjacent(tc.slug1, tc.slug2); got != tc.expect {
			t.Errorf("areSlugsAdjacent(%q, %q) = %v, want %v", tc.slug1, tc.slug2, got, tc.expect)
		}
	}
}
// TestParseSlugIndices verifies batch/chunk extraction from slugs of the
// form filename_batch_chunk, plus malformed inputs that must fail.
func TestParseSlugIndices(t *testing.T) {
	cases := []struct {
		slug      string
		wantBatch int
		wantChunk int
		wantOk    bool
	}{
		{"kjv_bible.epub_1786_0", 1786, 0, true},
		{"file_1_5", 1, 5, true},
		{"no_underscore", 0, 0, false},
		{"file_abc_def", 0, 0, false},
		// Trailing non-numeric segment; indices are ignored when ok is false.
		{"file_123_456_extra", 456, 0, false},
	}
	for _, tc := range cases {
		batch, chunk, ok := parseSlugIndices(tc.slug)
		if ok != tc.wantOk {
			t.Errorf("parseSlugIndices(%q) ok = %v, want %v", tc.slug, ok, tc.wantOk)
			continue
		}
		if ok && (batch != tc.wantBatch || chunk != tc.wantChunk) {
			t.Errorf("parseSlugIndices(%q) = (%d, %d), want (%d, %d)", tc.slug, batch, chunk, tc.wantBatch, tc.wantChunk)
		}
	}
}

View File

@@ -340,11 +340,9 @@ func (vs *VectorStorage) scanRows(rows *sql.Rows) ([]models.VectorRow, error) {
continue
}
// Convert BM25 score to distance-like metric (lower is better)
// BM25 is negative, more negative is better. We'll normalize to positive distance.
distance := float32(-score) // Make positive (since score is negative)
if distance < 0 {
distance = 0
}
// BM25 is negative, more negative is better. Keep as negative.
distance := float32(score) // Keep negative, more negative is better
// No clamping needed; negative distances are fine
results = append(results, models.VectorRow{
Slug: slug,
RawText: rawText,