// gf-lt/rag/rag_test.go

package rag

import (
	"testing"
)

// TestDetectPhrases checks detectPhrases against a table of queries, each
// paired with the exact, ordered list of phrases it is expected to yield.
//
// Every table entry runs as its own subtest (named after the query) so that a
// failure is attributed to the offending case and can be re-run in isolation
// with `go test -run`.
func TestDetectPhrases(t *testing.T) {
	tests := []struct {
		query  string
		expect []string
	}{
		{
			query:  "bald prophet and two she bears",
			expect: []string{"bald prophet", "two she", "two she bears", "she bears"},
		},
		{
			query:  "she bears",
			expect: []string{"she bears"},
		},
		{
			query:  "the quick brown fox",
			expect: []string{"quick brown", "quick brown fox", "brown fox"},
		},
		{
			query:  "in the house", // stop words
			expect: []string{},     // "in" and "the" are stop words
		},
		{
			query:  "a", // short
			expect: []string{},
		},
	}

	for _, tt := range tests {
		// The subtest body runs synchronously (no t.Parallel), so capturing
		// the loop variable is safe regardless of the Go version in use.
		t.Run(tt.query, func(t *testing.T) {
			got := detectPhrases(tt.query)

			// Compare lengths first: this treats a nil result and an empty
			// slice as equal, which a reflect.DeepEqual comparison would not.
			if len(got) != len(tt.expect) {
				t.Errorf("detectPhrases(%q) = %v, want %v", tt.query, got, tt.expect)
				return
			}
			for i := range got {
				if got[i] != tt.expect[i] {
					// Report the whole slice once and stop at the first
					// mismatch instead of emitting one error per element.
					t.Errorf("detectPhrases(%q) = %v, want %v", tt.query, got, tt.expect)
					return
				}
			}
		})
	}
}

// TestCountPhraseMatches checks the integer returned by countPhraseMatches for
// a table of (text, query) pairs.
//
// Each entry runs as a named subtest so a failure points at a single case and
// can be selected individually with `go test -run`.
func TestCountPhraseMatches(t *testing.T) {
	tests := []struct {
		name   string
		text   string
		query  string
		expect int
	}{
		{
			name:   "single phrase present",
			text:   "two she bears came out of the wood",
			query:  "she bears",
			expect: 1,
		},
		{
			name:   "only one query phrase present",
			text:   "bald head and she bears",
			query:  "bald prophet and two she bears",
			expect: 1, // only "she bears" matches
		},
		{
			name:   "no phrase present",
			text:   "no match here",
			query:  "she bears",
			expect: 0,
		},
		{
			name:   "both phrases present in swapped order",
			text:   "she bears and bald prophet",
			query:  "bald prophet she bears",
			expect: 2, // "she bears" and "bald prophet"
		},
	}

	for _, tt := range tests {
		// The subtest body runs synchronously (no t.Parallel), so capturing
		// the loop variable is safe regardless of the Go version in use.
		t.Run(tt.name, func(t *testing.T) {
			got := countPhraseMatches(tt.text, tt.query)
			if got != tt.expect {
				t.Errorf("countPhraseMatches(%q, %q) = %d, want %d", tt.text, tt.query, got, tt.expect)
			}
		})
	}
}

// TestAreSlugsAdjacent checks the boolean returned by areSlugsAdjacent for a
// table of slug pairs. The slugs used here have the shape
// "<prefix>_<number>_<number>".
//
// Each entry runs as a named subtest so a failure points at a single case and
// can be selected individually with `go test -run`.
func TestAreSlugsAdjacent(t *testing.T) {
	tests := []struct {
		name   string
		slug1  string
		slug2  string
		expect bool
	}{
		{
			name:   "first number differs by one",
			slug1:  "kjv_bible.epub_1786_0",
			slug2:  "kjv_bible.epub_1787_0",
			expect: true,
		},
		{
			name:   "first number differs by one, arguments swapped",
			slug1:  "kjv_bible.epub_1787_0",
			slug2:  "kjv_bible.epub_1786_0",
			expect: true,
		},
		{
			name:   "first number differs by two",
			slug1:  "kjv_bible.epub_1786_0",
			slug2:  "kjv_bible.epub_1788_0",
			expect: false,
		},
		{
			name:   "different prefix",
			slug1:  "otherfile.txt_1_0",
			slug2:  "kjv_bible.epub_1786_0",
			expect: false,
		},
		{
			name:   "second number differs by one",
			slug1:  "file_1_0",
			slug2:  "file_1_1",
			expect: true,
		},
		{
			name:   "sequential batches, same chunk index",
			slug1:  "file_1_0",
			slug2:  "file_2_0", // different batch
			expect: true,       // sequential batches with same chunk index are adjacent
		},
	}

	for _, tt := range tests {
		// The subtest body runs synchronously (no t.Parallel), so capturing
		// the loop variable is safe regardless of the Go version in use.
		t.Run(tt.name, func(t *testing.T) {
			got := areSlugsAdjacent(tt.slug1, tt.slug2)
			if got != tt.expect {
				t.Errorf("areSlugsAdjacent(%q, %q) = %v, want %v", tt.slug1, tt.slug2, got, tt.expect)
			}
		})
	}
}

// TestParseSlugIndices checks the (batch, chunk, ok) triple returned by
// parseSlugIndices for a table of slugs.
//
// The batch and chunk values are only compared when the call is expected to
// succeed; for rejected slugs only the ok flag is asserted, so wantBatch and
// wantChunk are left at zero in those rows.
//
// Each entry runs as a subtest named after the slug so a failure points at a
// single case and can be selected individually with `go test -run`.
func TestParseSlugIndices(t *testing.T) {
	tests := []struct {
		slug      string
		wantBatch int
		wantChunk int
		wantOk    bool
	}{
		{"kjv_bible.epub_1786_0", 1786, 0, true},
		{"file_1_5", 1, 5, true},
		{"no_underscore", 0, 0, false},
		{"file_abc_def", 0, 0, false},
		{"file_123_456_extra", 0, 0, false}, // numbers followed by a non-numeric segment: expected to be rejected
	}

	for _, tt := range tests {
		// The subtest body runs synchronously (no t.Parallel), so capturing
		// the loop variable is safe regardless of the Go version in use.
		t.Run(tt.slug, func(t *testing.T) {
			batch, chunk, ok := parseSlugIndices(tt.slug)
			if ok != tt.wantOk {
				t.Errorf("parseSlugIndices(%q) ok = %v, want %v", tt.slug, ok, tt.wantOk)
				return
			}
			// The returned indices are not asserted for a rejected slug.
			if !ok {
				return
			}
			if batch != tt.wantBatch || chunk != tt.wantChunk {
				t.Errorf("parseSlugIndices(%q) = (%d, %d), want (%d, %d)", tt.slug, batch, chunk, tt.wantBatch, tt.wantChunk)
			}
		})
	}
}