Feat: rag text extractors
This commit is contained in:
3
go.mod
3
go.mod
@@ -10,7 +10,10 @@ require (
|
|||||||
github.com/glebarez/go-sqlite v1.22.0
|
github.com/glebarez/go-sqlite v1.22.0
|
||||||
github.com/gopxl/beep/v2 v2.1.1
|
github.com/gopxl/beep/v2 v2.1.1
|
||||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
|
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
|
||||||
|
github.com/huantt/plaintext-extractor v1.1.0
|
||||||
github.com/jmoiron/sqlx v1.4.0
|
github.com/jmoiron/sqlx v1.4.0
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
|
||||||
|
github.com/n3integration/epub v0.2.0
|
||||||
github.com/neurosnap/sentences v1.1.2
|
github.com/neurosnap/sentences v1.1.2
|
||||||
github.com/rivo/tview v0.42.0
|
github.com/rivo/tview v0.42.0
|
||||||
)
|
)
|
||||||
|
|||||||
6
go.sum
6
go.sum
@@ -41,8 +41,12 @@ github.com/hajimehoshi/oto/v2 v2.3.1 h1:qrLKpNus2UfD674oxckKjNJmesp9hMh7u7QCrStB
|
|||||||
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
|
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
|
||||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
|
github.com/huantt/plaintext-extractor v1.1.0 h1:dZkJN0fGZf1o8x9UdR6hHqkZnqIwX94YlGJ/lSXUZ5c=
|
||||||
|
github.com/huantt/plaintext-extractor v1.1.0/go.mod h1:zIIbG/hZnsnLgzDbZ2T8fOrA4SLGWCoHWWYZo0Anx9c=
|
||||||
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
|
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
|
||||||
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
|
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8=
|
||||||
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
|
||||||
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
|
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
|
||||||
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
|
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
|
||||||
github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
|
github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
|
||||||
@@ -51,6 +55,8 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE
|
|||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
|
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
|
||||||
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||||
|
github.com/n3integration/epub v0.2.0 h1:mJhgjKmAf0BeUvZ3ZsidvQ5P/E6LFdwNEhf+anP5wTg=
|
||||||
|
github.com/n3integration/epub v0.2.0/go.mod h1:qaomUgu8jrj09pjpTTY6S8+i225vR4QXJ9VAv0Dy0Ac=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw=
|
github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw=
|
||||||
|
|||||||
153
rag/extractors.go
Normal file
153
rag/extractors.go
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
package rag
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/huantt/plaintext-extractor"
|
||||||
|
"github.com/ledongthuc/pdf"
|
||||||
|
"github.com/n3integration/epub"
|
||||||
|
)
|
||||||
|
|
||||||
|
func ExtractText(fpath string) (string, error) {
|
||||||
|
ext := strings.ToLower(path.Ext(fpath))
|
||||||
|
|
||||||
|
switch ext {
|
||||||
|
case ".txt":
|
||||||
|
return extractTextFromFile(fpath)
|
||||||
|
case ".md", ".markdown":
|
||||||
|
return extractTextFromMarkdown(fpath)
|
||||||
|
case ".html", ".htm":
|
||||||
|
return extractTextFromHtml(fpath)
|
||||||
|
case ".epub":
|
||||||
|
return extractTextFromEpub(fpath)
|
||||||
|
case ".pdf":
|
||||||
|
return extractTextFromPdf(fpath)
|
||||||
|
default:
|
||||||
|
return "", fmt.Errorf("unsupported file format: %s", ext)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractTextFromFile(fpath string) (string, error) {
|
||||||
|
data, err := os.ReadFile(fpath)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return string(data), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractTextFromMarkdown(fpath string) (string, error) {
|
||||||
|
data, err := os.ReadFile(fpath)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
extractor := plaintext.NewMarkdownExtractor()
|
||||||
|
text, err := extractor.PlainText(string(data))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return *text, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractTextFromHtml(fpath string) (string, error) {
|
||||||
|
data, err := os.ReadFile(fpath)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
extractor := plaintext.NewHtmlExtractor()
|
||||||
|
text, err := extractor.PlainText(string(data))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return *text, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractTextFromEpub(fpath string) (string, error) {
|
||||||
|
book, err := epub.Open(fpath)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to open epub: %w", err)
|
||||||
|
}
|
||||||
|
defer book.Close()
|
||||||
|
|
||||||
|
var sb strings.Builder
|
||||||
|
|
||||||
|
err = book.Each(func(title string, xhtml io.ReadCloser) {
|
||||||
|
if sb.Len() > 0 {
|
||||||
|
sb.WriteString("\n\n")
|
||||||
|
}
|
||||||
|
sb.WriteString(title)
|
||||||
|
sb.WriteString("\n")
|
||||||
|
|
||||||
|
buf, readErr := io.ReadAll(xhtml)
|
||||||
|
if readErr == nil {
|
||||||
|
sb.WriteString(stripHTML(string(buf)))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to iterate epub chapters: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if sb.Len() == 0 {
|
||||||
|
return "", errors.New("no content extracted from epub")
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.String(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func stripHTML(html string) string {
|
||||||
|
var sb strings.Builder
|
||||||
|
inTag := false
|
||||||
|
for _, r := range html {
|
||||||
|
switch r {
|
||||||
|
case '<':
|
||||||
|
inTag = true
|
||||||
|
case '>':
|
||||||
|
inTag = false
|
||||||
|
default:
|
||||||
|
if !inTag {
|
||||||
|
sb.WriteRune(r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractTextFromPdf(fpath string) (string, error) {
|
||||||
|
_, err := exec.LookPath("pdftotext")
|
||||||
|
if err == nil {
|
||||||
|
out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output()
|
||||||
|
if err == nil && len(out) > 0 {
|
||||||
|
return string(out), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return extractTextFromPdfPureGo(fpath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractTextFromPdfPureGo(fpath string) (string, error) {
|
||||||
|
df, r, err := pdf.Open(fpath)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to open pdf: %w", err)
|
||||||
|
}
|
||||||
|
defer df.Close()
|
||||||
|
|
||||||
|
textReader, err := r.GetPlainText()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to extract text from pdf: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var buf bytes.Buffer
|
||||||
|
_, err = io.Copy(&buf, textReader)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to read pdf text: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return buf.String(), nil
|
||||||
|
}
|
||||||
@@ -7,7 +7,6 @@ import (
|
|||||||
"gf-lt/models"
|
"gf-lt/models"
|
||||||
"gf-lt/storage"
|
"gf-lt/storage"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
|
||||||
"path"
|
"path"
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -58,7 +57,7 @@ func wordCounter(sentence string) int {
|
|||||||
func (r *RAG) LoadRAG(fpath string) error {
|
func (r *RAG) LoadRAG(fpath string) error {
|
||||||
r.mu.Lock()
|
r.mu.Lock()
|
||||||
defer r.mu.Unlock()
|
defer r.mu.Unlock()
|
||||||
data, err := os.ReadFile(fpath)
|
fileText, err := ExtractText(fpath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -68,7 +67,6 @@ func (r *RAG) LoadRAG(fpath string) error {
|
|||||||
default:
|
default:
|
||||||
r.logger.Warn("LongJobStatusCh channel is full or closed, dropping status message", "message", LoadedFileRAGStatus)
|
r.logger.Warn("LongJobStatusCh channel is full or closed, dropping status message", "message", LoadedFileRAGStatus)
|
||||||
}
|
}
|
||||||
fileText := string(data)
|
|
||||||
tokenizer, err := english.NewSentenceTokenizer(nil)
|
tokenizer, err := english.NewSentenceTokenizer(nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|||||||
Reference in New Issue
Block a user