diff --git a/go.mod b/go.mod index 60b8526..21d9f52 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,10 @@ require ( github.com/glebarez/go-sqlite v1.22.0 github.com/gopxl/beep/v2 v2.1.1 github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b + github.com/huantt/plaintext-extractor v1.1.0 github.com/jmoiron/sqlx v1.4.0 + github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 + github.com/n3integration/epub v0.2.0 github.com/neurosnap/sentences v1.1.2 github.com/rivo/tview v0.42.0 ) diff --git a/go.sum b/go.sum index 1cf57af..5fa3959 100644 --- a/go.sum +++ b/go.sum @@ -41,8 +41,12 @@ github.com/hajimehoshi/oto/v2 v2.3.1 h1:qrLKpNus2UfD674oxckKjNJmesp9hMh7u7QCrStB github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/huantt/plaintext-extractor v1.1.0 h1:dZkJN0fGZf1o8x9UdR6hHqkZnqIwX94YlGJ/lSXUZ5c= +github.com/huantt/plaintext-extractor v1.1.0/go.mod h1:zIIbG/hZnsnLgzDbZ2T8fOrA4SLGWCoHWWYZo0Anx9c= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= +github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8= +github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag= @@ -51,6 +55,8 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +github.com/n3integration/epub v0.2.0 h1:mJhgjKmAf0BeUvZ3ZsidvQ5P/E6LFdwNEhf+anP5wTg= +github.com/n3integration/epub v0.2.0/go.mod h1:qaomUgu8jrj09pjpTTY6S8+i225vR4QXJ9VAv0Dy0Ac= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw= diff --git a/rag/extractors.go b/rag/extractors.go new file mode 100644 index 0000000..fcc6a2a --- /dev/null +++ b/rag/extractors.go @@ -0,0 +1,153 @@ +package rag + +import ( + "bytes" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path" + "strings" + + "github.com/huantt/plaintext-extractor" + "github.com/ledongthuc/pdf" + "github.com/n3integration/epub" +) + +func ExtractText(fpath string) (string, error) { + ext := strings.ToLower(path.Ext(fpath)) + + switch ext { + case ".txt": + return extractTextFromFile(fpath) + case ".md", ".markdown": + return extractTextFromMarkdown(fpath) + case ".html", ".htm": + return extractTextFromHtml(fpath) + case ".epub": + return extractTextFromEpub(fpath) + case ".pdf": + return extractTextFromPdf(fpath) + default: + return "", fmt.Errorf("unsupported file format: %s", ext) + } +} + +func extractTextFromFile(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + return string(data), nil +} + +func extractTextFromMarkdown(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + extractor := plaintext.NewMarkdownExtractor() + text, err := extractor.PlainText(string(data)) + if err != nil { + return "", err + } + return *text, nil +} + +func extractTextFromHtml(fpath string) (string, error) { + data, err := os.ReadFile(fpath) + if err != nil { + return "", err + } + extractor := plaintext.NewHtmlExtractor() + text, err := extractor.PlainText(string(data)) + if err != nil { + return "", err + } + return *text, nil +} + +func extractTextFromEpub(fpath string) (string, error) { + book, err := epub.Open(fpath) + if err != nil { + return "", fmt.Errorf("failed to open epub: %w", err) + } + defer book.Close() + + var sb strings.Builder + + err = book.Each(func(title string, xhtml io.ReadCloser) { + if sb.Len() > 0 { + sb.WriteString("\n\n") + } + sb.WriteString(title) + sb.WriteString("\n") + + buf, readErr := io.ReadAll(xhtml) + if readErr == nil { + sb.WriteString(stripHTML(string(buf))) + } + }) + + if err != nil { + return "", fmt.Errorf("failed to iterate epub chapters: %w", err) + } + + if sb.Len() == 0 { + return "", errors.New("no content extracted from epub") + } + + return sb.String(), nil +} + +func stripHTML(html string) string { + var sb strings.Builder + inTag := false + for _, r := range html { + switch r { + case '<': + inTag = true + case '>': + inTag = false + default: + if !inTag { + sb.WriteRune(r) + } + } + } + return sb.String() +} + +func extractTextFromPdf(fpath string) (string, error) { + _, err := exec.LookPath("pdftotext") + if err == nil { + out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output() + if err == nil && len(out) > 0 { + return string(out), nil + } + } + + return extractTextFromPdfPureGo(fpath) +} + +func extractTextFromPdfPureGo(fpath string) (string, error) { + df, r, err := pdf.Open(fpath) + if err != nil { + return "", fmt.Errorf("failed to open pdf: %w", err) + } + defer df.Close() + + textReader, err := r.GetPlainText() + if err != nil { + return "", fmt.Errorf("failed to extract text from pdf: %w", err) + } + + var buf bytes.Buffer + _, err = io.Copy(&buf, textReader) + if err != nil { + return "", fmt.Errorf("failed to read pdf text: %w", err) + } + + return buf.String(), nil +} diff --git a/rag/rag.go b/rag/rag.go index b49bd97..b8b5447 100644 --- a/rag/rag.go +++ b/rag/rag.go @@ -7,7 +7,6 @@ import ( "gf-lt/models" "gf-lt/storage" "log/slog" - "os" "path" "regexp" "sort" @@ -58,7 +57,7 @@ func wordCounter(sentence string) int { func (r *RAG) LoadRAG(fpath string) error { r.mu.Lock() defer r.mu.Unlock() - data, err := os.ReadFile(fpath) + fileText, err := ExtractText(fpath) if err != nil { return err } @@ -68,7 +67,6 @@ func (r *RAG) LoadRAG(fpath string) error { default: r.logger.Warn("LongJobStatusCh channel is full or closed, dropping status message", "message", LoadedFileRAGStatus) } - fileText := string(data) tokenizer, err := english.NewSentenceTokenizer(nil) if err != nil { return err