diff --git a/go.mod b/go.mod index c5b9a7a..8753052 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,6 @@ require ( github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b github.com/jmoiron/sqlx v1.4.0 github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 - github.com/n3integration/epub v0.2.0 github.com/neurosnap/sentences v1.1.2 github.com/rivo/tview v0.42.0 github.com/yuin/goldmark v1.4.13 diff --git a/go.sum b/go.sum index 8fe858e..2e32cfc 100644 --- a/go.sum +++ b/go.sum @@ -53,8 +53,6 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/n3integration/epub v0.2.0 h1:mJhgjKmAf0BeUvZ3ZsidvQ5P/E6LFdwNEhf+anP5wTg= -github.com/n3integration/epub v0.2.0/go.mod h1:qaomUgu8jrj09pjpTTY6S8+i225vR4QXJ9VAv0Dy0Ac= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw= diff --git a/rag/extractors.go b/rag/extractors.go index 1cf97af..4255fdb 100644 --- a/rag/extractors.go +++ b/rag/extractors.go @@ -1,6 +1,7 @@ package rag import ( + "archive/zip" "bytes" "errors" "fmt" @@ -12,7 +13,6 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/ledongthuc/pdf" - "github.com/n3integration/epub" "github.com/yuin/goldmark" "github.com/yuin/goldmark/extension" "github.com/yuin/goldmark/parser" @@ -90,26 +90,46 @@ func extractTextFromMarkdown(fpath string) (string, error) { } func extractTextFromEpub(fpath string) (string, error) { - book, err := epub.Open(fpath) + r, err := zip.OpenReader(fpath) if err != nil { return "", fmt.Errorf("failed to open epub: %w", err) } - defer book.Close() + defer r.Close() + var sb strings.Builder - err = book.Each(func(title string, xhtml io.ReadCloser) { + + for _, f := range r.File { + ext := strings.ToLower(path.Ext(f.Name)) + if ext != ".xhtml" && ext != ".html" && ext != ".htm" && ext != ".xml" { + continue + } + + // Skip manifest, toc, ncx files - they don't contain book content + nameLower := strings.ToLower(f.Name) + if strings.Contains(nameLower, "toc") || strings.Contains(nameLower, "nav") || + strings.Contains(nameLower, "manifest") || strings.Contains(nameLower, ".opf") || + strings.HasSuffix(nameLower, ".ncx") { + continue + } + + rc, err := f.Open() + if err != nil { + continue + } + if sb.Len() > 0 { sb.WriteString("\n\n") } - sb.WriteString(title) + sb.WriteString(f.Name) sb.WriteString("\n") - buf, readErr := io.ReadAll(xhtml) + + buf, readErr := io.ReadAll(rc) + rc.Close() if readErr == nil { sb.WriteString(stripHTML(string(buf))) } - }) - if err != nil { - return "", fmt.Errorf("failed to iterate epub chapters: %w", err) } + if sb.Len() == 0 { return "", errors.New("no content extracted from epub") }