Fix (rag): epub load
This commit is contained in:
1
go.mod
1
go.mod
@@ -13,7 +13,6 @@ require (
|
|||||||
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
|
github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
|
||||||
github.com/jmoiron/sqlx v1.4.0
|
github.com/jmoiron/sqlx v1.4.0
|
||||||
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
|
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
|
||||||
github.com/n3integration/epub v0.2.0
|
|
||||||
github.com/neurosnap/sentences v1.1.2
|
github.com/neurosnap/sentences v1.1.2
|
||||||
github.com/rivo/tview v0.42.0
|
github.com/rivo/tview v0.42.0
|
||||||
github.com/yuin/goldmark v1.4.13
|
github.com/yuin/goldmark v1.4.13
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -53,8 +53,6 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE
|
|||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
|
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
|
||||||
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||||
github.com/n3integration/epub v0.2.0 h1:mJhgjKmAf0BeUvZ3ZsidvQ5P/E6LFdwNEhf+anP5wTg=
|
|
||||||
github.com/n3integration/epub v0.2.0/go.mod h1:qaomUgu8jrj09pjpTTY6S8+i225vR4QXJ9VAv0Dy0Ac=
|
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw=
|
github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw=
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package rag
|
package rag
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"archive/zip"
|
||||||
"bytes"
|
"bytes"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
@@ -12,7 +13,6 @@ import (
|
|||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/ledongthuc/pdf"
|
"github.com/ledongthuc/pdf"
|
||||||
"github.com/n3integration/epub"
|
|
||||||
"github.com/yuin/goldmark"
|
"github.com/yuin/goldmark"
|
||||||
"github.com/yuin/goldmark/extension"
|
"github.com/yuin/goldmark/extension"
|
||||||
"github.com/yuin/goldmark/parser"
|
"github.com/yuin/goldmark/parser"
|
||||||
@@ -90,26 +90,46 @@ func extractTextFromMarkdown(fpath string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func extractTextFromEpub(fpath string) (string, error) {
|
func extractTextFromEpub(fpath string) (string, error) {
|
||||||
book, err := epub.Open(fpath)
|
r, err := zip.OpenReader(fpath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("failed to open epub: %w", err)
|
return "", fmt.Errorf("failed to open epub: %w", err)
|
||||||
}
|
}
|
||||||
defer book.Close()
|
defer r.Close()
|
||||||
|
|
||||||
var sb strings.Builder
|
var sb strings.Builder
|
||||||
err = book.Each(func(title string, xhtml io.ReadCloser) {
|
|
||||||
|
for _, f := range r.File {
|
||||||
|
ext := strings.ToLower(path.Ext(f.Name))
|
||||||
|
if ext != ".xhtml" && ext != ".html" && ext != ".htm" && ext != ".xml" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip manifest, toc, ncx files - they don't contain book content
|
||||||
|
nameLower := strings.ToLower(f.Name)
|
||||||
|
if strings.Contains(nameLower, "toc") || strings.Contains(nameLower, "nav") ||
|
||||||
|
strings.Contains(nameLower, "manifest") || strings.Contains(nameLower, ".opf") ||
|
||||||
|
strings.HasSuffix(nameLower, ".ncx") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
rc, err := f.Open()
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
if sb.Len() > 0 {
|
if sb.Len() > 0 {
|
||||||
sb.WriteString("\n\n")
|
sb.WriteString("\n\n")
|
||||||
}
|
}
|
||||||
sb.WriteString(title)
|
sb.WriteString(f.Name)
|
||||||
sb.WriteString("\n")
|
sb.WriteString("\n")
|
||||||
buf, readErr := io.ReadAll(xhtml)
|
|
||||||
|
buf, readErr := io.ReadAll(rc)
|
||||||
|
rc.Close()
|
||||||
if readErr == nil {
|
if readErr == nil {
|
||||||
sb.WriteString(stripHTML(string(buf)))
|
sb.WriteString(stripHTML(string(buf)))
|
||||||
}
|
}
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("failed to iterate epub chapters: %w", err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if sb.Len() == 0 {
|
if sb.Len() == 0 {
|
||||||
return "", errors.New("no content extracted from epub")
|
return "", errors.New("no content extracted from epub")
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user