diff --git a/go.mod b/go.mod
index 21d9f52..c5b9a7a 100644
--- a/go.mod
+++ b/go.mod
@@ -6,20 +6,20 @@ require (
 	github.com/BurntSushi/toml v1.5.0
 	github.com/GrailFinder/google-translate-tts v0.1.3
 	github.com/GrailFinder/searchagent v0.2.0
+	github.com/PuerkitoBio/goquery v1.11.0
 	github.com/gdamore/tcell/v2 v2.13.2
 	github.com/glebarez/go-sqlite v1.22.0
 	github.com/gopxl/beep/v2 v2.1.1
 	github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b
-	github.com/huantt/plaintext-extractor v1.1.0
 	github.com/jmoiron/sqlx v1.4.0
 	github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
 	github.com/n3integration/epub v0.2.0
 	github.com/neurosnap/sentences v1.1.2
 	github.com/rivo/tview v0.42.0
+	github.com/yuin/goldmark v1.4.13
 )
 
 require (
-	github.com/PuerkitoBio/goquery v1.11.0 // indirect
 	github.com/andybalholm/cascadia v1.3.3 // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/ebitengine/oto/v3 v3.4.0 // indirect
diff --git a/go.sum b/go.sum
index 5fa3959..8fe858e 100644
--- a/go.sum
+++ b/go.sum
@@ -41,8 +41,6 @@ github.com/hajimehoshi/oto/v2 v2.3.1 h1:qrLKpNus2UfD674oxckKjNJmesp9hMh7u7QCrStB
 github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
 github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
 github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
-github.com/huantt/plaintext-extractor v1.1.0 h1:dZkJN0fGZf1o8x9UdR6hHqkZnqIwX94YlGJ/lSXUZ5c=
-github.com/huantt/plaintext-extractor v1.1.0/go.mod h1:zIIbG/hZnsnLgzDbZ2T8fOrA4SLGWCoHWWYZo0Anx9c=
 github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
 github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
 github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 h1:QwWKgMY28TAXaDl+ExRDqGQltzXqN/xypdKP86niVn8=
@@ -73,6 +71,7 @@ github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
 github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
 github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
diff --git a/rag/extractors.go b/rag/extractors.go
index fcc6a2a..1cf97af 100644
--- a/rag/extractors.go
+++ b/rag/extractors.go
@@ -10,21 +10,24 @@ import (
 	"path"
 	"strings"
 
-	"github.com/huantt/plaintext-extractor"
+	"github.com/PuerkitoBio/goquery"
 	"github.com/ledongthuc/pdf"
 	"github.com/n3integration/epub"
+	"github.com/yuin/goldmark"
+	"github.com/yuin/goldmark/extension"
+	"github.com/yuin/goldmark/parser"
+	"github.com/yuin/goldmark/renderer/html"
 )
 
 func ExtractText(fpath string) (string, error) {
 	ext := strings.ToLower(path.Ext(fpath))
-
 	switch ext {
 	case ".txt":
 		return extractTextFromFile(fpath)
 	case ".md", ".markdown":
 		return extractTextFromMarkdown(fpath)
 	case ".html", ".htm":
-		return extractTextFromHtml(fpath)
+		return extractTextFromHtmlFile(fpath)
 	case ".epub":
 		return extractTextFromEpub(fpath)
 	case ".pdf":
@@ -42,30 +45,52 @@ func extractTextFromFile(fpath string) (string, error) {
 	return string(data), nil
 }
 
+// extractTextFromHtmlFile reads the HTML file at fpath and returns its text
+// as produced by extractTextFromHtmlContent.
+func extractTextFromHtmlFile(fpath string) (string, error) {
+	data, err := os.ReadFile(fpath)
+	if err != nil {
+		return "", err
+	}
+	return extractTextFromHtmlContent(data)
+}
+
+// extractTextFromHtmlContent parses data as HTML, drops script, style and
+// noscript elements, and returns the remaining text with every run of
+// whitespace collapsed into a single space.
+//
+// NOTE(review): goquery expects UTF-8 input; documents in other encodings are
+// not transcoded here and may produce garbled text.
+func extractTextFromHtmlContent(data []byte) (string, error) {
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
+	if err != nil {
+		return "", err
+	}
+	// Drop elements whose content is not meant to be read as prose.
+	doc.Find("script, style, noscript").Remove()
+	// strings.Fields splits on any whitespace, so joining with one space
+	// flattens newlines, tabs and repeated spaces.
+	return strings.Join(strings.Fields(doc.Text()), " "), nil
+}
+
 func extractTextFromMarkdown(fpath string) (string, error) {
 	data, err := os.ReadFile(fpath)
 	if err != nil {
 		return "", err
 	}
-	extractor := plaintext.NewMarkdownExtractor()
-	text, err := extractor.PlainText(string(data))
-	if err != nil {
+	// Render the markdown to HTML first, then reuse the HTML text extractor.
+	md := goldmark.New(
+		goldmark.WithExtensions(extension.GFM),
+		goldmark.WithParserOptions(parser.WithAutoHeadingID()),
+		// Pass raw HTML embedded in the markdown through to the output so its
+		// text is extracted too; script/style are stripped afterwards.
+		goldmark.WithRendererOptions(html.WithUnsafe()),
+	)
+	var buf bytes.Buffer
+	if err := md.Convert(data, &buf); err != nil {
 		return "", err
 	}
-	return *text, nil
-}
-
-func extractTextFromHtml(fpath string) (string, error) {
-	data, err := os.ReadFile(fpath)
-	if err != nil {
-		return "", err
-	}
-	extractor := plaintext.NewHtmlExtractor()
-	text, err := extractor.PlainText(string(data))
-	if err != nil {
-		return "", err
-	}
-	return *text, nil
+	return extractTextFromHtmlContent(buf.Bytes())
 }
 
 func extractTextFromEpub(fpath string) (string, error) {
@@ -74,30 +99,24 @@ func extractTextFromEpub(fpath string) (string, error) {
 		return "", fmt.Errorf("failed to open epub: %w", err)
 	}
 	defer book.Close()
-
 	var sb strings.Builder
-
 	err = book.Each(func(title string, xhtml io.ReadCloser) {
 		if sb.Len() > 0 {
 			sb.WriteString("\n\n")
 		}
 		sb.WriteString(title)
 		sb.WriteString("\n")
-
 		buf, readErr := io.ReadAll(xhtml)
 		if readErr == nil {
 			sb.WriteString(stripHTML(string(buf)))
 		}
 	})
-
 	if err != nil {
 		return "", fmt.Errorf("failed to iterate epub chapters: %w", err)
 	}
-
 	if sb.Len() == 0 {
 		return "", errors.New("no content extracted from epub")
 	}
-
 	return sb.String(), nil
 }
 
@@ -127,7 +146,6 @@ func extractTextFromPdf(fpath string) (string, error) {
 			return string(out), nil
 		}
 	}
-
 	return extractTextFromPdfPureGo(fpath)
 }
 
@@ -137,17 +155,14 @@ func extractTextFromPdfPureGo(fpath string) (string, error) {
 		return "", fmt.Errorf("failed to open pdf: %w", err)
 	}
 	defer df.Close()
-
 	textReader, err := r.GetPlainText()
 	if err != nil {
 		return "", fmt.Errorf("failed to extract text from pdf: %w", err)
 	}
-
 	var buf bytes.Buffer
 	_, err = io.Copy(&buf, textReader)
 	if err != nil {
 		return "", fmt.Errorf("failed to read pdf text: %w", err)
 	}
-
 	return buf.String(), nil
 }