Dep (rag): better extractors
This commit is contained in:
@@ -10,21 +10,24 @@ import (
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/huantt/plaintext-extractor"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/ledongthuc/pdf"
|
||||
"github.com/n3integration/epub"
|
||||
"github.com/yuin/goldmark"
|
||||
"github.com/yuin/goldmark/extension"
|
||||
"github.com/yuin/goldmark/parser"
|
||||
"github.com/yuin/goldmark/renderer/html"
|
||||
)
|
||||
|
||||
func ExtractText(fpath string) (string, error) {
|
||||
ext := strings.ToLower(path.Ext(fpath))
|
||||
|
||||
switch ext {
|
||||
case ".txt":
|
||||
return extractTextFromFile(fpath)
|
||||
case ".md", ".markdown":
|
||||
return extractTextFromMarkdown(fpath)
|
||||
case ".html", ".htm":
|
||||
return extractTextFromHtml(fpath)
|
||||
return extractTextFromHtmlFile(fpath)
|
||||
case ".epub":
|
||||
return extractTextFromEpub(fpath)
|
||||
case ".pdf":
|
||||
@@ -42,30 +45,48 @@ func extractTextFromFile(fpath string) (string, error) {
|
||||
return string(data), nil
|
||||
}
|
||||
|
||||
func extractTextFromHtmlFile(fpath string) (string, error) {
|
||||
data, err := os.ReadFile(fpath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return extractTextFromHtmlContent(data)
|
||||
}
|
||||
|
||||
// non utf-8 encoding?
|
||||
func extractTextFromHtmlContent(data []byte) (string, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Remove script and style tags
|
||||
doc.Find("script, style, noscript").Each(func(i int, s *goquery.Selection) {
|
||||
s.Remove()
|
||||
})
|
||||
// Get text and clean it
|
||||
text := doc.Text()
|
||||
// Collapse all whitespace (newlines, tabs, multiple spaces) into single spaces
|
||||
cleaned := strings.Join(strings.Fields(text), " ")
|
||||
return cleaned, nil
|
||||
}
|
||||
|
||||
func extractTextFromMarkdown(fpath string) (string, error) {
|
||||
data, err := os.ReadFile(fpath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
extractor := plaintext.NewMarkdownExtractor()
|
||||
text, err := extractor.PlainText(string(data))
|
||||
if err != nil {
|
||||
// Convert markdown to HTML
|
||||
md := goldmark.New(
|
||||
goldmark.WithExtensions(extension.GFM),
|
||||
goldmark.WithParserOptions(parser.WithAutoHeadingID()),
|
||||
goldmark.WithRendererOptions(html.WithUnsafe()), // allow raw HTML if needed
|
||||
)
|
||||
var buf bytes.Buffer
|
||||
if err := md.Convert(data, &buf); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return *text, nil
|
||||
}
|
||||
|
||||
func extractTextFromHtml(fpath string) (string, error) {
|
||||
data, err := os.ReadFile(fpath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
extractor := plaintext.NewHtmlExtractor()
|
||||
text, err := extractor.PlainText(string(data))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return *text, nil
|
||||
// Now extract text from the resulting HTML (using goquery or similar)
|
||||
return extractTextFromHtmlContent(buf.Bytes())
|
||||
}
|
||||
|
||||
func extractTextFromEpub(fpath string) (string, error) {
|
||||
@@ -74,30 +95,24 @@ func extractTextFromEpub(fpath string) (string, error) {
|
||||
return "", fmt.Errorf("failed to open epub: %w", err)
|
||||
}
|
||||
defer book.Close()
|
||||
|
||||
var sb strings.Builder
|
||||
|
||||
err = book.Each(func(title string, xhtml io.ReadCloser) {
|
||||
if sb.Len() > 0 {
|
||||
sb.WriteString("\n\n")
|
||||
}
|
||||
sb.WriteString(title)
|
||||
sb.WriteString("\n")
|
||||
|
||||
buf, readErr := io.ReadAll(xhtml)
|
||||
if readErr == nil {
|
||||
sb.WriteString(stripHTML(string(buf)))
|
||||
}
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to iterate epub chapters: %w", err)
|
||||
}
|
||||
|
||||
if sb.Len() == 0 {
|
||||
return "", errors.New("no content extracted from epub")
|
||||
}
|
||||
|
||||
return sb.String(), nil
|
||||
}
|
||||
|
||||
@@ -127,7 +142,6 @@ func extractTextFromPdf(fpath string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
}
|
||||
|
||||
return extractTextFromPdfPureGo(fpath)
|
||||
}
|
||||
|
||||
@@ -137,17 +151,14 @@ func extractTextFromPdfPureGo(fpath string) (string, error) {
|
||||
return "", fmt.Errorf("failed to open pdf: %w", err)
|
||||
}
|
||||
defer df.Close()
|
||||
|
||||
textReader, err := r.GetPlainText()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to extract text from pdf: %w", err)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
_, err = io.Copy(&buf, textReader)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read pdf text: %w", err)
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user