Files
gf-lt/rag/extractors.go
2026-02-25 07:51:24 +03:00

165 lines
3.8 KiB
Go

package rag
import (
"bytes"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/ledongthuc/pdf"
"github.com/n3integration/epub"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/renderer/html"
)
func ExtractText(fpath string) (string, error) {
ext := strings.ToLower(path.Ext(fpath))
switch ext {
case ".txt":
return extractTextFromFile(fpath)
case ".md", ".markdown":
return extractTextFromMarkdown(fpath)
case ".html", ".htm":
return extractTextFromHtmlFile(fpath)
case ".epub":
return extractTextFromEpub(fpath)
case ".pdf":
return extractTextFromPdf(fpath)
default:
return "", fmt.Errorf("unsupported file format: %s", ext)
}
}
func extractTextFromFile(fpath string) (string, error) {
data, err := os.ReadFile(fpath)
if err != nil {
return "", err
}
return string(data), nil
}
func extractTextFromHtmlFile(fpath string) (string, error) {
data, err := os.ReadFile(fpath)
if err != nil {
return "", err
}
return extractTextFromHtmlContent(data)
}
// non utf-8 encoding?
func extractTextFromHtmlContent(data []byte) (string, error) {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
if err != nil {
return "", err
}
// Remove script and style tags
doc.Find("script, style, noscript").Each(func(i int, s *goquery.Selection) {
s.Remove()
})
// Get text and clean it
text := doc.Text()
// Collapse all whitespace (newlines, tabs, multiple spaces) into single spaces
cleaned := strings.Join(strings.Fields(text), " ")
return cleaned, nil
}
func extractTextFromMarkdown(fpath string) (string, error) {
data, err := os.ReadFile(fpath)
if err != nil {
return "", err
}
// Convert markdown to HTML
md := goldmark.New(
goldmark.WithExtensions(extension.GFM),
goldmark.WithParserOptions(parser.WithAutoHeadingID()),
goldmark.WithRendererOptions(html.WithUnsafe()), // allow raw HTML if needed
)
var buf bytes.Buffer
if err := md.Convert(data, &buf); err != nil {
return "", err
}
// Now extract text from the resulting HTML (using goquery or similar)
return extractTextFromHtmlContent(buf.Bytes())
}
func extractTextFromEpub(fpath string) (string, error) {
book, err := epub.Open(fpath)
if err != nil {
return "", fmt.Errorf("failed to open epub: %w", err)
}
defer book.Close()
var sb strings.Builder
err = book.Each(func(title string, xhtml io.ReadCloser) {
if sb.Len() > 0 {
sb.WriteString("\n\n")
}
sb.WriteString(title)
sb.WriteString("\n")
buf, readErr := io.ReadAll(xhtml)
if readErr == nil {
sb.WriteString(stripHTML(string(buf)))
}
})
if err != nil {
return "", fmt.Errorf("failed to iterate epub chapters: %w", err)
}
if sb.Len() == 0 {
return "", errors.New("no content extracted from epub")
}
return sb.String(), nil
}
func stripHTML(html string) string {
var sb strings.Builder
inTag := false
for _, r := range html {
switch r {
case '<':
inTag = true
case '>':
inTag = false
default:
if !inTag {
sb.WriteRune(r)
}
}
}
return sb.String()
}
func extractTextFromPdf(fpath string) (string, error) {
_, err := exec.LookPath("pdftotext")
if err == nil {
out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output()
if err == nil && len(out) > 0 {
return string(out), nil
}
}
return extractTextFromPdfPureGo(fpath)
}
func extractTextFromPdfPureGo(fpath string) (string, error) {
df, r, err := pdf.Open(fpath)
if err != nil {
return "", fmt.Errorf("failed to open pdf: %w", err)
}
defer df.Close()
textReader, err := r.GetPlainText()
if err != nil {
return "", fmt.Errorf("failed to extract text from pdf: %w", err)
}
var buf bytes.Buffer
_, err = io.Copy(&buf, textReader)
if err != nil {
return "", fmt.Errorf("failed to read pdf text: %w", err)
}
return buf.String(), nil
}