182 lines
4.2 KiB
Go
182 lines
4.2 KiB
Go
package rag
|
|
|
|
import (
	"archive/zip"
	"bytes"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/ledongthuc/pdf"
	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/renderer/html"
)
|
|
|
|
func ExtractText(fpath string) (string, error) {
|
|
ext := strings.ToLower(path.Ext(fpath))
|
|
switch ext {
|
|
case ".txt":
|
|
return extractTextFromFile(fpath)
|
|
case ".md", ".markdown":
|
|
return extractTextFromMarkdown(fpath)
|
|
case ".html", ".htm":
|
|
return extractTextFromHtmlFile(fpath)
|
|
case ".epub":
|
|
return extractTextFromEpub(fpath)
|
|
case ".pdf":
|
|
return extractTextFromPdf(fpath)
|
|
default:
|
|
return "", fmt.Errorf("unsupported file format: %s", ext)
|
|
}
|
|
}
|
|
|
|
// extractTextFromFile reads the whole file at fpath and returns its bytes
// converted to a string, with no further processing. A read error is
// returned as-is together with an empty string.
func extractTextFromFile(fpath string) (string, error) {
	contents, readErr := os.ReadFile(fpath)
	if readErr != nil {
		return "", readErr
	}
	return string(contents), nil
}
|
|
|
|
func extractTextFromHtmlFile(fpath string) (string, error) {
|
|
data, err := os.ReadFile(fpath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return extractTextFromHtmlContent(data)
|
|
}
|
|
|
|
// non utf-8 encoding?
|
|
func extractTextFromHtmlContent(data []byte) (string, error) {
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
// Remove script and style tags
|
|
doc.Find("script, style, noscript").Each(func(i int, s *goquery.Selection) {
|
|
s.Remove()
|
|
})
|
|
// Get text and clean it
|
|
text := doc.Text()
|
|
// Collapse all whitespace (newlines, tabs, multiple spaces) into single spaces
|
|
cleaned := strings.Join(strings.Fields(text), " ")
|
|
return cleaned, nil
|
|
}
|
|
|
|
func extractTextFromMarkdown(fpath string) (string, error) {
|
|
data, err := os.ReadFile(fpath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
// Convert markdown to HTML
|
|
md := goldmark.New(
|
|
goldmark.WithExtensions(extension.GFM),
|
|
goldmark.WithParserOptions(parser.WithAutoHeadingID()),
|
|
goldmark.WithRendererOptions(html.WithUnsafe()), // allow raw HTML if needed
|
|
)
|
|
var buf bytes.Buffer
|
|
if err := md.Convert(data, &buf); err != nil {
|
|
return "", err
|
|
}
|
|
// Now extract text from the resulting HTML (using goquery or similar)
|
|
return extractTextFromHtmlContent(buf.Bytes())
|
|
}
|
|
|
|
func extractTextFromEpub(fpath string) (string, error) {
|
|
r, err := zip.OpenReader(fpath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to open epub: %w", err)
|
|
}
|
|
defer r.Close()
|
|
var sb strings.Builder
|
|
for _, f := range r.File {
|
|
ext := strings.ToLower(path.Ext(f.Name))
|
|
if ext != ".xhtml" && ext != ".html" && ext != ".htm" && ext != ".xml" {
|
|
continue
|
|
}
|
|
|
|
// Skip manifest, toc, ncx files - they don't contain book content
|
|
nameLower := strings.ToLower(f.Name)
|
|
if strings.Contains(nameLower, "toc") || strings.Contains(nameLower, "nav") ||
|
|
strings.Contains(nameLower, "manifest") || strings.Contains(nameLower, ".opf") ||
|
|
strings.HasSuffix(nameLower, ".ncx") {
|
|
continue
|
|
}
|
|
|
|
rc, err := f.Open()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
if sb.Len() > 0 {
|
|
sb.WriteString("\n\n")
|
|
}
|
|
sb.WriteString(f.Name)
|
|
sb.WriteString("\n")
|
|
|
|
buf, readErr := io.ReadAll(rc)
|
|
rc.Close()
|
|
if readErr == nil {
|
|
sb.WriteString(stripHTML(string(buf)))
|
|
}
|
|
}
|
|
if sb.Len() == 0 {
|
|
return "", errors.New("no content extracted from epub")
|
|
}
|
|
return sb.String(), nil
|
|
}
|
|
|
|
// stripHTML returns html with every tag removed, where a tag is everything
// from a '<' up to and including the next '>'. All other characters are
// copied through unchanged, including a '>' that appears outside any tag
// (for example "a > b").
//
// This is a simple character scan, not an HTML parser: entities are not
// decoded and the text inside elements such as script or style is kept.
func stripHTML(html string) string {
	var sb strings.Builder
	sb.Grow(len(html)) // capacity hint; tags only ever shrink the text

	inTag := false
	for _, r := range html {
		switch {
		case r == '<':
			inTag = true
		case r == '>' && inTag:
			// Only a '>' that closes an open tag is markup. A bare '>' in
			// running text falls through to the copy case below.
			inTag = false
		case !inTag:
			sb.WriteRune(r)
		}
	}
	return sb.String()
}
|
|
|
|
func extractTextFromPdf(fpath string) (string, error) {
|
|
_, err := exec.LookPath("pdftotext")
|
|
if err == nil {
|
|
out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output()
|
|
if err == nil && len(out) > 0 {
|
|
return string(out), nil
|
|
}
|
|
}
|
|
return extractTextFromPdfPureGo(fpath)
|
|
}
|
|
|
|
func extractTextFromPdfPureGo(fpath string) (string, error) {
|
|
df, r, err := pdf.Open(fpath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to open pdf: %w", err)
|
|
}
|
|
defer df.Close()
|
|
textReader, err := r.GetPlainText()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to extract text from pdf: %w", err)
|
|
}
|
|
var buf bytes.Buffer
|
|
_, err = io.Copy(&buf, textReader)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read pdf text: %w", err)
|
|
}
|
|
return buf.String(), nil
|
|
}
|