package rag import ( "bytes" "errors" "fmt" "io" "os" "os/exec" "path" "strings" "github.com/PuerkitoBio/goquery" "github.com/ledongthuc/pdf" "github.com/n3integration/epub" "github.com/yuin/goldmark" "github.com/yuin/goldmark/extension" "github.com/yuin/goldmark/parser" "github.com/yuin/goldmark/renderer/html" ) func ExtractText(fpath string) (string, error) { ext := strings.ToLower(path.Ext(fpath)) switch ext { case ".txt": return extractTextFromFile(fpath) case ".md", ".markdown": return extractTextFromMarkdown(fpath) case ".html", ".htm": return extractTextFromHtmlFile(fpath) case ".epub": return extractTextFromEpub(fpath) case ".pdf": return extractTextFromPdf(fpath) default: return "", fmt.Errorf("unsupported file format: %s", ext) } } func extractTextFromFile(fpath string) (string, error) { data, err := os.ReadFile(fpath) if err != nil { return "", err } return string(data), nil } func extractTextFromHtmlFile(fpath string) (string, error) { data, err := os.ReadFile(fpath) if err != nil { return "", err } return extractTextFromHtmlContent(data) } // non utf-8 encoding? func extractTextFromHtmlContent(data []byte) (string, error) { doc, err := goquery.NewDocumentFromReader(bytes.NewReader(data)) if err != nil { return "", err } // Remove script and style tags doc.Find("script, style, noscript").Each(func(i int, s *goquery.Selection) { s.Remove() }) // Get text and clean it text := doc.Text() // Collapse all whitespace (newlines, tabs, multiple spaces) into single spaces cleaned := strings.Join(strings.Fields(text), " ") return cleaned, nil } func extractTextFromMarkdown(fpath string) (string, error) { data, err := os.ReadFile(fpath) if err != nil { return "", err } // Convert markdown to HTML md := goldmark.New( goldmark.WithExtensions(extension.GFM), goldmark.WithParserOptions(parser.WithAutoHeadingID()), goldmark.WithRendererOptions(html.WithUnsafe()), // allow raw HTML if needed ) var buf bytes.Buffer if err := md.Convert(data, &buf); err != nil { return "", err } // Now extract text from the resulting HTML (using goquery or similar) return extractTextFromHtmlContent(buf.Bytes()) } func extractTextFromEpub(fpath string) (string, error) { book, err := epub.Open(fpath) if err != nil { return "", fmt.Errorf("failed to open epub: %w", err) } defer book.Close() var sb strings.Builder err = book.Each(func(title string, xhtml io.ReadCloser) { if sb.Len() > 0 { sb.WriteString("\n\n") } sb.WriteString(title) sb.WriteString("\n") buf, readErr := io.ReadAll(xhtml) if readErr == nil { sb.WriteString(stripHTML(string(buf))) } }) if err != nil { return "", fmt.Errorf("failed to iterate epub chapters: %w", err) } if sb.Len() == 0 { return "", errors.New("no content extracted from epub") } return sb.String(), nil } func stripHTML(html string) string { var sb strings.Builder inTag := false for _, r := range html { switch r { case '<': inTag = true case '>': inTag = false default: if !inTag { sb.WriteRune(r) } } } return sb.String() } func extractTextFromPdf(fpath string) (string, error) { _, err := exec.LookPath("pdftotext") if err == nil { out, err := exec.Command("pdftotext", "-layout", fpath, "-").Output() if err == nil && len(out) > 0 { return string(out), nil } } return extractTextFromPdfPureGo(fpath) } func extractTextFromPdfPureGo(fpath string) (string, error) { df, r, err := pdf.Open(fpath) if err != nil { return "", fmt.Errorf("failed to open pdf: %w", err) } defer df.Close() textReader, err := r.GetPlainText() if err != nil { return "", fmt.Errorf("failed to extract text from pdf: %w", err) } var buf bytes.Buffer _, err = io.Copy(&buf, textReader) if err != nil { return "", fmt.Errorf("failed to read pdf text: %w", err) } return buf.String(), nil }