aboutsummaryrefslogtreecommitdiff
path: root/core/text.go
blob: ef4f86197131736eb03f59fffe17910cb375552c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// Text processing for RSS feed content.
// Used for web UI previews and search indexing - not ML (title-only scoring).
package core

import (
	"regexp"
	"strings"
)

// CleanFeedContent strips HTML, normalizes whitespace, truncates to 5KB
func CleanFeedContent(content string) string {
	if content == "" {
		return ""
	}

	content = StripHTMLTags(content)
	content = NormalizeSpace(content)

	maxLength := 5000
	if len(content) > maxLength {
		content = content[:maxLength] + "..."
	}

	return content
}

// htmlTagRe matches a '<', any run of characters other than '>', and the
// closing '>'. It is compiled once at package initialization rather than on
// every StripHTMLTags call.
var htmlTagRe = regexp.MustCompile(`<[^>]*>`)

// StripHTMLTags returns content with every "<...>" span removed.
//
// Matching is purely textual: anything from a '<' up to the next '>' is
// deleted and replaced with nothing, so text on either side of a tag is
// joined directly. A '<' with no following '>' is left in place. HTML
// entities such as "&amp;" are not decoded.
func StripHTMLTags(content string) string {
	return htmlTagRe.ReplaceAllString(content, "")
}

// NormalizeSpace rewrites s so that its whitespace-separated words are joined
// by exactly one space each, with no leading or trailing whitespace. A string
// that is empty or contains only whitespace yields "".
func NormalizeSpace(s string) string {
	// Fields splits on runs of whitespace and never yields empty words, so
	// whitespace at either end of s is discarded along with the interior runs.
	words := strings.Fields(s)
	return strings.Join(words, " ")
}