diff options
| author | Sam Scholten | 2025-12-15 19:34:17 +1000 |
|---|---|---|
| committer | Sam Scholten | 2025-12-15 19:34:59 +1000 |
| commit | 9f5978186ac3de07f4325975fecf4f538fe713b6 (patch) | |
| tree | 41440b703054fe59eb561ba81d80fd60380c1f7a /core/text.go | |
| download | scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.tar.gz scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.zip | |
Init v0.1.0
Diffstat (limited to 'core/text.go')
| -rw-r--r-- | core/text.go | 36 |
1 files changed, 36 insertions, 0 deletions
// Text processing for RSS feed content.
// Used for web UI previews and search indexing - not ML (title-only scoring).

// maxFeedContentBytes is the maximum number of bytes of cleaned text kept by
// CleanFeedContent, not counting the "..." marker appended on truncation.
const maxFeedContentBytes = 5000

// htmlTagPattern matches a '<', any run of characters other than '>', and the
// closing '>'. It is compiled once at package scope so StripHTMLTags does not
// recompile it on every call.
var htmlTagPattern = regexp.MustCompile(`<[^>]*>`)

// CleanFeedContent strips HTML tags from content, collapses whitespace, and
// limits the result to maxFeedContentBytes bytes. When the cleaned text is
// longer than the limit it is cut and "..." is appended, so a truncated result
// can be up to three bytes longer than the limit. The cut is moved back to a
// UTF-8 rune boundary so a multi-byte character is never split in half.
// An empty input yields an empty string.
func CleanFeedContent(content string) string {
	if content == "" {
		return ""
	}

	content = NormalizeSpace(StripHTMLTags(content))
	if len(content) <= maxFeedContentBytes {
		return content
	}

	// content[cut] is the first byte that would be dropped. If it is a UTF-8
	// continuation byte (bit pattern 10xxxxxx), the cut falls inside a rune,
	// so step back to that rune's first byte. A rune has at most three
	// continuation bytes, which bounds the walk even for malformed input.
	cut := maxFeedContentBytes
	for back := 0; back < 3 && cut > 0 && content[cut]&0xC0 == 0x80; back++ {
		cut--
	}

	return content[:cut] + "..."
}

// StripHTMLTags removes every substring of content that matches
// htmlTagPattern. Text between tags is kept as-is; nothing is inserted where
// a tag was removed, and HTML entities are not decoded.
func StripHTMLTags(content string) string {
	return htmlTagPattern.ReplaceAllString(content, "")
}

// NormalizeSpace collapses each run of whitespace in s into a single space
// and removes leading and trailing whitespace.
func NormalizeSpace(s string) string {
	// strings.Fields already ignores leading and trailing whitespace, so no
	// separate trim is needed.
	return strings.Join(strings.Fields(s), " ")
}
