// Text processing for RSS feed content.
// Used for web UI previews and search indexing - not ML (title-only scoring).

// maxFeedContentBytes is the largest number of bytes of cleaned text that
// CleanFeedContent keeps before appending the truncation marker.
const maxFeedContentBytes = 5000

// htmlTagPattern matches anything that looks like an HTML tag: a '<', any run
// of characters other than '>', and the closing '>'. It is compiled once at
// package scope so that callers do not pay for compilation on every call.
var htmlTagPattern = regexp.MustCompile(`<[^>]*>`)

// CleanFeedContent strips HTML tags from content, collapses whitespace, and
// limits the result to maxFeedContentBytes bytes of text.
//
// When the cleaned text is longer than the limit it is cut and "..." is
// appended, so a truncated result can be up to three bytes longer than the
// limit. The cut never lands inside a multi-byte UTF-8 sequence: it is moved
// back to the start of the rune that straddles the limit, which keeps valid
// UTF-8 input valid. An empty input yields an empty string.
func CleanFeedContent(content string) string {
	if content == "" {
		return ""
	}

	content = NormalizeSpace(StripHTMLTags(content))
	if len(content) <= maxFeedContentBytes {
		return content
	}

	// len(content) > maxFeedContentBytes, so content[cut] is in range. If that
	// byte is a UTF-8 continuation byte (10xxxxxx), slicing there would split
	// a rune; step back to the rune's lead byte. A rune has at most three
	// continuation bytes, so the back-off is bounded even for malformed input.
	cut := maxFeedContentBytes
	for cut > 0 && maxFeedContentBytes-cut < 3 && content[cut]&0xC0 == 0x80 {
		cut--
	}

	return content[:cut] + "..."
}

// StripHTMLTags removes every substring of the form <...> from content.
//
// Only the tags themselves are removed: text between tags is kept as is, and
// character entities such as &amp; are not decoded.
func StripHTMLTags(content string) string {
	return htmlTagPattern.ReplaceAllString(content, "")
}

// NormalizeSpace collapses every run of whitespace in s into a single space
// and removes leading and trailing whitespace.
func NormalizeSpace(s string) string {
	// strings.Fields already ignores leading and trailing whitespace, so no
	// separate trim is needed.
	return strings.Join(strings.Fields(s), " ")
}