about | summary | refs | log | tree | commit | diff
path: root/core/text.go
diff options
context:
space:
mode:
author Sam Scholten 2025-12-15 19:34:17 +1000
committer Sam Scholten 2025-12-15 19:34:59 +1000
commit 9f5978186ac3de07f4325975fecf4f538fe713b6 (patch)
tree 41440b703054fe59eb561ba81d80fd60380c1f7a /core/text.go
download scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.tar.gz
scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.zip
Init v0.1.0
Diffstat (limited to 'core/text.go')
-rw-r--r-- core/text.go 36
1 files changed, 36 insertions, 0 deletions
diff --git a/core/text.go b/core/text.go
new file mode 100644
index 0000000..ef4f861
--- /dev/null
+++ b/core/text.go
@@ -0,0 +1,36 @@
+// Text processing for RSS feed content.
+// Used for web UI previews and search indexing - not ML (title-only scoring).
+package core
+
+import (
+ "regexp"
+ "strings"
+)
+
+// CleanFeedContent strips HTML, normalizes whitespace, truncates to 5KB
+func CleanFeedContent(content string) string {
+ if content == "" {
+ return ""
+ }
+
+ content = StripHTMLTags(content)
+ content = NormalizeSpace(content)
+
+ maxLength := 5000
+ if len(content) > maxLength {
+ content = content[:maxLength] + "..."
+ }
+
+ return content
+}
+
// htmlTagPattern matches a single tag-like span: a '<', any run of characters
// other than '>' (line breaks included), and the closing '>'. It is compiled
// once at package initialization rather than on every StripHTMLTags call.
var htmlTagPattern = regexp.MustCompile(`<[^>]*>`)

// StripHTMLTags returns content with every "<...>" span deleted.
//
// Nothing is inserted in place of a removed tag, so text on either side of a
// tag becomes adjacent. A '<' with no later '>' is left untouched. HTML
// entities such as "&amp;" are not decoded.
func StripHTMLTags(content string) string {
	return htmlTagPattern.ReplaceAllString(content, "")
}
+
// NormalizeSpace collapses every run of whitespace in s into a single space
// and removes leading and trailing whitespace.
//
// strings.Fields already ignores whitespace at both ends of its input, so no
// separate trim is needed. A string that is empty or all whitespace yields "".
func NormalizeSpace(s string) string {
	return strings.Join(strings.Fields(s), " ")
}