author     Sam Scholten  2025-12-15 19:34:17 +1000
committer  Sam Scholten  2025-12-15 19:34:59 +1000
commit     9f5978186ac3de07f4325975fecf4f538fe713b6 (patch)
tree       41440b703054fe59eb561ba81d80fd60380c1f7a /cmds
download   scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.tar.gz
           scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.zip

Init v0.1.0
Diffstat (limited to 'cmds')
-rw-r--r--  cmds/scan.go                    416
-rw-r--r--  cmds/serve.go                  1010
-rw-r--r--  cmds/templates/live-feed.html   158
-rw-r--r--  cmds/templates/results.html     279
-rw-r--r--  cmds/templates/tools.html       202
-rw-r--r--  cmds/train.go                   841
-rw-r--r--  cmds/train_test.go               66
7 files changed, 2972 insertions, 0 deletions
diff --git a/cmds/scan.go b/cmds/scan.go
new file mode 100644
index 0000000..789157c
--- /dev/null
+++ b/cmds/scan.go
@@ -0,0 +1,416 @@
+// Scan command: filters articles using a trained model.
+//
+// Takes articles from an RSS feed, plain text, or JSONL; scores them and
+// outputs those at or above the threshold. Processes in batches (default 50)
+// to allow continuous streaming.
+package cmds
+
+import (
+ "bufio"
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+
+// ============================================================================
+// ┏━╸┏━┓┏┳┓┏┳┓┏━┓┏┓╻╺┳┓
+// ┃ ┃ ┃┃┃┃┃┃┃┣━┫┃┗┫ ┃┃
+// ┗━╸┗━┛╹ ╹╹ ╹╹ ╹╹ ╹╺┻┛
+// ============================================================================
+
+
+// ScanCommand scores articles with a trained model and outputs results at or above the threshold.
+type ScanCommand struct {
+ URL string
+ FromText bool
+ FromArticles bool
+
+ ModelPath string
+ Threshold string
+
+ MinTitleLength int
+ ChunkSize int
+
+ EventsOut string
+ MetricsOut string
+ Verbose bool
+}
+
+func (c *ScanCommand) Name() string { return "scan" }
+
+func (c *ScanCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+ fmt.Fprint(fs.Output(), `Usage: scholscan scan [options]
+
+Fetches articles, scores them with the model, and outputs those at or above the threshold.
+
+Source options (exactly one required):
+ --url <feed_url> Fetch articles from RSS/Atom feed
+ --from-text Extract URLs from text on stdin
+ --from-articles Use Article JSONL from stdin directly
+
+Model and filtering:
+ --model <path> Path to trained model JSON file (required)
+ --threshold <float> Score threshold (if not provided, uses model's recommended threshold)
+
+Options:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Examples:
+ scholscan scan --url "http://some.blog/rss.xml" --model model.json > interesting.jsonl
+ echo "see https://example.com" | scholscan scan --from-text --model model.json
+ cat articles.jsonl | scholscan scan --from-articles --model model.json
+`)
+ }
+
+ fs.StringVar(&c.URL, "url", "", "RSS/Atom feed URL to fetch")
+ fs.BoolVar(&c.FromText, "from-text", false, "Extract URLs from text on stdin")
+ fs.BoolVar(&c.FromArticles, "from-articles", false, "Use Article JSONL from stdin")
+ fs.StringVar(&c.ModelPath, "model", "", "Path to trained model JSON file (required)")
+ fs.StringVar(&c.Threshold, "threshold", "", "Score threshold for filtering (if not provided, uses model's recommended threshold)")
+ fs.IntVar(&c.MinTitleLength, "min-title-length", core.MinTitleLength, "Minimum title length to consider valid")
+ fs.IntVar(&c.ChunkSize, "chunk-size", core.DefaultChunkSize, "Number of articles to process in each batch")
+ fs.StringVar(&c.EventsOut, "events-out", "events.jsonl", "Write per-article events to a JSONL file")
+ fs.StringVar(&c.MetricsOut, "metrics-out", "metrics.json", "Write summary metrics to a JSON file")
+ fs.BoolVar(&c.Verbose, "verbose", false, "Show progress information")
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ }
+
+ if fs.NArg() != 0 {
+ return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
+ }
+
+	// exactly one source option required
+ sourceCount := 0
+ if c.URL != "" {
+ sourceCount++
+ }
+ if c.FromText {
+ sourceCount++
+ }
+ if c.FromArticles {
+ sourceCount++
+ }
+
+ if sourceCount == 0 {
+ return fmt.Errorf("exactly one source option must be specified: --url, --from-text, or --from-articles")
+ }
+ if sourceCount > 1 {
+ return fmt.Errorf("only one source option may be specified: --url, --from-text, or --from-articles")
+ }
+
+ if c.ModelPath == "" {
+ return fmt.Errorf("--model flag is required")
+ }
+
+ // prevent dir traversal
+ if strings.Contains(filepath.Clean(c.ModelPath), "..") {
+ return fmt.Errorf("invalid model path: directory traversal not allowed")
+ }
+
+ if c.URL != "" {
+ if _, err := url.Parse(c.URL); err != nil {
+ return fmt.Errorf("invalid URL format: %w", err)
+ }
+ }
+
+ return nil
+}
+
+// Run runs the scan: load the model, decide on a threshold, get articles, then score them in chunks.
+// We bail out early on config problems but try to keep going even if some articles fail to fetch.
+func (c *ScanCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ if c.Verbose {
+ log.SetOutput(os.Stderr)
+ log.Println("Starting scan workflow...")
+ log.Printf("Source: %v", c.getSourceDescription())
+ log.Printf("Model: %s", c.ModelPath)
+ }
+
+ model, err := c.loadModel()
+ if err != nil {
+ return fmt.Errorf("failed to load model: %w", err)
+ }
+
+ threshold, err := c.getThreshold(model)
+ if err != nil {
+ return fmt.Errorf("failed to determine threshold: %w", err)
+ }
+
+ if c.Verbose {
+ log.Printf("Using threshold: %.3f", threshold)
+ }
+
+ var articles []*core.Article
+ if c.FromArticles {
+ articles, err = c.readArticlesFromStdin(stdin)
+ } else {
+		articles, err = c.fetchArticles(stdin)
+ }
+ if err != nil {
+ return fmt.Errorf("failed to get articles: %w", err)
+ }
+
+ if c.Verbose {
+ log.Printf("Processing %d articles", len(articles))
+ }
+
+ // process articles in chunks
+	return c.processArticles(articles, model, threshold, stdout)
+}
+
+
+// ============================================================================
+// ┏┳┓┏━┓╺┳┓┏━╸╻ ┏┓ ┏━╸┏━┓┏┓╻┏━╸╻┏━╸
+// ┃┃┃┃ ┃ ┃┃┣╸ ┃ ┃╺╋╸ ┃ ┃ ┃┃┗┫┣╸ ┃┃╺┓
+// ╹ ╹┗━┛╺┻┛┗━╸┗━╸ ┗━┛ ┗━╸┗━┛╹ ╹╹ ╹┗━┛
+// ============================================================================
+
+
+
+func (c *ScanCommand) getSourceDescription() string {
+ if c.URL != "" {
+ return fmt.Sprintf("RSS feed: %s", c.URL)
+ }
+ if c.FromText {
+ return "text from stdin"
+ }
+ if c.FromArticles {
+ return "articles from stdin"
+ }
+ return "unknown"
+}
+
+// loadModel reads and parses the model JSON file.
+// The envelope contains weights, vocabulary, and optionally a recommended threshold.
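+// A minimal envelope might look like this (illustrative only; the exact
+// JSON field names are defined by core.ModelEnvelope, not here):
+//
+//	{
+//	  "weights": [0.12, -0.03],
+//	  "vocabulary": {"quantum": 0, "sensing": 1},
+//	  "meta": {"recommended_threshold": 0.62}
+//	}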
+func (c *ScanCommand) loadModel() (*core.ModelEnvelope, error) {
+ f, err := os.Open(c.ModelPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
+ }
+ defer f.Close()
+
+ var model core.ModelEnvelope
+ if err := json.NewDecoder(f).Decode(&model); err != nil {
+ return nil, fmt.Errorf("failed to decode model: %w", err)
+ }
+
+ return &model, nil
+}
+
+func (c *ScanCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
+	if c.Threshold != "" {
+		threshold, err := strconv.ParseFloat(c.Threshold, 64)
+		if err != nil {
+			// A user-supplied threshold that fails to parse is an error,
+			// not a silent fall-through to the model default.
+			return 0, fmt.Errorf("invalid threshold %q: %w", c.Threshold, err)
+		}
+		return threshold, nil
+	}
+
+ if model.Meta != nil {
+ if meta, ok := model.Meta["recommended_threshold"].(float64); ok {
+ return meta, nil
+ }
+ }
+
+ return core.DefaultScoreThreshold, nil
+}
+
+// ============================================================================
+// ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸ ┏━┓┏━┓┏━╸┏━┓
+// ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓┣┳┛┃ ┗━┓
+// ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸ ┗━┛╹┗╸┗━╸┗━┛
+// ============================================================================
+
+
+func (c *ScanCommand) fetchArticles(stdin io.Reader) ([]*core.Article, error) {
+	if c.FromText {
+		return c.extractURLsFromText(stdin)
+ }
+ if c.URL != "" {
+ return c.fetchRSSFeed(c.URL)
+ }
+ return nil, fmt.Errorf("no valid source specified")
+}
+
+// extractURLsFromText pulls URLs from plain text on stdin.
+// We create minimal Article objects since only the URL is needed for scoring.
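+// For example (illustrative), the input line
+//
+//	see https://example.com/a and https://example.com/b
+//
+// yields two Articles whose URLs are the two links.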
+func (c *ScanCommand) extractURLsFromText(stdin io.Reader) ([]*core.Article, error) {
+ var urls []string
+ s := bufio.NewScanner(stdin)
+ for s.Scan() {
+ line := s.Text()
+ // url extraction
+ fields := strings.Fields(line)
+ for _, field := range fields {
+ if strings.HasPrefix(field, "http://") || strings.HasPrefix(field, "https://") {
+ urls = append(urls, field)
+ }
+ }
+ }
+
+ // create Article objs for URLs
+ articles := make([]*core.Article, len(urls))
+	for i, u := range urls {
+		articles[i] = &core.Article{
+			URL:     u,
+			Title:   fmt.Sprintf("Article from %s", u),
+			Content: "",
+ }
+ }
+
+ return articles, s.Err()
+}
+
+// fetchRSSFeed fetches and parses a single RSS feed, bounded by core.DefaultHTTPTimeout.
+// We skip articles with short titles since they're usually noise or truncated.
+func (c *ScanCommand) fetchRSSFeed(url string) ([]*core.Article, error) {
+ client := &http.Client{Timeout: core.DefaultHTTPTimeout}
+
+	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("error building request: %w", err)
+	}
+	req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+	resp, err := client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", url, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", url, err)
+ }
+
+ // parse feed
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(strings.NewReader(string(body)))
+ if err != nil {
+ return nil, fmt.Errorf("error parsing feed from %s: %w", url, err)
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ article := &core.Article{
+ URL: item.Link,
+ Title: strings.TrimSpace(item.Title),
+ }
+
+ if len(article.Title) >= c.MinTitleLength {
+ articles = append(articles, article)
+ }
+ }
+
+ return articles, nil
+}
+
+// readArticlesFromStdin reads Article objects from JSONL on stdin.
+// Malformed lines are skipped to allow partial processing of corrupted input.
+func (c *ScanCommand) readArticlesFromStdin(stdin io.Reader) ([]*core.Article, error) {
+	var articles []*core.Article
+	scanner := bufio.NewScanner(stdin)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" {
+			continue
+		}
+		var article core.Article
+		if err := json.Unmarshal([]byte(line), &article); err != nil {
+			// Skip malformed lines; a json.Decoder would loop forever here
+			// because it never advances past invalid input.
+			continue
+		}
+		if len(article.Title) >= c.MinTitleLength {
+			articles = append(articles, &article)
+		}
+	}
+	return articles, scanner.Err()
+}
+
+
+
+// ============================================================================
+// ┏━┓┏━┓┏━┓┏━╸┏━╸┏━┓┏━┓ ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸┏━┓
+// ┣━┛┣┳┛┃ ┃┃ ┣╸ ┗━┓┗━┓ ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓
+// ╹ ╹┗╸┗━┛┗━╸┗━╸┗━┛┗━┛ ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸┗━┛
+// ============================================================================
+
+
+// processArticles handles scoring and filtering in batches to keep memory usage predictable.
+// Scoring errors don't crash the process; we log them and continue with the next article.
+func (c *ScanCommand) processArticles(articles []*core.Article, model *core.ModelEnvelope, threshold float64, stdout io.Writer) error {
+ vectorizer := core.CreateVectorizerFromModel(model)
+
+ encoder := json.NewEncoder(stdout)
+
+ // process each batch
+ for i := 0; i < len(articles); i += c.ChunkSize {
+ end := i + c.ChunkSize
+ if end > len(articles) {
+ end = len(articles)
+ }
+
+ chunk := articles[i:end]
+ if c.Verbose {
+ log.Printf("Processing chunk %d-%d of %d articles", i+1, end, len(articles))
+ }
+
+ // calc score for batch
+ docs := make([]string, len(chunk))
+ for j, article := range chunk {
+ docs[j] = strings.TrimSpace(article.Title)
+ }
+
+ vectors := vectorizer.Transform(docs)
+ scores := make([]float64, len(chunk))
+
+ for j, vector := range vectors {
+ score, err := core.PredictScore(vector, model.Weights)
+ if err != nil {
+ log.Printf("Error computing score for article %d: %v", i+j, err)
+ scores[j] = 0.0
+ } else {
+ scores[j] = score
+ }
+ }
+
+ for j, article := range chunk {
+ score := scores[j]
+ article.Score = &score
+
+ if score >= threshold {
+ if err := encoder.Encode(article); err != nil {
+ log.Printf("Error encoding article: %v", err)
+ }
+ }
+ }
+ }
+
+ if c.Verbose {
+ log.Println("Scan complete")
+ }
+
+ return nil
+}
diff --git a/cmds/serve.go b/cmds/serve.go
new file mode 100644
index 0000000..92aa64c
--- /dev/null
+++ b/cmds/serve.go
@@ -0,0 +1,1010 @@
+// Serve command: HTTP server for web UI and APIs.
+//
+// Two main flows: live-feed (cached + background refresh) and tools (on-demand scoring).
+// Live-feed rescans all configured RSS feeds on a timer (default 24h), caches results,
+// serves filtered articles via web UI and JSON/RSS APIs.
+// Tools provides real-time /score (single title) and /scan (ad-hoc feed) endpoints.
+// Background refresh continues despite individual feed failures; RWMutex allows
+// many concurrent readers with exclusive writer updates.
+// Templates are embedded for single-binary deployment.
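+//
+// A typical session (illustrative; flag values and host are assumptions):
+//
+//	scholscan serve --port 8080 --rss-world rss_world.txt --model model.json
+//	curl http://localhost:8080/api/health
+//	curl http://localhost:8080/api/filtered/feed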
+package cmds
+
+import (
+ "bufio"
+ "context"
+ "embed"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "html/template"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+ "os/signal"
+ "path/filepath"
+ "regexp"
+ "sort"
+ "strings"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/PuerkitoBio/goquery"
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+//go:embed templates/*.html
+var templateFS embed.FS
+
+// ============================================================================
+// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
+// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
+// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
+// ============================================================================
+
+type ServeCommand struct {
+ Port int
+ RSSWorldPath string
+ RefreshInterval string
+ ModelPath string
+ Title string
+
+ // Parsed interval
+ refreshInterval time.Duration
+ // Loaded model (cached)
+ model *core.ModelEnvelope
+ modelMu sync.RWMutex
+ // Cached filtered RSS results and timestamp.
+ // RWMutex allows many concurrent readers (HTTP handlers) with exclusive writer (background refresh).
+ filteredResults []*core.Article
+ filteredResultsTime time.Time
+ resultsMu sync.RWMutex
+ // Loaded templates
+ tmpl *template.Template
+}
+
+func (c *ServeCommand) Name() string { return "serve" }
+
+// Init configures the serve command and validates its inputs.
+// It rejects directory-traversal paths, checks the refresh interval,
+// and leaves sensible defaults in place for everything else.
+func (c *ServeCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+		fmt.Fprint(fs.Output(), `Usage: scholscan serve [options]
+
+Start HTTP server for filtered RSS and scoring web UI.
+
+Flags:
+`)
+		fs.PrintDefaults()
+		fmt.Fprint(fs.Output(), `
+Examples:
+  scholscan serve --port 8080 --rss-world rss_world.txt --model model.json
+  scholscan serve --refresh-interval 24h --model ./model.json --rss-world feeds.txt
+`)
+ }
+
+ fs.IntVar(&c.Port, "port", 8080, "Port to listen on")
+ fs.StringVar(&c.RSSWorldPath, "rss-world", "rss_world.txt", "Path to RSS world file (one feed URL per line)")
+ fs.StringVar(&c.RefreshInterval, "refresh-interval", "24h", "Interval for background rescans (e.g., 24h, 1h)")
+ fs.StringVar(&c.ModelPath, "model", "model.json", "Path to trained model JSON file")
+ fs.StringVar(&c.Title, "title", "", "Custom title for the web interface")
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ }
+
+ if fs.NArg() != 0 {
+ return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
+ }
+
+ // Parse refresh interval
+ interval, err := time.ParseDuration(c.RefreshInterval)
+ if err != nil {
+ return fmt.Errorf("invalid refresh-interval %q: %w", c.RefreshInterval, err)
+ }
+ c.refreshInterval = interval
+
+ if strings.Contains(filepath.Clean(c.RSSWorldPath), "..") {
+ return fmt.Errorf("invalid rss-world path: directory traversal not allowed")
+ }
+ if strings.Contains(filepath.Clean(c.ModelPath), "..") {
+ return fmt.Errorf("invalid model path: directory traversal not allowed")
+ }
+
+ return nil
+}
+
+func (c *ServeCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ log.Printf("Starting scholscan server on port %d", c.Port)
+
+ // Initialize filteredResultsTime to server start time
+ c.resultsMu.Lock()
+ c.filteredResultsTime = time.Now()
+ c.resultsMu.Unlock()
+
+ // Load templates at startup
+ tmpl, err := template.ParseFS(templateFS, "templates/*.html")
+ if err != nil {
+ return fmt.Errorf("failed to parse templates: %w", err)
+ }
+ c.tmpl = tmpl
+ log.Printf("Templates loaded successfully")
+
+ // Load model at startup
+ model, err := c.loadModel()
+ if err != nil {
+ return fmt.Errorf("failed to load model at startup: %w", err)
+ }
+ c.modelMu.Lock()
+ c.model = model
+ c.modelMu.Unlock()
+
+ log.Printf("Model loaded successfully")
+
+ // Start background ticker for periodic refresh
+ ticker := time.NewTicker(c.refreshInterval)
+ go c.backgroundRefresh(ticker)
+
+ // Perform initial scan asynchronously
+ go func() {
+ log.Println("Starting initial feed scan...")
+ if err := c.refreshFilteredResults(); err != nil {
+ log.Printf("Warning: initial scan failed: %v", err)
+ } else {
+ c.resultsMu.RLock()
+ count := len(c.filteredResults)
+ c.resultsMu.RUnlock()
+ log.Printf("Initial scan complete, %d articles filtered", count)
+ }
+ }()
+
+ // Setup HTTP handlers
+ http.HandleFunc("/", c.handleRoot)
+ http.HandleFunc("/live-feed", c.handleLiveFeed)
+ http.HandleFunc("/tools", c.handleTools)
+ http.HandleFunc("/score", c.handleScore)
+ http.HandleFunc("/scan", c.handleScan)
+ http.HandleFunc("/api/filtered/feed", c.handleFilteredFeed)
+ http.HandleFunc("/api/filtered/rss", c.handleFilteredRSS)
+ http.HandleFunc("/api/health", c.handleHealth)
+
+ // Setup server with graceful shutdown
+ server := &http.Server{
+ Addr: fmt.Sprintf(":%d", c.Port),
+ Handler: http.DefaultServeMux,
+ ReadTimeout: core.DefaultReadTimeout,
+ WriteTimeout: core.DefaultWriteTimeout,
+ IdleTimeout: core.DefaultIdleTimeout,
+ }
+
+ // Handle shutdown signals
+ sigChan := make(chan os.Signal, 1)
+ signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+ go func() {
+ <-sigChan
+ log.Println("Shutdown signal received")
+ ticker.Stop()
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultShutdownTimeout)
+ defer cancel()
+ if err := server.Shutdown(ctx); err != nil {
+ log.Printf("Server shutdown error: %v", err)
+ }
+ }()
+
+ log.Printf("Server listening on http://localhost:%d", c.Port)
+ if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+ return fmt.Errorf("server error: %w", err)
+ }
+
+ return nil
+}
+
+// ============================================================================
+// ┏━╸┏━┓┏━┓┏━╸ ╻ ┏━┓┏━╸╻┏━╸
+// ┃ ┃ ┃┣┳┛┣╸ ┃ ┃ ┃┃╺┓┃┃
+// ┗━╸┗━┛╹┗╸┗━╸ ┗━╸┗━┛┗━┛╹┗━╸
+// ============================================================================
+
+func (c *ServeCommand) loadModel() (*core.ModelEnvelope, error) {
+ f, err := os.Open(c.ModelPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
+ }
+ defer f.Close()
+
+ var model core.ModelEnvelope
+ if err := json.NewDecoder(f).Decode(&model); err != nil {
+ return nil, fmt.Errorf("failed to decode model: %w", err)
+ }
+
+ return &model, nil
+}
+
+func (c *ServeCommand) scoreArticle(article *core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope) float64 {
+ docs := []string{strings.TrimSpace(article.Title)}
+ vectors := vectorizer.Transform(docs)
+
+ if len(vectors) == 0 || len(vectors[0]) == 0 {
+ return 0.0
+ }
+
+ score, err := core.PredictScore(vectors[0], model.Weights)
+ if err != nil {
+ // Return 0.0 on error (below threshold). Malformed articles don't break the display,
+ // they just get filtered out. Log the error for diagnostics.
+ log.Printf("Error scoring article: %v", err)
+ return 0.0
+ }
+
+ return score
+}
+
+func (c *ServeCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
+ if model.Meta != nil {
+ if threshold, ok := model.Meta["recommended_threshold"].(float64); ok {
+ return threshold, nil
+ }
+ }
+ return core.DefaultScoreThreshold, nil
+}
+
+// scoreAndFormatArticles scores a list of articles and returns them formatted for templates.
+// Articles are scored using the model and vectorizer, then returned with human-readable ratings.
+func (c *ServeCommand) scoreAndFormatArticles(articles []*core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope, threshold float64) []map[string]interface{} {
+ type ArticleResponse struct {
+ Title string `json:"title"`
+ URL string `json:"url"`
+ Source string `json:"source,omitempty"`
+ Rating int `json:"rating"`
+ Score float64 `json:"score"`
+ }
+
+ scored := make([]ArticleResponse, 0, len(articles))
+ for _, article := range articles {
+ score := c.scoreArticle(article, vectorizer, model)
+ rating := core.ScoreToScale(score, threshold)
+
+ scored = append(scored, ArticleResponse{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ })
+ }
+
+ result := make([]map[string]interface{}, len(scored))
+ for i, a := range scored {
+ result[i] = map[string]interface{}{
+ "Title": a.Title,
+ "URL": a.URL,
+ "Source": a.Source,
+ "Rating": a.Rating,
+ "Score": a.Score,
+ }
+ }
+ return result
+}
+
+// ============================================================================
+// ┏━┓┏━┓┏━┓ ┏━┓╺┳╸╻ ╻┏━╸┏━╸
+// ┣┳┛┗━┓┗━┓ ┗━┓ ┃ ┃ ┃┣╸ ┣╸
+// ╹┗╸┗━┛┗━┛ ┗━┛ ╹ ┗━┛╹ ╹
+// ============================================================================
+
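+// readRSSWorldFeeds loads feed URLs from the rss-world file: one URL per
+// line; blank lines and #-prefixed comment lines are skipped. For example
+// (illustrative URLs):
+//
+//	# physics blogs
+//	https://example.com/feed.xml
+//	https://another.example.org/rss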
+func (c *ServeCommand) readRSSWorldFeeds() ([]string, error) {
+ f, err := os.Open(c.RSSWorldPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open rss_world file %s: %w", c.RSSWorldPath, err)
+ }
+ defer f.Close()
+
+ var feeds []string
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ feeds = append(feeds, line)
+ }
+ }
+
+ if err := scanner.Err(); err != nil {
+ return nil, fmt.Errorf("error reading rss_world file: %w", err)
+ }
+
+ return feeds, nil
+}
+
+func (c *ServeCommand) refreshFilteredResults() error {
+ feeds, err := c.readRSSWorldFeeds()
+ if err != nil {
+ return err
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ return fmt.Errorf("model not loaded")
+ }
+
+ // Scan all feeds. Continue on individual feed failures to maximize results.
+ // RSS feeds are often flaky; one down shouldn't prevent others from being processed.
+ var allArticles []*core.Article
+ for _, feed := range feeds {
+ articles, err := c.fetchRSSFeed(feed)
+ if err != nil {
+ log.Printf("Warning: failed to fetch feed %s: %v", feed, err)
+ continue
+ }
+ allArticles = append(allArticles, articles...)
+ }
+
+ // Score and filter articles
+ threshold, err := c.getThreshold(model)
+ if err != nil {
+ return err
+ }
+
+ vectorizer := core.CreateVectorizerFromModel(model)
+
+ filtered := make([]*core.Article, 0, len(allArticles))
+ for _, article := range allArticles {
+ score := c.scoreArticle(article, vectorizer, model)
+ if score >= threshold {
+ // Create a copy with score to avoid reference issues
+ articleCopy := *article
+ articleCopy.Score = &score
+ filtered = append(filtered, &articleCopy)
+ }
+ }
+
+ c.resultsMu.Lock()
+ c.filteredResults = filtered
+ c.filteredResultsTime = time.Now()
+ c.resultsMu.Unlock()
+
+ return nil
+}
+
+// backgroundRefresh runs in a goroutine, rescanning all RSS feeds on interval.
+// Failures in individual feeds don't affect others; we log and continue.
+func (c *ServeCommand) backgroundRefresh(ticker *time.Ticker) {
+ for range ticker.C {
+ log.Println("Background refresh started")
+ if err := c.refreshFilteredResults(); err != nil {
+ log.Printf("Background refresh error (continuing): %v", err)
+ } else {
+ c.resultsMu.RLock()
+ count := len(c.filteredResults)
+ c.resultsMu.RUnlock()
+ log.Printf("Background refresh complete, %d articles filtered", count)
+ }
+ }
+}
+
+func (c *ServeCommand) fetchRSSFeed(url string) ([]*core.Article, error) {
+ client := &http.Client{Timeout: core.DefaultHTTPTimeout}
+
+	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("error building request: %w", err)
+	}
+	req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+	resp, err := client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", url, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", url, err)
+ }
+
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(strings.NewReader(string(body)))
+ if err != nil {
+ return nil, fmt.Errorf("error parsing feed from %s: %w", url, err)
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ article := &core.Article{
+ URL: item.Link,
+ Title: strings.TrimSpace(item.Title),
+ Source: feed.Title,
+ }
+
+ if item.PublishedParsed != nil {
+ article.PublishedAt = item.PublishedParsed
+ }
+
+ if len(article.Title) >= core.MinTitleLength {
+ articles = append(articles, article)
+ }
+ }
+
+ return articles, nil
+}
+
+// ============================================================================
+// ╻ ╻┏━╸┏┓ ╻ ╻╻
+// ┃╻┃┣╸ ┣┻┓ ┃ ┃┃
+// ┗┻┛┗━╸┗━┛ ┗━┛╹
+// ============================================================================
+
+func (c *ServeCommand) handleRoot(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/" {
+ http.NotFound(w, r)
+ return
+ }
+
+ // Redirect to live feed
+ http.Redirect(w, r, "/live-feed", http.StatusMovedPermanently)
+}
+
+func (c *ServeCommand) handleLiveFeed(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ resultsTime := c.filteredResultsTime
+ c.resultsMu.RUnlock()
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ threshold, _ := c.getThreshold(model)
+
+ // Parse filter parameter (day, week, or all)
+ filter := r.URL.Query().Get("filter")
+ if filter == "" {
+ filter = "all"
+ }
+
+ // Filter articles by date if needed
+ now := time.Now()
+ filtered := articles
+ if filter == "day" || filter == "week" {
+ var cutoff time.Time
+ if filter == "day" {
+ cutoff = now.Add(-24 * time.Hour)
+ } else if filter == "week" {
+ cutoff = now.Add(-7 * 24 * time.Hour)
+ }
+
+ filtered = make([]*core.Article, 0, len(articles))
+ for _, article := range articles {
+ // Always include articles without PublishedAt
+ if article.PublishedAt == nil || article.PublishedAt.After(cutoff) {
+ filtered = append(filtered, article)
+ }
+ }
+ }
+
+ // Convert articles to template format
+ type TemplateArticle struct {
+ Title string
+ URL string
+ Source string
+ Rating int
+ Score float64
+ PublishedAt string
+ }
+
+ templateArticles := make([]TemplateArticle, 0, len(filtered))
+ for _, article := range filtered {
+ score := 0.0
+ if article.Score != nil {
+ score = *article.Score
+ }
+ rating := core.ScoreToScale(score, threshold)
+
+ publishedAt := ""
+ if article.PublishedAt != nil {
+ publishedAt = article.PublishedAt.Format("2006-01-02")
+ }
+
+ templateArticles = append(templateArticles, TemplateArticle{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ PublishedAt: publishedAt,
+ })
+ }
+
+ // Sort articles by score (highest first)
+ sort.Slice(templateArticles, func(i, j int) bool {
+ return templateArticles[i].Score > templateArticles[j].Score
+ })
+
+ data := map[string]interface{}{
+ "Page": "live-feed",
+ "Articles": templateArticles,
+ "Threshold": threshold,
+ "UpdatedAt": resultsTime.Format("2006-01-02 15:04:05"),
+ "Filter": filter,
+ "Title": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "live-feed", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleTools(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "Title": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "tools", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleScore(w http.ResponseWriter, r *http.Request) {
+ if r.Method == http.MethodGet {
+ c.handleTools(w, r)
+ return
+ }
+
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ if err := r.ParseForm(); err != nil {
+ http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
+ return
+ }
+
+	title := strings.TrimSpace(r.FormValue("title"))
+	rawURL := strings.TrimSpace(r.FormValue("url"))
+
+	// If URL provided, fetch and extract title from it; otherwise use provided title.
+	if rawURL != "" {
+		extractedTitle, err := extractTitleFromURL(rawURL)
+ if err != nil {
+ c.renderResultsError(w, fmt.Sprintf("Failed to extract title from URL: %v", err), title)
+ return
+ }
+ title = extractedTitle
+ }
+
+ // Validate input before scoring
+ if valErr := c.validateTitle(title); valErr != "" {
+ c.renderResultsError(w, valErr, title)
+ return
+ }
+
+ vectorizer := core.CreateVectorizerFromModel(model)
+ article := &core.Article{Title: title}
+ score := c.scoreArticle(article, vectorizer, model)
+
+ threshold, _ := c.getThreshold(model)
+ rating := core.ScoreToScale(score, threshold)
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScoreResult": true,
+ "Title": title,
+ "Rating": rating,
+ "Score": score,
+ "Threshold": threshold,
+ "PageTitle": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleScan(w http.ResponseWriter, r *http.Request) {
+ if r.Method == http.MethodGet {
+ c.handleTools(w, r)
+ return
+ }
+
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ if err := r.ParseForm(); err != nil {
+ http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
+ return
+ }
+
+ feedURL := strings.TrimSpace(r.FormValue("feed_url"))
+
+ // Validate and fetch the feed
+ if valErr := c.validateFeedURL(feedURL); valErr != "" {
+ c.renderScanResultsError(w, valErr, feedURL)
+ return
+ }
+
+ articles, err := c.fetchRSSFeed(feedURL)
+ if err != nil {
+ c.renderScanResultsError(w, fmt.Sprintf("Failed to fetch feed: %v", err), feedURL)
+ return
+ }
+
+ // Score articles
+ threshold, _ := c.getThreshold(model)
+ vectorizer := core.CreateVectorizerFromModel(model)
+ scored := c.scoreAndFormatArticles(articles, vectorizer, model, threshold)
+
+ sort.Slice(scored, func(i, j int) bool {
+ iScore := scored[i]["Score"].(float64)
+ jScore := scored[j]["Score"].(float64)
+ return iScore > jScore
+ })
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScanResult": true,
+ "FeedURL": feedURL,
+ "Articles": scored,
+ "Threshold": threshold,
+ "PageTitle": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+// ============================================================================
+// ┏━┓┏━┓╻ ┏━╸┏┓╻╺┳┓┏━┓┏━┓╻┏┓╻╺┳╸┏━┓
+// ┣━┫┣━┛┃ ┣╸ ┃┗┫ ┃┃┣━┛┃ ┃┃┃┗┫ ┃ ┗━┓
+// ╹ ╹╹ ╹ ┗━╸╹ ╹╺┻┛╹ ┗━┛╹╹ ╹ ╹ ┗━┛
+// ============================================================================
+
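+// handleFilteredFeed serves the cached filtered articles as JSON. The
+// response shape is roughly (illustrative values):
+//
+//	{"total": 1, "threshold": 0.62, "updated_at": "2025-12-15T19:34:17+10:00",
+//	 "articles": [{"title": "...", "url": "...", "rating": 8, "score": 0.91}]}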
+func (c *ServeCommand) handleFilteredFeed(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ resultsTime := c.filteredResultsTime
+ c.resultsMu.RUnlock()
+
+	c.modelMu.RLock()
+	model := c.model
+	c.modelMu.RUnlock()
+
+	if model == nil {
+		// getThreshold dereferences the model, so guard against a nil
+		// model here just like handleLiveFeed does.
+		http.Error(w, "Model not loaded", http.StatusInternalServerError)
+		return
+	}
+
+	threshold, _ := c.getThreshold(model)
+
+ type ArticleResponse struct {
+ Title string `json:"title"`
+ URL string `json:"url"`
+ Source string `json:"source,omitempty"`
+ Rating int `json:"rating"`
+ Score float64 `json:"score"`
+ }
+
+ scored := make([]ArticleResponse, 0, len(articles))
+ for _, article := range articles {
+ score := 0.0
+ if article.Score != nil {
+ score = *article.Score
+ }
+ rating := core.ScoreToScale(score, threshold)
+
+ scored = append(scored, ArticleResponse{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ })
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate")
+
+ if err := json.NewEncoder(w).Encode(map[string]interface{}{
+ "total": len(articles),
+ "threshold": threshold,
+ "updated_at": resultsTime,
+ "articles": scored,
+ }); err != nil {
+ http.Error(w, "Failed to encode response", http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleFilteredRSS(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ c.resultsMu.RUnlock()
+
+	c.modelMu.RLock()
+	model := c.model
+	c.modelMu.RUnlock()
+
+	if model == nil {
+		http.Error(w, "Model not loaded", http.StatusInternalServerError)
+		return
+	}
+
+	threshold, _ := c.getThreshold(model)
+
+	w.Header().Set("Content-Type", "application/rss+xml")
+	w.Header().Set("Cache-Control", "public, max-age=3600")
+
+ // Generate RSS feed
+ fmt.Fprintf(w, `<?xml version="1.0" encoding="UTF-8"?>
+ <rss version="2.0">
+ <channel>
+ <title>%s - Filtered Articles</title>
+ <link>http://scholscan.local</link>
+ <description>Articles filtered by your learned preferences (scored 1-10)</description>
+ `, displayTitle(c.Title))
+
+ for _, article := range articles {
+ rawScore := 0.0
+ if article.Score != nil {
+ rawScore = *article.Score
+ }
+
+		scaledScore := core.ScoreToScale(rawScore, threshold)
+
+ title := escapeXML(article.Title)
+ url := escapeXML(article.URL)
+ description := fmt.Sprintf("SCHOLSCAN SCORE = %d/10 (raw: %.3f)", scaledScore, rawScore)
+
+ fmt.Fprintf(w, ` <item>
+ <title>%s</title>
+ <link>%s</link>
+ <description>%s</description>
+ </item>
+ `, title, url, description)
+ }
+
+ fmt.Fprint(w, ` </channel>
+ </rss>`)
+}
+
+func (c *ServeCommand) handleHealth(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ modelLoaded := c.model != nil
+ c.modelMu.RUnlock()
+
+ status := "ok"
+ if !modelLoaded {
+ status = "model_not_loaded"
+ w.WriteHeader(http.StatusInternalServerError)
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ if err := json.NewEncoder(w).Encode(map[string]interface{}{
+ "status": status,
+ "model_loaded": modelLoaded,
+ "timestamp": time.Now().Unix(),
+ }); err != nil {
+ http.Error(w, "Failed to encode response", http.StatusInternalServerError)
+ }
+}
+
+// ============================================================================
+// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓
+// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓
+// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛
+// ============================================================================
+
+func displayTitle(custom string) string {
+ if custom != "" {
+ return custom
+ }
+ return "ScholScan"
+}
+
+// extractTitleFromURL fetches the content from a URL and extracts the title from the HTML.
+// Designed to be resilient: tries multiple title sources, handles various URL formats,
+// and provides meaningful error feedback if extraction fails.
+func extractTitleFromURL(rawURL string) (string, error) {
+ if rawURL == "" {
+ return "", fmt.Errorf("empty URL")
+ }
+
+ // Check if it's a DOI
+ if strings.HasPrefix(rawURL, "10.") {
+ // Convert DOI to URL
+ rawURL = fmt.Sprintf("https://doi.org/%s", rawURL)
+ } else if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
+ rawURL = "https://" + rawURL
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultContextTimeout)
+ defer cancel()
+
+ req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
+ if err != nil {
+ return "", fmt.Errorf("invalid URL: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+ req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+
+ resp, err := core.DoRequestWithRetry(ctx, core.DefaultHTTPClient, req)
+ if err != nil {
+ return "", fmt.Errorf("failed to fetch URL: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
+ }
+
+ doc, err := goquery.NewDocumentFromReader(resp.Body)
+ if err != nil {
+ return "", fmt.Errorf("failed to parse HTML: %w", err)
+ }
+
+ // Fallback chain: <title> → og:title → twitter:title → <h1>
+ // Different sites populate these differently; trying multiple increases success rate.
+ title := ""
+
+ if t := doc.Find("title").Text(); t != "" {
+ title = strings.TrimSpace(t)
+ }
+
+ if title == "" {
+ if t, exists := doc.Find(`meta[property="og:title"]`).Attr("content"); exists && t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ if t, exists := doc.Find(`meta[name="twitter:title"]`).Attr("content"); exists && t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ if t := doc.Find("h1").First().Text(); t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ return "", fmt.Errorf("could not extract title from page")
+ }
+
+	// Clean up common "Publisher | Title" patterns: drop a leading publisher
+	// prefix first (while the pipes still exist), then normalize any
+	// remaining pipe separators to single spaces.
+	rePub := regexp.MustCompile(`^[^|]*\|\s*`)
+	title = rePub.ReplaceAllString(title, "")
+
+	reClean := regexp.MustCompile(`\s*\|\s*`)
+	title = reClean.ReplaceAllString(title, " ")
+	title = strings.TrimSpace(title)
+
+ if len(title) < 10 {
+ return "", fmt.Errorf("extracted title too short: %q", title)
+ }
+
+ return title, nil
+}
+
+// validateTitle checks that a title is suitable for scoring.
+// Returns an error message string if invalid, empty string if valid.
+func (c *ServeCommand) validateTitle(title string) string {
+ if strings.TrimSpace(title) == "" {
+ return "Title cannot be empty"
+ }
+ if len(title) > 1000 {
+ return "Title too long (max 1000 characters)"
+ }
+ return ""
+}
+
+// renderResultsError renders the results template with an error message.
+func (c *ServeCommand) renderResultsError(w http.ResponseWriter, errMsg, title string) {
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScoreResult": true,
+ "Error": errMsg,
+ "Title": title,
+ "PageTitle": displayTitle(c.Title),
+ }
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+// validateFeedURL checks that a feed URL is non-empty and has valid format.
+// Returns an error message string if invalid, empty string if valid.
+func (c *ServeCommand) validateFeedURL(feedURL string) string {
+ if feedURL == "" {
+ return "Feed URL cannot be empty"
+ }
+ if _, err := url.Parse(feedURL); err != nil {
+ return "Invalid URL format"
+ }
+ return ""
+}
+
+// renderScanResultsError renders the results template with an error for scan operation.
+func (c *ServeCommand) renderScanResultsError(w http.ResponseWriter, errMsg, feedURL string) {
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScanResult": true,
+ "Error": errMsg,
+ "FeedURL": feedURL,
+ "PageTitle": displayTitle(c.Title),
+ }
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
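+// escapeXML escapes the five XML special characters (& < > " ') so titles
+// and URLs can be embedded safely in the generated RSS items.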
+func escapeXML(s string) string {
+ s = strings.ReplaceAll(s, "&", "&amp;")
+ s = strings.ReplaceAll(s, "<", "&lt;")
+ s = strings.ReplaceAll(s, ">", "&gt;")
+ s = strings.ReplaceAll(s, "\"", "&quot;")
+ s = strings.ReplaceAll(s, "'", "&apos;")
+ return s
+}
diff --git a/cmds/templates/live-feed.html b/cmds/templates/live-feed.html
new file mode 100644
index 0000000..1529ee1
--- /dev/null
+++ b/cmds/templates/live-feed.html
@@ -0,0 +1,158 @@
+{{define "live-feed"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.Title}} - Live Feed</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+       SUMMARY, RSS LINK & FEED LIST
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+ .rss-link {
+ background: #f9f9f9;
+ padding: 15px;
+ border: 1px solid #000;
+ margin-bottom: 20px;
+ }
+ .rss-link a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .feed-list {
+ max-height: 600px;
+ overflow-y: auto;
+ border: 1px solid #000;
+ padding: 10px;
+ }
+
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed" class="active">Live Feed</a>
+ <a href="/tools">Score & Scan</a>
+ </div>
+
+ <div class="rss-link">
+ <strong>Filtered RSS Feed:</strong>
+ <a href="/api/filtered/rss" target="_blank">Subscribe to filtered articles</a>
+ <span style="margin-left: 10px; color: #666; font-size: 0.9em;">(rss link for feed readers)</span>
+ <div style="margin-top: 10px; padding-top: 10px; border-top: 1px solid #ccc; color: #666; font-size: 0.9em;">
+ Last updated: <span id="feedTimestamp">{{if .UpdatedAt}}{{.UpdatedAt}}{{else}}—{{end}}</span>
+ </div>
+ </div>
+
+ <div style="margin-bottom: 20px;">
+ <strong>Filter by date:</strong>
+ <div style="margin-top: 8px; display: flex; gap: 10px;">
+ <a href="/live-feed?filter=day" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "day"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 24h</a>
+ <a href="/live-feed?filter=week" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "week"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 7 days</a>
+ <a href="/live-feed?filter=all" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "all"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">All</a>
+ </div>
+ </div>
+
+ <div class="feed-list">
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ {{else if .Articles}}
+ <div class="summary">
+ <strong>{{len .Articles}}</strong> articles (threshold: {{printf "%.2f" .Threshold}})
+ </div>
+ {{$threshold := .Threshold}}
+ {{range .Articles}}
+ {{$isGood := ge .Score $threshold}}
+ {{$bgColor := "white"}}
+ {{if $isGood}}
+ {{$bgColor = "#e8f5e9"}}
+ {{else}}
+ {{$bgColor = "#ffebee"}}
+ {{end}}
+ {{$indicator := "✗"}}
+ {{if $isGood}}
+ {{$indicator = "✓"}}
+ {{end}}
+ <div class="article" style="background-color: {{$bgColor}};">
+ <div style="font-weight: bold;">
+ <a href="{{.URL}}" target="_blank">{{.Title}}</a>
+ </div>
+ <div class="article-meta">
+ Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}}{{if .PublishedAt}} · {{.PublishedAt}}{{end}}
+ </div>
+ </div>
+ {{end}}
+ {{else}}
+ <p>No articles to display</p>
+ {{end}}
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/templates/results.html b/cmds/templates/results.html
new file mode 100644
index 0000000..13f68e0
--- /dev/null
+++ b/cmds/templates/results.html
@@ -0,0 +1,279 @@
+{{define "results"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.PageTitle}} - Results</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+ h2 {
+ font-size: 1em;
+ font-weight: bold;
+ margin-bottom: 15px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ LAYOUT (2-column grid for score-scan)
+ ======================================== */
+ .container {
+ max-width: 1200px;
+ margin: 0 auto;
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 30px;
+ }
+ .section {
+ border: 1px solid #000;
+ padding: 20px;
+ }
+
+ /* ========================================
+ FORMS (input, textarea, button)
+ ======================================== */
+ label {
+ display: block;
+ margin-top: 15px;
+ font-weight: bold;
+ }
+ input, textarea {
+ display: block;
+ width: 100%;
+ margin-top: 5px;
+ padding: 5px;
+ border: 1px solid #000;
+ font-family: monospace;
+ }
+ textarea {
+ resize: vertical;
+ min-height: 80px;
+ }
+ button {
+ margin-top: 15px;
+ padding: 5px 15px;
+ border: 1px solid #000;
+ background: #fff;
+ cursor: pointer;
+ font-family: monospace;
+ }
+ button:hover {
+ background: #000;
+ color: #fff;
+ }
+ button:active {
+ opacity: 0.8;
+ }
+
+ /* ========================================
+ RESULT BOXES
+ ======================================== */
+ .result {
+ margin-top: 20px;
+ padding: 15px;
+ border: 1px solid #000;
+ background: #f5f5f5;
+ }
+ .score {
+ font-size: 3em;
+ font-weight: bold;
+ text-align: center;
+ margin: 20px 0;
+ }
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+       SUMMARY & HELP TEXT
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+
+ small {
+ display: block;
+ margin-top: 5px;
+ color: #666;
+ }
+
+ /* ========================================
+ MOBILE
+ ======================================== */
+ @media (max-width: 960px) {
+ .container {
+ grid-template-columns: 1fr;
+ gap: 20px;
+ }
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.PageTitle}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed">Live Feed</a>
+ <a href="/tools" class="active">Score & Scan</a>
+ </div>
+
+ <div class="container">
+ {{if .IsScoreResult}}
+ <div class="section">
+ <h2>Score Article</h2>
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ <form method="POST" action="/score" style="margin-top: 20px;">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" value="{{.Title}}" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If URL is provided, title will be automatically extracted</small>
+ <button type="submit">Score</button>
+ </form>
+ {{else}}
+ <div class="result">
+ <div class="score">{{.Rating}}/10</div>
+ <p style="text-align: center; color: #666;">Score: {{printf "%.3f" .Score}}</p>
+ <p style="text-align: center; margin-top: 10px; font-size: 0.9em;">{{.Title}}</p>
+ </div>
+ <form method="POST" action="/score" style="margin-top: 20px;">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If URL is provided, title will be automatically extracted</small>
+ <button type="submit">Score Another</button>
+ </form>
+ {{end}}
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ <form method="POST" action="/scan">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan</button>
+ </form>
+ </div>
+
+ {{else if .IsScanResult}}
+ <div class="section">
+ <h2>Score Article</h2>
+ <form method="POST" action="/score">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If URL is provided, title will be automatically extracted</small>
+ <button type="submit">Score</button>
+ </form>
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ <form method="POST" action="/scan" style="margin-top: 20px;">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" value="{{.FeedURL}}" required />
+ <button type="submit">Try Again</button>
+ </form>
+ {{else}}
+ <div class="summary">
+ <strong>{{len .Articles}}</strong> articles from {{.FeedURL}} (threshold: {{printf "%.2f" .Threshold}})
+ </div>
+ <div style="max-height: 500px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+ {{$threshold := .Threshold}}
+ {{range .Articles}}
+ {{$isGood := ge .Score $threshold}}
+ {{$bgColor := "white"}}
+ {{if $isGood}}
+ {{$bgColor = "#e8f5e9"}}
+ {{else}}
+ {{$bgColor = "#ffebee"}}
+ {{end}}
+ {{$indicator := "✗"}}
+ {{if $isGood}}
+ {{$indicator = "✓"}}
+ {{end}}
+ <div class="article" style="background-color: {{$bgColor}};">
+ <div style="font-weight: bold;">
+ <a href="{{.URL}}" target="_blank">{{.Title}}</a>
+ </div>
+ <div class="article-meta">
+ Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}}
+ </div>
+ </div>
+ {{end}}
+ </div>
+ <form method="POST" action="/scan" style="margin-top: 20px;">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan Another</button>
+ </form>
+ {{end}}
+ </div>
+ {{end}}
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/templates/tools.html b/cmds/templates/tools.html
new file mode 100644
index 0000000..def04fe
--- /dev/null
+++ b/cmds/templates/tools.html
@@ -0,0 +1,202 @@
+{{define "tools"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.Title}} - Score & Scan</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+ h2 {
+ font-size: 1em;
+ font-weight: bold;
+ margin-bottom: 15px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ LAYOUT (2-column grid for score-scan)
+ ======================================== */
+ .container {
+ max-width: 1200px;
+ margin: 0 auto;
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 30px;
+ }
+ .section {
+ border: 1px solid #000;
+ padding: 20px;
+ }
+
+ /* ========================================
+ FORMS (input, textarea, button)
+ ======================================== */
+ label {
+ display: block;
+ margin-top: 15px;
+ font-weight: bold;
+ }
+ input, textarea {
+ display: block;
+ width: 100%;
+ margin-top: 5px;
+ padding: 5px;
+ border: 1px solid #000;
+ font-family: monospace;
+ }
+ textarea {
+ resize: vertical;
+ min-height: 80px;
+ }
+ button {
+ margin-top: 15px;
+ padding: 5px 15px;
+ border: 1px solid #000;
+ background: #fff;
+ cursor: pointer;
+ font-family: monospace;
+ }
+ button:hover {
+ background: #000;
+ color: #fff;
+ }
+ button:active {
+ opacity: 0.8;
+ }
+
+ /* ========================================
+ RESULT BOXES
+ ======================================== */
+ .result {
+ margin-top: 20px;
+ padding: 15px;
+ border: 1px solid #000;
+ background: #f5f5f5;
+ }
+ .score {
+ font-size: 3em;
+ font-weight: bold;
+ text-align: center;
+ margin: 20px 0;
+ }
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+       SUMMARY & HELP TEXT
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+
+ small {
+ display: block;
+ margin-top: 5px;
+ color: #666;
+ }
+
+ /* ========================================
+ MOBILE
+ ======================================== */
+ @media (max-width: 960px) {
+ .container {
+ grid-template-columns: 1fr;
+ gap: 20px;
+ }
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed">Live Feed</a>
+ <a href="/tools" class="active">Score & Scan</a>
+ </div>
+
+ <div class="container">
+ <div class="section">
+ <h2>Score Article</h2>
+ <form method="POST" action="/score">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL" style="margin-top: 10px;">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If a URL is provided, the title is extracted automatically</small>
+ <button type="submit">Score</button>
+ </form>
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ <form method="POST" action="/scan">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan</button>
+ </form>
+ </div>
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/train.go b/cmds/train.go
new file mode 100644
index 0000000..e7e8915
--- /dev/null
+++ b/cmds/train.go
@@ -0,0 +1,841 @@
+// Train command: learns a model from positive examples and RSS feeds.
+//
+// Loads positives, fetches RSS feed articles as negatives, excludes overlap,
+// then trains TF-IDF + logistic regression with 1:1 class balancing.
+// Outputs the model, with a validated threshold, to stdout.
+package cmds
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "math"
+ "math/rand"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strings"
+ "time"
+
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+// ============================================================================
+// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
+// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
+// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
+// ============================================================================
+
+// TrainCommand learns a TF-IDF + logistic regression model from positive
+// examples and RSS feeds, and writes the trained model JSON to stdout.
+type TrainCommand struct {
+ positivesFile string
+ rssFeedsFile string
+ verboseOutput bool
+ lambda float64
+ minDF int
+ maxDF float64
+ ngramMax int
+}
+
+func (c *TrainCommand) Name() string { return "train" }
+
+func (c *TrainCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+ fmt.Fprint(fs.Output(), `Usage: scholscan train POSITIVES_FILE --rss-feeds RSS_FEEDS_FILE > model.json
+
+Train a TF-IDF + logistic regression model from positive examples and RSS feeds.
+
+The training workflow:
+ 1. Load positive examples from POSITIVES_FILE
+ 2. Fetch articles from RSS feeds list
+ 3. Exclude any positive examples from RSS feed articles
+ 4. Train model with balanced classes
+ 5. Output trained model to stdout as JSON
+
+Flags:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Arguments:
+ POSITIVES_FILE Path to JSONL file with positive examples (required)
+
+Example:
+ scholscan train positives.jsonl --rss-feeds rss_world.txt > model.json
+`)
+ }
+
+ fs.StringVar(&c.rssFeedsFile, "rss-feeds", "", "Path to text file with RSS feed URLs (required)")
+ fs.BoolVar(&c.verboseOutput, "verbose", false, "Show progress information")
+ fs.Float64Var(&c.lambda, "lambda", 0.001, "L2 regularization parameter for logistic regression")
+ fs.IntVar(&c.minDF, "min-df", 2, "Minimum document frequency (absolute count)")
+ fs.Float64Var(&c.maxDF, "max-df", 0.8, "Maximum document frequency (ratio, 0-1)")
+ fs.IntVar(&c.ngramMax, "ngram-max", 2, "Maximum n-gram size (e.g., 1=unigrams, 2=unigrams+bigrams)")
+
+ // Check for help flag first
+ for _, arg := range args {
+ if arg == "--help" || arg == "-h" {
+ fs.Usage()
+ return flag.ErrHelp
+ }
+ }
+
+ // Extract positional argument (POSITIVES_FILE) before parsing flags
+ if len(args) == 0 {
+ return fmt.Errorf("POSITIVES_FILE argument is required")
+ }
+ // The first argument should be the positives file, the rest are flags
+ c.positivesFile = args[0]
+ flagArgs := args[1:]
+
+ if err := fs.Parse(flagArgs); err != nil {
+ return err
+ }
+
+ if c.rssFeedsFile == "" {
+ return fmt.Errorf("--rss-feeds flag is required")
+ }
+
+ // Validate paths are safe (prevent directory traversal)
+ if strings.Contains(filepath.Clean(c.positivesFile), "..") {
+ return fmt.Errorf("invalid positives file path: directory traversal not allowed")
+ }
+ if strings.Contains(filepath.Clean(c.rssFeedsFile), "..") {
+ return fmt.Errorf("invalid RSS feeds file path: directory traversal not allowed")
+ }
+
+ return nil
+}
+
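+// Run executes the full training workflow: load positives, fetch RSS
+// negatives, exclude overlap, train, and write the model JSON to stdout.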
+func (c *TrainCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ if c.verboseOutput {
+ log.SetOutput(os.Stderr)
+ log.Println("Starting training workflow...")
+ log.Printf("Positives: %s", c.positivesFile)
+ log.Printf("RSS feeds: %s", c.rssFeedsFile)
+ log.Printf("Loading positives from %s...", c.positivesFile)
+ }
+ positives, err := c.loadArticles(c.positivesFile)
+ if err != nil {
+ return fmt.Errorf("failed to load positives: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Loaded %d positive examples", len(positives))
+ }
+
+ if c.verboseOutput {
+ log.Printf("Loading RSS feeds from %s...", c.rssFeedsFile)
+ }
+ rssURLs, err := c.loadRSSURLs(c.rssFeedsFile)
+ if err != nil {
+ return fmt.Errorf("failed to load RSS feeds: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Found %d RSS feeds to fetch", len(rssURLs))
+ }
+
+ negatives, err := c.fetchFromRSSFeeds(rssURLs)
+ if err != nil {
+ return fmt.Errorf("failed to fetch from RSS feeds: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Fetched %d articles from RSS feeds", len(negatives))
+ }
+
+ negatives = c.excludePositives(negatives, positives)
+ if c.verboseOutput {
+ log.Printf("After exclusion: %d negative examples", len(negatives))
+ }
+
+ if len(positives) == 0 || len(negatives) == 0 {
+ return fmt.Errorf("need both positive (%d) and negative (%d) examples for training", len(positives), len(negatives))
+ }
+
+ if c.verboseOutput {
+ log.Println("Training model...")
+ }
+ model, err := c.trainModel(positives, negatives)
+ if err != nil {
+ return fmt.Errorf("failed to train model: %w", err)
+ }
+
+ // Output model
+ encoder := json.NewEncoder(stdout)
+ encoder.SetIndent("", " ")
+ if err := encoder.Encode(model); err != nil {
+ return fmt.Errorf("failed to write model: %w", err)
+ }
+
+ return nil
+}
+
+// ============================================================================
+// ╺┳┓┏━┓╺┳╸┏━┓ ╻ ┏━┓┏━┓╺┳┓╻┏┓╻┏━╸
+// ┃┃┣━┫ ┃ ┣━┫ ┃ ┃ ┃┣━┫ ┃┃┃┃┗┫┃╺┓
+// ╺┻┛╹ ╹ ╹ ╹ ╹ ┗━╸┗━┛╹ ╹╺┻┛╹╹ ╹┗━┛
+// ============================================================================
+
+// loadArticles reads Article JSONL from filename, skipping malformed lines.
+func (c *TrainCommand) loadArticles(filename string) ([]*core.Article, error) {
+ file, err := os.Open(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer file.Close()
+
+ var articles []*core.Article
+ scanner := bufio.NewScanner(file)
+ scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+ lineCount := 0
+ for scanner.Scan() {
+ lineCount++
+ line := strings.TrimSpace(scanner.Text())
+ if line == "" {
+ continue
+ }
+ var article core.Article
+ if err := json.Unmarshal([]byte(line), &article); err != nil {
+ // Skip malformed JSON lines instead of failing; a json.Decoder
+ // cannot resync after a decode error, so we scan line by line.
+ continue
+ }
+ articles = append(articles, &article)
+ if c.verboseOutput && lineCount%500 == 0 {
+ log.Printf(" Loaded %d articles so far", len(articles))
+ }
+ }
+ return articles, scanner.Err()
+}
+
+// loadRSSURLs loads RSS feed URLs from a text file
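+// Blank lines and lines beginning with '#' are skipped, e.g.:
+//
+// # physics blogs
+// https://example.com/feed.xml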
+func (c *TrainCommand) loadRSSURLs(filename string) ([]string, error) {
+ file, err := os.Open(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer file.Close()
+
+ var urls []string
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ urls = append(urls, line)
+ }
+ }
+ return urls, scanner.Err()
+}
+
+// fetchFromRSSFeeds fetches articles from multiple RSS feeds in parallel
+func (c *TrainCommand) fetchFromRSSFeeds(rssURLs []string) ([]*core.Article, error) {
+ client := core.DefaultHTTPClient
+ type result struct {
+ url string
+ articles []*core.Article
+ err error
+ }
+ resultChan := make(chan result, len(rssURLs))
+
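+ // One goroutine per feed; feed lists are assumed small enough that an
+ // unbounded fan-out is acceptable here.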
+ for _, rssURL := range rssURLs {
+ go func(feedURL string) {
+ articles, err := c.fetchRSSFeed(client, feedURL)
+ resultChan <- result{url: feedURL, articles: articles, err: err}
+ }(rssURL)
+ }
+
+ var allArticles []*core.Article
+ for i := 0; i < len(rssURLs); i++ {
+ res := <-resultChan
+ if res.err != nil {
+ if c.verboseOutput {
+ log.Printf("%s: failed to fetch", shortURL(res.url))
+ }
+ } else {
+ if c.verboseOutput {
+ log.Printf("%s: %d articles", shortURL(res.url), len(res.articles))
+ }
+ allArticles = append(allArticles, res.articles...)
+ }
+ }
+
+ return allArticles, nil
+}
+
+// ParseRSSFeed parses an RSS/Atom feed from the provided body into a slice of
+// Articles. baseURL is accepted for future relative-link resolution but is
+// currently unused.
+func ParseRSSFeed(body []byte, baseURL string) ([]*core.Article, error) {
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(bytes.NewReader(body))
+ if err != nil {
+ return nil, err
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ // Prefer explicit content; fall back to description.
+ content := strings.TrimSpace(item.Content)
+ if content == "" {
+ content = item.Description
+ }
+ // Also check custom content field (for <content> tags in RSS)
+ if content == "" && item.Custom != nil {
+ if c, ok := item.Custom["content"]; ok && c != "" {
+ content = c
+ }
+ }
+
+ // Clean and limit content length
+ content = core.CleanFeedContent(content)
+
+ articles = append(articles, &core.Article{
+ URL: item.Link,
+ Title: item.Title,
+ Content: content,
+ })
+ }
+ return articles, nil
+}
+
+// fetchRSSFeed fetches and parses a single RSS feed
+func (c *TrainCommand) fetchRSSFeed(client *http.Client, rssURL string) ([]*core.Article, error) {
+ var body []byte
+ var err error
+
+ // Handle file:// URLs locally
+ if strings.HasPrefix(rssURL, "file://") {
+ // Remove file:// prefix
+ filePath := strings.TrimPrefix(rssURL, "file://")
+ body, err = os.ReadFile(filePath)
+ if err != nil {
+ return nil, fmt.Errorf("error reading file %s: %w", filePath, err)
+ }
+ } else {
+ // Handle HTTP/HTTPS URLs normally
+ req, err := http.NewRequest("GET", rssURL, nil)
+ if err != nil {
+ return nil, fmt.Errorf("error building request: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+ // Make request with retry logic
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+
+ resp, err := core.DoRequestWithRetry(ctx, client, req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", rssURL, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, rssURL)
+ }
+
+ // Read response body
+ body, err = io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", rssURL, err)
+ }
+ }
+
+ // Parse RSS/Atom feed
+ return ParseRSSFeed(body, rssURL)
+}
+
+// ============================================================================
+// ╺┳┓┏━┓╺┳╸┏━┓ ┏━┓┏━┓┏━╸┏━┓
+// ┃┃┣━┫ ┃ ┣━┫ ┣━┛┣┳┛┣╸ ┣━┛
+// ╺┻┛╹ ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸╹
+// ============================================================================
+
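+// excludePositives drops any negative whose URL also appears in positives,
+// preventing label leakage from feeds that carry known-good articles.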
+func (c *TrainCommand) excludePositives(negatives, positives []*core.Article) []*core.Article {
+ // Build set of positive URLs for O(1) lookup
+ positiveURLs := make(map[string]bool)
+ for _, pos := range positives {
+ positiveURLs[pos.URL] = true
+ }
+
+ // Filter out positives
+ var filtered []*core.Article
+ for _, neg := range negatives {
+ if !positiveURLs[neg.URL] {
+ filtered = append(filtered, neg)
+ }
+ }
+
+ return filtered
+}
+
+// splitTrainingData performs a deterministic 80/20 train/validation split
+// (seed=42), making model training reproducible across runs.
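+// Note: the split is random but not stratified, so a very small or heavily
+// skewed dataset can leave the validation slice with few (or no) positives.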
+func (c *TrainCommand) splitTrainingData(documents []string, labels []float64) (
+ trainDocs, valDocs []string,
+ trainLabels, valLabels []float64,
+) {
+ const validationSplitRatio = 0.2
+ const splitSeed = 42
+
+ if len(documents) < 3 {
+ // Not enough data to split, use all for training.
+ // A split requires at least 2 training documents to avoid MaxDF issues
+ // and at least 1 validation document.
+ return documents, nil, labels, nil
+ }
+
+ // Create a reproducible random source and shuffle indices.
+ rng := rand.New(rand.NewSource(splitSeed))
+ indices := make([]int, len(documents))
+ for i := range indices {
+ indices[i] = i
+ }
+ rng.Shuffle(len(indices), func(i, j int) {
+ indices[i], indices[j] = indices[j], indices[i]
+ })
+
+ splitIndex := int(float64(len(documents)) * (1.0 - validationSplitRatio))
+ trainIndices := indices[:splitIndex]
+ valIndices := indices[splitIndex:]
+
+ trainDocs = make([]string, len(trainIndices))
+ trainLabels = make([]float64, len(trainIndices))
+ for i, idx := range trainIndices {
+ trainDocs[i] = documents[idx]
+ trainLabels[i] = labels[idx]
+ }
+
+ valDocs = make([]string, len(valIndices))
+ valLabels = make([]float64, len(valIndices))
+ for i, idx := range valIndices {
+ valDocs[i] = documents[idx]
+ valLabels[i] = labels[idx]
+ }
+
+ return trainDocs, valDocs, trainLabels, valLabels
+}
+
+// Downsample majority class to 1:1 ratio AFTER vectorizer.Fit() to preserve IDF values.
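+// E.g. 200 positives and 5,000 negatives become 200 of each, while IDF values
+// still reflect every training document because Fit() runs before this call.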
+func (c *TrainCommand) downsampleToBalance(docs []string, labels []float64) ([]string, []float64) {
+ // Count positives and negatives
+ var posDocs, negDocs []string
+ var posLabels, negLabels []float64
+
+ for i, label := range labels {
+ if label == 1.0 {
+ posDocs = append(posDocs, docs[i])
+ posLabels = append(posLabels, label)
+ } else {
+ negDocs = append(negDocs, docs[i])
+ negLabels = append(negLabels, label)
+ }
+ }
+
+ // If already balanced, return as-is
+ if len(posDocs) == len(negDocs) {
+ return docs, labels
+ }
+
+ // Determine which class is majority
+ var majorityDocs, minorityDocs []string
+ var majorityLabels, minorityLabels []float64
+
+ if len(negDocs) > len(posDocs) {
+ // Negatives are majority
+ majorityDocs, minorityDocs = negDocs, posDocs
+ majorityLabels, minorityLabels = negLabels, posLabels
+ } else {
+ // Positives are majority (unlikely but handle)
+ majorityDocs, minorityDocs = posDocs, negDocs
+ majorityLabels, minorityLabels = posLabels, negLabels
+ }
+
+ // Downsample majority to match minority size
+ minoritySize := len(minorityDocs)
+ rng := rand.New(rand.NewSource(42)) // Use fixed seed for reproducibility
+
+ // Create random indices for downsampling
+ indices := make([]int, len(majorityDocs))
+ for i := range indices {
+ indices[i] = i
+ }
+ rng.Shuffle(len(indices), func(i, j int) {
+ indices[i], indices[j] = indices[j], indices[i]
+ })
+
+ // Select downsampled majority
+ downsampledDocs := make([]string, 0, minoritySize*2)
+ downsampledLabels := make([]float64, 0, minoritySize*2)
+
+ // Add all minority samples
+ downsampledDocs = append(downsampledDocs, minorityDocs...)
+ downsampledLabels = append(downsampledLabels, minorityLabels...)
+
+ // Add downsampled majority
+ for i := 0; i < minoritySize; i++ {
+ idx := indices[i]
+ downsampledDocs = append(downsampledDocs, majorityDocs[idx])
+ downsampledLabels = append(downsampledLabels, majorityLabels[idx])
+ }
+
+ return downsampledDocs, downsampledLabels
+}
+
+// ============================================================================
+// ╺┳╸┏━┓┏━┓╻┏┓╻ ┏┳┓┏━┓╺┳┓┏━╸╻
+// ┃ ┣┳┛┣━┫┃┃┗┫ ┃┃┃┃ ┃ ┃┃┣╸ ┃
+// ╹ ╹┗╸╹ ╹╹╹ ╹ ╹ ╹┗━┛╺┻┛┗━╸┗━╸
+// ============================================================================
+
+// trainModel trains a TF-IDF + logistic regression model
+func (c *TrainCommand) trainModel(positives, negatives []*core.Article) (*core.ModelEnvelope, error) {
+ // Combine datasets and create labels
+ var documents []string
+ var labels []float64
+
+ // Process positives
+ for _, article := range positives {
+ // Skip articles with titles that are too short
+ if len(article.Title) < 15 {
+ continue
+ }
+ documents = append(documents, article.Title)
+ labels = append(labels, 1.0)
+ }
+
+ // Process negatives
+ for _, article := range negatives {
+ // Skip articles with titles that are too short
+ if len(article.Title) < 15 {
+ continue
+ }
+ documents = append(documents, article.Title)
+ labels = append(labels, 0.0)
+ }
+
+ // Vectorizer and model parameters come from CLI flags (defaults match the
+ // Julia implementation); the vocabulary cap is fixed.
+ const vocabCap = 50000
+
+ // Deterministic 80/20 split for train/validation
+ trainDocs, valDocs, trainLabels, valLabels := c.splitTrainingData(documents, labels)
+
+ // Create TF-IDF vectorizer with the specified parameters
+ vectorizer := &core.TFIDFVectorizer{
+ NgramMin: 1,
+ NgramMax: c.ngramMax,
+ MinDF: c.minDF,
+ MaxDF: c.maxDF,
+ VocabCap: vocabCap,
+ Vocabulary: make(map[string]float64),
+ }
+ // Fit vectorizer on UNBALANCED training data to match Julia implementation
+ // This preserves document frequencies properly
+ vectorizer.Fit(trainDocs)
+
+ // Downsample negatives to 1:1 ratio AFTER fitting (match Julia approach)
+ balancedTrainDocs, balancedTrainLabels := c.downsampleToBalance(trainDocs, trainLabels)
+
+ // Transform both training and validation sets
+ trainVectors := vectorizer.Transform(balancedTrainDocs)
+ valVectors := vectorizer.Transform(valDocs)
+
+ // Use uniform class weights since we've balanced the dataset
+ classWeights := map[float64]float64{
+ 1.0: 1.0,
+ 0.0: 1.0,
+ }
+
+ // Train logistic regression with the flag-specified lambda; learning rate,
+ // iteration cap, and tolerance are fixed.
+ lr := &core.LogisticRegression{
+ LearningRate: 0.5,
+ Lambda: c.lambda,
+ Iterations: 500,
+ Tolerance: 0.000001,
+ }
+ lr.Validate()
+ weights, err := lr.Fit(trainVectors, balancedTrainLabels, classWeights)
+ if err != nil {
+ return nil, fmt.Errorf("failed to train logistic regression model: %w", err)
+ }
+
+ // Find the best threshold on the validation set
+ recommendedThreshold, scoreDistributions := c.findBestThreshold(valVectors, valLabels, weights)
+
+ // Count classes for metadata
+ var posCount, negCount float64
+ for _, label := range labels {
+ if label == 1.0 {
+ posCount++
+ } else {
+ negCount++
+ }
+ }
+
+ // Create model envelope
+ model := &core.ModelEnvelope{
+ Algorithm: "tfidf-go",
+ Impl: "go",
+ Version: "1",
+ CreatedAt: time.Now().UTC(),
+ Meta: map[string]any{
+ "positives": len(positives),
+ "negatives": len(negatives),
+ "class_counts": map[string]int{
+ "pos": int(posCount),
+ "neg": int(negCount),
+ },
+ "vectorizer_params": map[string]any{
+ "ngram_min": vectorizer.NgramMin,
+ "ngram_max": vectorizer.NgramMax,
+ "min_df": vectorizer.MinDF,
+ "max_df": vectorizer.MaxDF,
+ "vocab_cap": vectorizer.VocabCap,
+ },
+ "model_params": map[string]any{
+ "learning_rate": lr.LearningRate,
+ "lambda": lr.Lambda,
+ "iterations": lr.Iterations,
+ "tolerance": lr.Tolerance,
+ },
+ "recommended_threshold": recommendedThreshold,
+ "score_distributions": scoreDistributions,
+ },
+ Vectorizer: vectorizer.Vocabulary,
+ OrderedVocab: vectorizer.OrderedVocab,
+ Weights: weights,
+ }
+
+ return model, nil
+}
+
+// ============================================================================
+// ┏┳┓┏━╸╺┳╸┏━┓╻┏━╸┏━┓
+// ┃┃┃┣╸ ┃ ┣┳┛┃┃ ┗━┓
+// ╹ ╹┗━╸ ╹ ╹┗╸╹┗━╸┗━┛
+// ============================================================================
+
+// ClassificationMetrics holds the evaluation metrics
+type ClassificationMetrics struct {
+ TruePositives int
+ TrueNegatives int
+ FalsePositives int
+ FalseNegatives int
+ Accuracy float64
+ Precision float64
+ Recall float64
+ F1Score float64
+}
+
+// Calculate computes the metrics from raw counts
+func (m *ClassificationMetrics) Calculate() {
+ total := m.TruePositives + m.TrueNegatives + m.FalsePositives + m.FalseNegatives
+
+ if total > 0 {
+ m.Accuracy = float64(m.TruePositives+m.TrueNegatives) / float64(total)
+ }
+
+ if m.TruePositives+m.FalsePositives > 0 {
+ m.Precision = float64(m.TruePositives) / float64(m.TruePositives+m.FalsePositives)
+ }
+
+ if m.TruePositives+m.FalseNegatives > 0 {
+ m.Recall = float64(m.TruePositives) / float64(m.TruePositives+m.FalseNegatives)
+ }
+
+ if m.Precision+m.Recall > 0 {
+ m.F1Score = 2 * (m.Precision * m.Recall) / (m.Precision + m.Recall)
+ }
+}
+
+// findBestThreshold sweeps a range of thresholds on a validation set to find
+// the one that maximizes Youden's J, and returns it together with
+// score-distribution diagnostics.
+func (c *TrainCommand) findBestThreshold(
+ validationVectors [][]float64,
+ validationLabels []float64,
+ weights []float64,
+) (float64, map[string]any) {
+ if len(validationVectors) == 0 {
+ return 0.5, nil // Default if no validation data
+ }
+
+ scores := make([]float64, len(validationVectors))
+ for i, vector := range validationVectors {
+ score, err := core.PredictScore(vector, weights)
+ if err != nil {
+ // This should not happen with valid data, but as a fallback:
+ return 0.5, nil
+ }
+ scores[i] = score
+ }
+
+ // Collect score distributions by label
+ var posScores, negScores []float64
+ for i, score := range scores {
+ if validationLabels[i] == 1.0 {
+ posScores = append(posScores, score)
+ } else {
+ negScores = append(negScores, score)
+ }
+ }
+
+ // Compute stats for each class
+ posStats := computeScoreStats(posScores)
+ negStats := computeScoreStats(negScores)
+
+ // Calculate Cohen's d (effect size) to measure class separation in the learned space
+ posMean := posStats["mean"]
+ negMean := negStats["mean"]
+ posStd := posStats["std"]
+ negStd := negStats["std"]
+
+ var cohensD float64
+ if posStd > 0 && negStd > 0 {
+ pooledStd := math.Sqrt((posStd*posStd + negStd*negStd) / 2)
+ cohensD = math.Abs(posMean-negMean) / pooledStd
+ }
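+ // E.g. means 0.8 vs 0.2 with std 0.1 each: pooled std = 0.1, so d = 6.0,
+ // i.e. near-perfect separation (d > 0.8 is conventionally "large").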
+
+ // Calculate separation ratio to understand how much the classes overlap on the score scale
+ totalRange := math.Max(posStats["max"], negStats["max"]) - math.Min(posStats["min"], negStats["min"])
+ overlapStart := math.Max(posStats["min"], negStats["min"])
+ overlapEnd := math.Min(posStats["max"], negStats["max"])
+ overlapRange := math.Max(0, overlapEnd-overlapStart)
+ separationRatio := 0.0
+ if totalRange > 0 {
+ separationRatio = (totalRange - overlapRange) / totalRange
+ }
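+ // E.g. positives in [0.6, 0.9] and negatives in [0.1, 0.7]: total range 0.8,
+ // overlap 0.1 (0.6..0.7), so separation ratio = (0.8 - 0.1) / 0.8 = 0.875.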
+
+ // Find threshold that balances false positives and false negatives using Youden's J.
+ // This metric (Sensitivity + Specificity - 1) equally weights both false positive
+ // and false negative rates. Why not F1? F1 biases toward precision when classes
+ // are imbalanced; a validation set of 10 positives and 1000 negatives would push
+ // the threshold too high. Youden's J treats both types of error equally, which
+ // better reflects real use: missing a relevant article (false negative) is as bad
+ // as showing an irrelevant one (false positive).
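+ // E.g. a threshold giving sensitivity 0.90 and specificity 0.85 scores
+ // J = 0.90 + 0.85 - 1 = 0.75; the sweep below keeps the largest J.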
+ bestYoudenJ := -1.0
+ bestThreshold := 0.5
+ var bestMetrics ClassificationMetrics
+
+ boolLabels := make([]bool, len(validationLabels))
+ for i, l := range validationLabels {
+ boolLabels[i] = l == 1.0
+ }
+
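+ // Sweep thresholds from 0.05 to 0.95 in steps of 0.01.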
+ for i := 5; i <= 95; i++ {
+ threshold := float64(i) / 100.0
+ metrics := computeMetrics(scores, boolLabels, threshold)
+
+ sensitivity := metrics.Recall // TPR: TP / (TP + FN)
+ specificity := 0.0
+ if metrics.TrueNegatives+metrics.FalsePositives > 0 {
+ specificity = float64(metrics.TrueNegatives) / float64(metrics.TrueNegatives+metrics.FalsePositives)
+ }
+ youdenJ := sensitivity + specificity - 1.0
+
+ if youdenJ > bestYoudenJ {
+ bestYoudenJ = youdenJ
+ bestThreshold = threshold
+ bestMetrics = metrics
+ }
+ }
+
+ distributions := map[string]any{
+ "positive": posStats,
+ "negative": negStats,
+ "cohens_d": cohensD,
+ "separation_ratio": separationRatio,
+ "best_f1": bestMetrics.F1Score,
+ "best_precision": bestMetrics.Precision,
+ "best_recall": bestMetrics.Recall,
+ }
+
+ return bestThreshold, distributions
+}
+
+// computeScoreStats computes min, max, mean, and std for a slice of scores
+func computeScoreStats(scores []float64) map[string]float64 {
+ if len(scores) == 0 {
+ return map[string]float64{
+ "min": 0.0,
+ "max": 0.0,
+ "mean": 0.0,
+ "std": 0.0,
+ }
+ }
+
+ min, max := scores[0], scores[0]
+ sum := 0.0
+
+ for _, score := range scores {
+ if score < min {
+ min = score
+ }
+ if score > max {
+ max = score
+ }
+ sum += score
+ }
+
+ mean := sum / float64(len(scores))
+
+ // Calculate the population standard deviation (divides by N, not N-1)
+ variance := 0.0
+ for _, score := range scores {
+ diff := score - mean
+ variance += diff * diff
+ }
+ variance /= float64(len(scores))
+ std := math.Sqrt(variance)
+
+ return map[string]float64{
+ "min": min,
+ "max": max,
+ "mean": mean,
+ "std": std,
+ }
+}
+
+// computeMetrics tallies the confusion matrix at the given threshold; scores
+// strictly above the threshold count as positive predictions.
+func computeMetrics(scores []float64, labels []bool, threshold float64) ClassificationMetrics {
+ var metrics ClassificationMetrics
+ for i, score := range scores {
+ predicted := score > threshold
+ actual := labels[i]
+
+ if predicted && actual {
+ metrics.TruePositives++
+ } else if predicted && !actual {
+ metrics.FalsePositives++
+ } else if !predicted && actual {
+ metrics.FalseNegatives++
+ } else {
+ metrics.TrueNegatives++
+ }
+ }
+ metrics.Calculate()
+ return metrics
+}
+
+// ============================================================================
+// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓
+// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓
+// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛
+// ============================================================================
+
+// shortURL formats a URL to be human-readable and not too long
+func shortURL(urlStr string) string {
+ u, err := url.Parse(urlStr)
+ if err != nil {
+ return urlStr
+ }
+
+ path := u.Path
+ if len(path) > 30 {
+ path = path[:30] + "..."
+ }
+
+ return u.Host + path
+}
diff --git a/cmds/train_test.go b/cmds/train_test.go
new file mode 100644
index 0000000..8298494
--- /dev/null
+++ b/cmds/train_test.go
@@ -0,0 +1,66 @@
+package cmds
+
+import (
+ "scholscan/core"
+ "strings"
+ "testing"
+)
+
+// test RSS parsing
+func TestParseRSSFeed(t *testing.T) {
+ rssXML := `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+<channel>
+<title>Test Feed</title>
+<item>
+<title>Test Article 1</title>
+<link>https://example.com/article1</link>
+<description>This is a test article with some content.</description>
+</item>
+<item>
+<title>Test Article 2</title>
+<link>https://example.com/article2</link>
+<content><![CDATA[<p>This is content with <b>HTML</b> tags.</p>]]></content>
+</item>
+</channel>
+</rss>`
+
+ articles, err := ParseRSSFeed([]byte(rssXML), "https://example.com/feed")
+ if err != nil {
+ t.Fatalf("Failed to parse RSS feed: %v", err)
+ }
+
+ if len(articles) != 2 {
+ t.Fatalf("Expected 2 articles, got %d", len(articles))
+ }
+
+ if articles[0].Title != "Test Article 1" {
+ t.Errorf("Expected title 'Test Article 1', got '%s'", articles[0].Title)
+ }
+ if articles[0].URL != "https://example.com/article1" {
+ t.Errorf("Expected URL 'https://example.com/article1', got '%s'", articles[0].URL)
+ }
+ if articles[0].Content != "This is a test article with some content." {
+ t.Errorf("Expected content 'This is a test article with some content.', got '%s'", articles[0].Content)
+ }
+
+ if articles[1].Title != "Test Article 2" {
+ t.Errorf("Expected title 'Test Article 2', got '%s'", articles[1].Title)
+ }
+ if articles[1].Content != "This is content with HTML tags." {
+ t.Errorf("Expected 'This is content with HTML tags.', got '%s'", articles[1].Content)
+ }
+}
+
+func TestCleanFeedContent(t *testing.T) {
+ longInput := strings.Repeat("test content ", 500) // 6,500 bytes
+ result := core.CleanFeedContent(longInput)
+
+ // Assert the content was actually shortened by truncation.
+ if len(result) >= len(longInput) {
+ t.Errorf("Expected content truncated below the %d-byte input, got %d bytes", len(longInput), len(result))
+ }
+
+ if !strings.HasSuffix(result, "...") {
+ t.Errorf("Expected truncated content to end with '...', got '%s'", result[len(result)-3:])
+ }
+}
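+
+// TestShortURL sanity-checks shortURL: long paths are truncated with an
+// ellipsis while the host is preserved.
+func TestShortURL(t *testing.T) {
+ long := "https://example.com/" + strings.Repeat("a", 40)
+ got := shortURL(long)
+ if !strings.HasPrefix(got, "example.com/") {
+ t.Errorf("Expected result to start with the host, got '%s'", got)
+ }
+ if !strings.HasSuffix(got, "...") {
+ t.Errorf("Expected truncated path to end with '...', got '%s'", got)
+ }
+}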