author     Sam Scholten  2025-12-15 19:34:17 +1000
committer  Sam Scholten  2025-12-15 19:34:59 +1000
commit     9f5978186ac3de07f4325975fecf4f538fe713b6 (patch)
tree       41440b703054fe59eb561ba81d80fd60380c1f7a /cmds
download   scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.tar.gz
           scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.zip

Init v0.1.0
Diffstat (limited to 'cmds')
-rw-r--r--  cmds/scan.go                    416
-rw-r--r--  cmds/serve.go                  1010
-rw-r--r--  cmds/templates/live-feed.html   158
-rw-r--r--  cmds/templates/results.html     279
-rw-r--r--  cmds/templates/tools.html       202
-rw-r--r--  cmds/train.go                   841
-rw-r--r--  cmds/train_test.go               66
7 files changed, 2972 insertions, 0 deletions
diff --git a/cmds/scan.go b/cmds/scan.go
new file mode 100644
index 0000000..789157c
--- /dev/null
+++ b/cmds/scan.go
@@ -0,0 +1,416 @@
+// Scan command: filters articles using a trained model.
+//
+// Takes articles from an RSS feed, plain text, or JSONL; scores them and
+// outputs those at or above the threshold. Processes in batches (default 50)
+// to allow continuous streaming.
+package cmds
+
+import (
+ "bufio"
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+
+// ============================================================================
+// ┏━╸┏━┓┏┳┓┏┳┓┏━┓┏┓╻╺┳┓
+// ┃ ┃ ┃┃┃┃┃┃┃┣━┫┃┗┫ ┃┃
+// ┗━╸┗━┛╹ ╹╹ ╹╹ ╹╹ ╹╺┻┛
+// ============================================================================
+
+
+// ScanCommand scores articles with a trained model and outputs results at or above the threshold.
+type ScanCommand struct {
+ URL string
+ FromText bool
+ FromArticles bool
+
+ ModelPath string
+ Threshold string
+
+ MinTitleLength int
+ ChunkSize int
+
+ EventsOut string
+ MetricsOut string
+ Verbose bool
+}
+
+func (c *ScanCommand) Name() string { return "scan" }
+
+func (c *ScanCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+ fmt.Fprint(fs.Output(), `Usage: scholscan scan [options]
+
+Fetches articles, scores them with the model, and outputs those at or above the threshold.
+
+Source options (exactly one required):
+ --url <feed_url> Fetch articles from RSS/Atom feed
+ --from-text Extract URLs from text on stdin
+ --from-articles Use Article JSONL from stdin directly
+
+Model and filtering:
+ --model <path> Path to trained model JSON file (required)
+ --threshold <float> Score threshold (if not provided, uses model's recommended threshold)
+
+Options:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Examples:
+ scholscan scan --url "http://some.blog/rss.xml" --model model.json > interesting.jsonl
+ echo "see https://example.com" | scholscan scan --from-text --model model.json
+ cat articles.jsonl | scholscan scan --from-articles --model model.json
+`)
+ }
+
+ fs.StringVar(&c.URL, "url", "", "RSS/Atom feed URL to fetch")
+ fs.BoolVar(&c.FromText, "from-text", false, "Extract URLs from text on stdin")
+ fs.BoolVar(&c.FromArticles, "from-articles", false, "Use Article JSONL from stdin")
+ fs.StringVar(&c.ModelPath, "model", "", "Path to trained model JSON file (required)")
+ fs.StringVar(&c.Threshold, "threshold", "", "Score threshold for filtering (if not provided, uses model's recommended threshold)")
+ fs.IntVar(&c.MinTitleLength, "min-title-length", core.MinTitleLength, "Minimum title length to consider valid")
+ fs.IntVar(&c.ChunkSize, "chunk-size", core.DefaultChunkSize, "Number of articles to process in each batch")
+ fs.StringVar(&c.EventsOut, "events-out", "events.jsonl", "Write per-article events to a JSONL file")
+ fs.StringVar(&c.MetricsOut, "metrics-out", "metrics.json", "Write summary metrics to a JSON file")
+ fs.BoolVar(&c.Verbose, "verbose", false, "Show progress information")
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ }
+
+ if fs.NArg() != 0 {
+ return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
+ }
+
+	// exactly one source option required
+ sourceCount := 0
+ if c.URL != "" {
+ sourceCount++
+ }
+ if c.FromText {
+ sourceCount++
+ }
+ if c.FromArticles {
+ sourceCount++
+ }
+
+ if sourceCount == 0 {
+ return fmt.Errorf("exactly one source option must be specified: --url, --from-text, or --from-articles")
+ }
+ if sourceCount > 1 {
+ return fmt.Errorf("only one source option may be specified: --url, --from-text, or --from-articles")
+ }
+
+ if c.ModelPath == "" {
+ return fmt.Errorf("--model flag is required")
+ }
+
+ // prevent dir traversal
+ if strings.Contains(filepath.Clean(c.ModelPath), "..") {
+ return fmt.Errorf("invalid model path: directory traversal not allowed")
+ }
+
+ if c.URL != "" {
+ if _, err := url.Parse(c.URL); err != nil {
+ return fmt.Errorf("invalid URL format: %w", err)
+ }
+ }
+
+ return nil
+}
+
+// Run runs the scan: load the model, decide on a threshold, get articles, then score them in chunks.
+// We bail out early on config problems but try to keep going even if some articles fail to fetch.
+func (c *ScanCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ if c.Verbose {
+ log.SetOutput(os.Stderr)
+ log.Println("Starting scan workflow...")
+ log.Printf("Source: %v", c.getSourceDescription())
+ log.Printf("Model: %s", c.ModelPath)
+ }
+
+ model, err := c.loadModel()
+ if err != nil {
+ return fmt.Errorf("failed to load model: %w", err)
+ }
+
+ threshold, err := c.getThreshold(model)
+ if err != nil {
+ return fmt.Errorf("failed to determine threshold: %w", err)
+ }
+
+ if c.Verbose {
+ log.Printf("Using threshold: %.3f", threshold)
+ }
+
+ var articles []*core.Article
+ if c.FromArticles {
+ articles, err = c.readArticlesFromStdin(stdin)
+ } else {
+		articles, err = c.fetchArticles(stdin)
+ }
+ if err != nil {
+ return fmt.Errorf("failed to get articles: %w", err)
+ }
+
+ if c.Verbose {
+ log.Printf("Processing %d articles", len(articles))
+ }
+
+ // process articles in chunks
+	return c.processArticles(articles, model, threshold, stdout)
+}
+
+
+// ============================================================================
+// ┏┳┓┏━┓╺┳┓┏━╸╻ ┏┓ ┏━╸┏━┓┏┓╻┏━╸╻┏━╸
+// ┃┃┃┃ ┃ ┃┃┣╸ ┃ ┃╺╋╸ ┃ ┃ ┃┃┗┫┣╸ ┃┃╺┓
+// ╹ ╹┗━┛╺┻┛┗━╸┗━╸ ┗━┛ ┗━╸┗━┛╹ ╹╹ ╹┗━┛
+// ============================================================================
+
+
+
+func (c *ScanCommand) getSourceDescription() string {
+ if c.URL != "" {
+ return fmt.Sprintf("RSS feed: %s", c.URL)
+ }
+ if c.FromText {
+ return "text from stdin"
+ }
+ if c.FromArticles {
+ return "articles from stdin"
+ }
+ return "unknown"
+}
+
+// loadModel reads and parses the model JSON file.
+// The envelope contains weights, vocabulary, and optionally a recommended threshold.
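+// A minimal envelope might look like this (illustrative only; the exact
+// JSON field names are defined by core.ModelEnvelope, not here):
+//
+//	{
+//	  "weights": [0.12, -0.03],
+//	  "vocabulary": {"quantum": 0, "sensing": 1},
+//	  "meta": {"recommended_threshold": 0.62}
+//	}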
+func (c *ScanCommand) loadModel() (*core.ModelEnvelope, error) {
+ f, err := os.Open(c.ModelPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
+ }
+ defer f.Close()
+
+ var model core.ModelEnvelope
+ if err := json.NewDecoder(f).Decode(&model); err != nil {
+ return nil, fmt.Errorf("failed to decode model: %w", err)
+ }
+
+ return &model, nil
+}
+
+func (c *ScanCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
+	if c.Threshold != "" {
+		threshold, err := strconv.ParseFloat(c.Threshold, 64)
+		if err != nil {
+			// A user-supplied threshold that fails to parse is an error,
+			// not a silent fall-through to the model default.
+			return 0, fmt.Errorf("invalid threshold %q: %w", c.Threshold, err)
+		}
+		return threshold, nil
+	}
+
+ if model.Meta != nil {
+ if meta, ok := model.Meta["recommended_threshold"].(float64); ok {
+ return meta, nil
+ }
+ }
+
+ return core.DefaultScoreThreshold, nil
+}
+
+// ============================================================================
+// ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸ ┏━┓┏━┓┏━╸┏━┓
+// ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓┣┳┛┃ ┗━┓
+// ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸ ┗━┛╹┗╸┗━╸┗━┛
+// ============================================================================
+
+
+func (c *ScanCommand) fetchArticles(stdin io.Reader) ([]*core.Article, error) {
+	if c.FromText {
+		return c.extractURLsFromText(stdin)
+ }
+ if c.URL != "" {
+ return c.fetchRSSFeed(c.URL)
+ }
+ return nil, fmt.Errorf("no valid source specified")
+}
+
+// extractURLsFromText pulls URLs from plain text on stdin.
+// We create minimal Article objects since only the URL is needed for scoring.
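+// For example (illustrative), the input line
+//
+//	see https://example.com/a and https://example.com/b
+//
+// yields two Articles whose URLs are the two links.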
+func (c *ScanCommand) extractURLsFromText(stdin io.Reader) ([]*core.Article, error) {
+ var urls []string
+ s := bufio.NewScanner(stdin)
+ for s.Scan() {
+ line := s.Text()
+ // url extraction
+ fields := strings.Fields(line)
+ for _, field := range fields {
+ if strings.HasPrefix(field, "http://") || strings.HasPrefix(field, "https://") {
+ urls = append(urls, field)
+ }
+ }
+ }
+
+ // create Article objs for URLs
+ articles := make([]*core.Article, len(urls))
+	for i, u := range urls {
+		articles[i] = &core.Article{
+			URL:     u,
+			Title:   fmt.Sprintf("Article from %s", u),
+			Content: "",
+ }
+ }
+
+ return articles, s.Err()
+}
+
+// fetchRSSFeed fetches and parses a single RSS feed, bounded by core.DefaultHTTPTimeout.
+// We skip articles with short titles since they're usually noise or truncated.
+func (c *ScanCommand) fetchRSSFeed(url string) ([]*core.Article, error) {
+ client := &http.Client{Timeout: core.DefaultHTTPTimeout}
+
+	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("error building request: %w", err)
+	}
+	req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+	resp, err := client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", url, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", url, err)
+ }
+
+ // parse feed
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(strings.NewReader(string(body)))
+ if err != nil {
+ return nil, fmt.Errorf("error parsing feed from %s: %w", url, err)
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ article := &core.Article{
+ URL: item.Link,
+ Title: strings.TrimSpace(item.Title),
+ }
+
+ if len(article.Title) >= c.MinTitleLength {
+ articles = append(articles, article)
+ }
+ }
+
+ return articles, nil
+}
+
+// readArticlesFromStdin reads Article objects from JSONL on stdin.
+// Malformed lines are skipped to allow partial processing of corrupted input.
+func (c *ScanCommand) readArticlesFromStdin(stdin io.Reader) ([]*core.Article, error) {
+	var articles []*core.Article
+	scanner := bufio.NewScanner(stdin)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" {
+			continue
+		}
+		var article core.Article
+		if err := json.Unmarshal([]byte(line), &article); err != nil {
+			// Skip malformed lines; a json.Decoder would loop forever here
+			// because it never advances past invalid input.
+			continue
+		}
+		if len(article.Title) >= c.MinTitleLength {
+			articles = append(articles, &article)
+		}
+	}
+	return articles, scanner.Err()
+}
+
+
+
+// ============================================================================
+// ┏━┓┏━┓┏━┓┏━╸┏━╸┏━┓┏━┓ ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸┏━┓
+// ┣━┛┣┳┛┃ ┃┃ ┣╸ ┗━┓┗━┓ ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓
+// ╹ ╹┗╸┗━┛┗━╸┗━╸┗━┛┗━┛ ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸┗━┛
+// ============================================================================
+
+
+// processArticles handles scoring and filtering in batches to keep memory usage predictable.
+// Scoring errors don't crash the process; we log them and continue with the next article.
+func (c *ScanCommand) processArticles(articles []*core.Article, model *core.ModelEnvelope, threshold float64, stdout io.Writer) error {
+ vectorizer := core.CreateVectorizerFromModel(model)
+
+ encoder := json.NewEncoder(stdout)
+
+ // process each batch
+ for i := 0; i < len(articles); i += c.ChunkSize {
+ end := i + c.ChunkSize
+ if end > len(articles) {
+ end = len(articles)
+ }
+
+ chunk := articles[i:end]
+ if c.Verbose {
+ log.Printf("Processing chunk %d-%d of %d articles", i+1, end, len(articles))
+ }
+
+ // calc score for batch
+ docs := make([]string, len(chunk))
+ for j, article := range chunk {
+ docs[j] = strings.TrimSpace(article.Title)
+ }
+
+ vectors := vectorizer.Transform(docs)
+ scores := make([]float64, len(chunk))
+
+ for j, vector := range vectors {
+ score, err := core.PredictScore(vector, model.Weights)
+ if err != nil {
+ log.Printf("Error computing score for article %d: %v", i+j, err)
+ scores[j] = 0.0
+ } else {
+ scores[j] = score
+ }
+ }
+
+ for j, article := range chunk {
+ score := scores[j]
+ article.Score = &score
+
+ if score >= threshold {
+ if err := encoder.Encode(article); err != nil {
+ log.Printf("Error encoding article: %v", err)
+ }
+ }
+ }
+ }
+
+ if c.Verbose {
+ log.Println("Scan complete")
+ }
+
+ return nil
+}
diff --git a/cmds/serve.go b/cmds/serve.go
new file mode 100644
index 0000000..92aa64c
--- /dev/null
+++ b/cmds/serve.go
@@ -0,0 +1,1010 @@
+// Serve command: HTTP server for web UI and APIs.
+//
+// Two main flows: live-feed (cached + background refresh) and tools (on-demand scoring).
+// Live-feed rescans all configured RSS feeds on a timer (default 24h), caches results,
+// serves filtered articles via web UI and JSON/RSS APIs.
+// Tools provides real-time /score (single title) and /scan (ad-hoc feed) endpoints.
+// Background refresh continues despite individual feed failures; RWMutex allows
+// many concurrent readers with exclusive writer updates.
+// Templates are embedded for single-binary deployment.
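+//
+// A typical session (illustrative; flag values and host are assumptions):
+//
+//	scholscan serve --port 8080 --rss-world rss_world.txt --model model.json
+//	curl http://localhost:8080/api/health
+//	curl http://localhost:8080/api/filtered/feed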
+package cmds
+
+import (
+ "bufio"
+ "context"
+ "embed"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "html/template"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+ "os/signal"
+ "path/filepath"
+ "regexp"
+ "sort"
+ "strings"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/PuerkitoBio/goquery"
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+//go:embed templates/*.html
+var templateFS embed.FS
+
+// ============================================================================
+// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
+// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
+// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
+// ============================================================================
+
+type ServeCommand struct {
+ Port int
+ RSSWorldPath string
+ RefreshInterval string
+ ModelPath string
+ Title string
+
+ // Parsed interval
+ refreshInterval time.Duration
+ // Loaded model (cached)
+ model *core.ModelEnvelope
+ modelMu sync.RWMutex
+ // Cached filtered RSS results and timestamp.
+ // RWMutex allows many concurrent readers (HTTP handlers) with exclusive writer (background refresh).
+ filteredResults []*core.Article
+ filteredResultsTime time.Time
+ resultsMu sync.RWMutex
+ // Loaded templates
+ tmpl *template.Template
+}
+
+func (c *ServeCommand) Name() string { return "serve" }
+
+// Init configures the serve command and validates its inputs.
+// It rejects directory-traversal paths, checks the refresh interval,
+// and leaves sensible defaults in place for everything else.
+func (c *ServeCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+		fmt.Fprint(fs.Output(), `Usage: scholscan serve [options]
+
+Start HTTP server for filtered RSS and scoring web UI.
+
+Flags:
+`)
+		fs.PrintDefaults()
+		fmt.Fprint(fs.Output(), `
+Examples:
+  scholscan serve --port 8080 --rss-world rss_world.txt --model model.json
+  scholscan serve --refresh-interval 24h --model ./model.json --rss-world feeds.txt
+`)
+ }
+
+ fs.IntVar(&c.Port, "port", 8080, "Port to listen on")
+ fs.StringVar(&c.RSSWorldPath, "rss-world", "rss_world.txt", "Path to RSS world file (one feed URL per line)")
+ fs.StringVar(&c.RefreshInterval, "refresh-interval", "24h", "Interval for background rescans (e.g., 24h, 1h)")
+ fs.StringVar(&c.ModelPath, "model", "model.json", "Path to trained model JSON file")
+ fs.StringVar(&c.Title, "title", "", "Custom title for the web interface")
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ }
+
+ if fs.NArg() != 0 {
+ return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
+ }
+
+ // Parse refresh interval
+ interval, err := time.ParseDuration(c.RefreshInterval)
+ if err != nil {
+ return fmt.Errorf("invalid refresh-interval %q: %w", c.RefreshInterval, err)
+ }
+ c.refreshInterval = interval
+
+ if strings.Contains(filepath.Clean(c.RSSWorldPath), "..") {
+ return fmt.Errorf("invalid rss-world path: directory traversal not allowed")
+ }
+ if strings.Contains(filepath.Clean(c.ModelPath), "..") {
+ return fmt.Errorf("invalid model path: directory traversal not allowed")
+ }
+
+ return nil
+}
+
+func (c *ServeCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ log.Printf("Starting scholscan server on port %d", c.Port)
+
+ // Initialize filteredResultsTime to server start time
+ c.resultsMu.Lock()
+ c.filteredResultsTime = time.Now()
+ c.resultsMu.Unlock()
+
+ // Load templates at startup
+ tmpl, err := template.ParseFS(templateFS, "templates/*.html")
+ if err != nil {
+ return fmt.Errorf("failed to parse templates: %w", err)
+ }
+ c.tmpl = tmpl
+ log.Printf("Templates loaded successfully")
+
+ // Load model at startup
+ model, err := c.loadModel()
+ if err != nil {
+ return fmt.Errorf("failed to load model at startup: %w", err)
+ }
+ c.modelMu.Lock()
+ c.model = model
+ c.modelMu.Unlock()
+
+ log.Printf("Model loaded successfully")
+
+ // Start background ticker for periodic refresh
+ ticker := time.NewTicker(c.refreshInterval)
+ go c.backgroundRefresh(ticker)
+
+ // Perform initial scan asynchronously
+ go func() {
+ log.Println("Starting initial feed scan...")
+ if err := c.refreshFilteredResults(); err != nil {
+ log.Printf("Warning: initial scan failed: %v", err)
+ } else {
+ c.resultsMu.RLock()
+ count := len(c.filteredResults)
+ c.resultsMu.RUnlock()
+ log.Printf("Initial scan complete, %d articles filtered", count)
+ }
+ }()
+
+ // Setup HTTP handlers
+ http.HandleFunc("/", c.handleRoot)
+ http.HandleFunc("/live-feed", c.handleLiveFeed)
+ http.HandleFunc("/tools", c.handleTools)
+ http.HandleFunc("/score", c.handleScore)
+ http.HandleFunc("/scan", c.handleScan)
+ http.HandleFunc("/api/filtered/feed", c.handleFilteredFeed)
+ http.HandleFunc("/api/filtered/rss", c.handleFilteredRSS)
+ http.HandleFunc("/api/health", c.handleHealth)
+
+ // Setup server with graceful shutdown
+ server := &http.Server{
+ Addr: fmt.Sprintf(":%d", c.Port),
+ Handler: http.DefaultServeMux,
+ ReadTimeout: core.DefaultReadTimeout,
+ WriteTimeout: core.DefaultWriteTimeout,
+ IdleTimeout: core.DefaultIdleTimeout,
+ }
+
+ // Handle shutdown signals
+ sigChan := make(chan os.Signal, 1)
+ signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+ go func() {
+ <-sigChan
+ log.Println("Shutdown signal received")
+ ticker.Stop()
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultShutdownTimeout)
+ defer cancel()
+ if err := server.Shutdown(ctx); err != nil {
+ log.Printf("Server shutdown error: %v", err)
+ }
+ }()
+
+ log.Printf("Server listening on http://localhost:%d", c.Port)
+ if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+ return fmt.Errorf("server error: %w", err)
+ }
+
+ return nil
+}
+
+// ============================================================================
+// ┏━╸┏━┓┏━┓┏━╸ ╻ ┏━┓┏━╸╻┏━╸
+// ┃ ┃ ┃┣┳┛┣╸ ┃ ┃ ┃┃╺┓┃┃
+// ┗━╸┗━┛╹┗╸┗━╸ ┗━╸┗━┛┗━┛╹┗━╸
+// ============================================================================
+
+func (c *ServeCommand) loadModel() (*core.ModelEnvelope, error) {
+ f, err := os.Open(c.ModelPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
+ }
+ defer f.Close()
+
+ var model core.ModelEnvelope
+ if err := json.NewDecoder(f).Decode(&model); err != nil {
+ return nil, fmt.Errorf("failed to decode model: %w", err)
+ }
+
+ return &model, nil
+}
+
+func (c *ServeCommand) scoreArticle(article *core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope) float64 {
+ docs := []string{strings.TrimSpace(article.Title)}
+ vectors := vectorizer.Transform(docs)
+
+ if len(vectors) == 0 || len(vectors[0]) == 0 {
+ return 0.0
+ }
+
+ score, err := core.PredictScore(vectors[0], model.Weights)
+ if err != nil {
+ // Return 0.0 on error (below threshold). Malformed articles don't break the display,
+ // they just get filtered out. Log the error for diagnostics.
+ log.Printf("Error scoring article: %v", err)
+ return 0.0
+ }
+
+ return score
+}
+
+func (c *ServeCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
+ if model.Meta != nil {
+ if threshold, ok := model.Meta["recommended_threshold"].(float64); ok {
+ return threshold, nil
+ }
+ }
+ return core.DefaultScoreThreshold, nil
+}
+
+// scoreAndFormatArticles scores a list of articles and returns them formatted for templates.
+// Articles are scored using the model and vectorizer, then returned with human-readable ratings.
+func (c *ServeCommand) scoreAndFormatArticles(articles []*core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope, threshold float64) []map[string]interface{} {
+ type ArticleResponse struct {
+ Title string `json:"title"`
+ URL string `json:"url"`
+ Source string `json:"source,omitempty"`
+ Rating int `json:"rating"`
+ Score float64 `json:"score"`
+ }
+
+ scored := make([]ArticleResponse, 0, len(articles))
+ for _, article := range articles {
+ score := c.scoreArticle(article, vectorizer, model)
+ rating := core.ScoreToScale(score, threshold)
+
+ scored = append(scored, ArticleResponse{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ })
+ }
+
+ result := make([]map[string]interface{}, len(scored))
+ for i, a := range scored {
+ result[i] = map[string]interface{}{
+ "Title": a.Title,
+ "URL": a.URL,
+ "Source": a.Source,
+ "Rating": a.Rating,
+ "Score": a.Score,
+ }
+ }
+ return result
+}
+
+// ============================================================================
+// ┏━┓┏━┓┏━┓ ┏━┓╺┳╸╻ ╻┏━╸┏━╸
+// ┣┳┛┗━┓┗━┓ ┗━┓ ┃ ┃ ┃┣╸ ┣╸
+// ╹┗╸┗━┛┗━┛ ┗━┛ ╹ ┗━┛╹ ╹
+// ============================================================================
+
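+// readRSSWorldFeeds loads feed URLs from the rss-world file: one URL per
+// line; blank lines and #-prefixed comment lines are skipped. For example
+// (illustrative URLs):
+//
+//	# physics blogs
+//	https://example.com/feed.xml
+//	https://another.example.org/rss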
+func (c *ServeCommand) readRSSWorldFeeds() ([]string, error) {
+ f, err := os.Open(c.RSSWorldPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open rss_world file %s: %w", c.RSSWorldPath, err)
+ }
+ defer f.Close()
+
+ var feeds []string
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ feeds = append(feeds, line)
+ }
+ }
+
+ if err := scanner.Err(); err != nil {
+ return nil, fmt.Errorf("error reading rss_world file: %w", err)
+ }
+
+ return feeds, nil
+}
+
+func (c *ServeCommand) refreshFilteredResults() error {
+ feeds, err := c.readRSSWorldFeeds()
+ if err != nil {
+ return err
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ return fmt.Errorf("model not loaded")
+ }
+
+ // Scan all feeds. Continue on individual feed failures to maximize results.
+ // RSS feeds are often flaky; one down shouldn't prevent others from being processed.
+ var allArticles []*core.Article
+ for _, feed := range feeds {
+ articles, err := c.fetchRSSFeed(feed)
+ if err != nil {
+ log.Printf("Warning: failed to fetch feed %s: %v", feed, err)
+ continue
+ }
+ allArticles = append(allArticles, articles...)
+ }
+
+ // Score and filter articles
+ threshold, err := c.getThreshold(model)
+ if err != nil {
+ return err
+ }
+
+ vectorizer := core.CreateVectorizerFromModel(model)
+
+ filtered := make([]*core.Article, 0, len(allArticles))
+ for _, article := range allArticles {
+ score := c.scoreArticle(article, vectorizer, model)
+ if score >= threshold {
+ // Create a copy with score to avoid reference issues
+ articleCopy := *article
+ articleCopy.Score = &score
+ filtered = append(filtered, &articleCopy)
+ }
+ }
+
+ c.resultsMu.Lock()
+ c.filteredResults = filtered
+ c.filteredResultsTime = time.Now()
+ c.resultsMu.Unlock()
+
+ return nil
+}
+
+// backgroundRefresh runs in a goroutine, rescanning all RSS feeds on interval.
+// Failures in individual feeds don't affect others; we log and continue.
+func (c *ServeCommand) backgroundRefresh(ticker *time.Ticker) {
+ for range ticker.C {
+ log.Println("Background refresh started")
+ if err := c.refreshFilteredResults(); err != nil {
+ log.Printf("Background refresh error (continuing): %v", err)
+ } else {
+ c.resultsMu.RLock()
+ count := len(c.filteredResults)
+ c.resultsMu.RUnlock()
+ log.Printf("Background refresh complete, %d articles filtered", count)
+ }
+ }
+}
+
+func (c *ServeCommand) fetchRSSFeed(url string) ([]*core.Article, error) {
+ client := &http.Client{Timeout: core.DefaultHTTPTimeout}
+
+	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, fmt.Errorf("error building request: %w", err)
+	}
+	req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+	resp, err := client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", url, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", url, err)
+ }
+
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(strings.NewReader(string(body)))
+ if err != nil {
+ return nil, fmt.Errorf("error parsing feed from %s: %w", url, err)
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ article := &core.Article{
+ URL: item.Link,
+ Title: strings.TrimSpace(item.Title),
+ Source: feed.Title,
+ }
+
+ if item.PublishedParsed != nil {
+ article.PublishedAt = item.PublishedParsed
+ }
+
+ if len(article.Title) >= core.MinTitleLength {
+ articles = append(articles, article)
+ }
+ }
+
+ return articles, nil
+}
+
+// ============================================================================
+// ╻ ╻┏━╸┏┓ ╻ ╻╻
+// ┃╻┃┣╸ ┣┻┓ ┃ ┃┃
+// ┗┻┛┗━╸┗━┛ ┗━┛╹
+// ============================================================================
+
+func (c *ServeCommand) handleRoot(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/" {
+ http.NotFound(w, r)
+ return
+ }
+
+ // Redirect to live feed
+ http.Redirect(w, r, "/live-feed", http.StatusMovedPermanently)
+}
+
+func (c *ServeCommand) handleLiveFeed(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ resultsTime := c.filteredResultsTime
+ c.resultsMu.RUnlock()
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ threshold, _ := c.getThreshold(model)
+
+ // Parse filter parameter (day, week, or all)
+ filter := r.URL.Query().Get("filter")
+ if filter == "" {
+ filter = "all"
+ }
+
+ // Filter articles by date if needed
+ now := time.Now()
+ filtered := articles
+ if filter == "day" || filter == "week" {
+ var cutoff time.Time
+ if filter == "day" {
+ cutoff = now.Add(-24 * time.Hour)
+ } else if filter == "week" {
+ cutoff = now.Add(-7 * 24 * time.Hour)
+ }
+
+ filtered = make([]*core.Article, 0, len(articles))
+ for _, article := range articles {
+ // Always include articles without PublishedAt
+ if article.PublishedAt == nil || article.PublishedAt.After(cutoff) {
+ filtered = append(filtered, article)
+ }
+ }
+ }
+
+ // Convert articles to template format
+ type TemplateArticle struct {
+ Title string
+ URL string
+ Source string
+ Rating int
+ Score float64
+ PublishedAt string
+ }
+
+ templateArticles := make([]TemplateArticle, 0, len(filtered))
+ for _, article := range filtered {
+ score := 0.0
+ if article.Score != nil {
+ score = *article.Score
+ }
+ rating := core.ScoreToScale(score, threshold)
+
+ publishedAt := ""
+ if article.PublishedAt != nil {
+ publishedAt = article.PublishedAt.Format("2006-01-02")
+ }
+
+ templateArticles = append(templateArticles, TemplateArticle{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ PublishedAt: publishedAt,
+ })
+ }
+
+ // Sort articles by score (highest first)
+ sort.Slice(templateArticles, func(i, j int) bool {
+ return templateArticles[i].Score > templateArticles[j].Score
+ })
+
+ data := map[string]interface{}{
+ "Page": "live-feed",
+ "Articles": templateArticles,
+ "Threshold": threshold,
+ "UpdatedAt": resultsTime.Format("2006-01-02 15:04:05"),
+ "Filter": filter,
+ "Title": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "live-feed", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleTools(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "Title": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "tools", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleScore(w http.ResponseWriter, r *http.Request) {
+ if r.Method == http.MethodGet {
+ c.handleTools(w, r)
+ return
+ }
+
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ if err := r.ParseForm(); err != nil {
+ http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
+ return
+ }
+
+	title := strings.TrimSpace(r.FormValue("title"))
+	rawURL := strings.TrimSpace(r.FormValue("url"))
+
+	// If URL provided, fetch and extract title from it; otherwise use provided title.
+	if rawURL != "" {
+		extractedTitle, err := extractTitleFromURL(rawURL)
+ if err != nil {
+ c.renderResultsError(w, fmt.Sprintf("Failed to extract title from URL: %v", err), title)
+ return
+ }
+ title = extractedTitle
+ }
+
+ // Validate input before scoring
+ if valErr := c.validateTitle(title); valErr != "" {
+ c.renderResultsError(w, valErr, title)
+ return
+ }
+
+ vectorizer := core.CreateVectorizerFromModel(model)
+ article := &core.Article{Title: title}
+ score := c.scoreArticle(article, vectorizer, model)
+
+ threshold, _ := c.getThreshold(model)
+ rating := core.ScoreToScale(score, threshold)
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScoreResult": true,
+ "Title": title,
+ "Rating": rating,
+ "Score": score,
+ "Threshold": threshold,
+ "PageTitle": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleScan(w http.ResponseWriter, r *http.Request) {
+ if r.Method == http.MethodGet {
+ c.handleTools(w, r)
+ return
+ }
+
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ if err := r.ParseForm(); err != nil {
+ http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
+ return
+ }
+
+ feedURL := strings.TrimSpace(r.FormValue("feed_url"))
+
+ // Validate and fetch the feed
+ if valErr := c.validateFeedURL(feedURL); valErr != "" {
+ c.renderScanResultsError(w, valErr, feedURL)
+ return
+ }
+
+ articles, err := c.fetchRSSFeed(feedURL)
+ if err != nil {
+ c.renderScanResultsError(w, fmt.Sprintf("Failed to fetch feed: %v", err), feedURL)
+ return
+ }
+
+ // Score articles
+ threshold, _ := c.getThreshold(model)
+ vectorizer := core.CreateVectorizerFromModel(model)
+ scored := c.scoreAndFormatArticles(articles, vectorizer, model, threshold)
+
+ sort.Slice(scored, func(i, j int) bool {
+ iScore := scored[i]["Score"].(float64)
+ jScore := scored[j]["Score"].(float64)
+ return iScore > jScore
+ })
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScanResult": true,
+ "FeedURL": feedURL,
+ "Articles": scored,
+ "Threshold": threshold,
+ "PageTitle": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+// ============================================================================
+// ┏━┓┏━┓╻ ┏━╸┏┓╻╺┳┓┏━┓┏━┓╻┏┓╻╺┳╸┏━┓
+// ┣━┫┣━┛┃ ┣╸ ┃┗┫ ┃┃┣━┛┃ ┃┃┃┗┫ ┃ ┗━┓
+// ╹ ╹╹ ╹ ┗━╸╹ ╹╺┻┛╹ ┗━┛╹╹ ╹ ╹ ┗━┛
+// ============================================================================
+
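+// handleFilteredFeed serves the cached filtered articles as JSON. The
+// response shape is roughly (illustrative values):
+//
+//	{"total": 1, "threshold": 0.62, "updated_at": "2025-12-15T19:34:17+10:00",
+//	 "articles": [{"title": "...", "url": "...", "rating": 8, "score": 0.91}]}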
+func (c *ServeCommand) handleFilteredFeed(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ resultsTime := c.filteredResultsTime
+ c.resultsMu.RUnlock()
+
+	c.modelMu.RLock()
+	model := c.model
+	c.modelMu.RUnlock()
+
+	if model == nil {
+		// getThreshold dereferences the model, so guard against a nil
+		// model here just like handleLiveFeed does.
+		http.Error(w, "Model not loaded", http.StatusInternalServerError)
+		return
+	}
+
+	threshold, _ := c.getThreshold(model)
+
+ type ArticleResponse struct {
+ Title string `json:"title"`
+ URL string `json:"url"`
+ Source string `json:"source,omitempty"`
+ Rating int `json:"rating"`
+ Score float64 `json:"score"`
+ }
+
+ scored := make([]ArticleResponse, 0, len(articles))
+ for _, article := range articles {
+ score := 0.0
+ if article.Score != nil {
+ score = *article.Score
+ }
+ rating := core.ScoreToScale(score, threshold)
+
+ scored = append(scored, ArticleResponse{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ })
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate")
+
+ if err := json.NewEncoder(w).Encode(map[string]interface{}{
+ "total": len(articles),
+ "threshold": threshold,
+ "updated_at": resultsTime,
+ "articles": scored,
+ }); err != nil {
+ http.Error(w, "Failed to encode response", http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleFilteredRSS(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ c.resultsMu.RUnlock()
+
+	c.modelMu.RLock()
+	model := c.model
+	c.modelMu.RUnlock()
+
+	if model == nil {
+		http.Error(w, "Model not loaded", http.StatusInternalServerError)
+		return
+	}
+
+	threshold, _ := c.getThreshold(model)
+
+	w.Header().Set("Content-Type", "application/rss+xml")
+	w.Header().Set("Cache-Control", "public, max-age=3600")
+
+ // Generate RSS feed
+ fmt.Fprintf(w, `<?xml version="1.0" encoding="UTF-8"?>
+ <rss version="2.0">
+ <channel>
+ <title>%s - Filtered Articles</title>
+ <link>http://scholscan.local</link>
+ <description>Articles filtered by your learned preferences (scored 1-10)</description>
+ `, displayTitle(c.Title))
+
+ for _, article := range articles {
+ rawScore := 0.0
+ if article.Score != nil {
+ rawScore = *article.Score
+ }
+
+		scaledScore := core.ScoreToScale(rawScore, threshold)
+
+ title := escapeXML(article.Title)
+ url := escapeXML(article.URL)
+ description := fmt.Sprintf("SCHOLSCAN SCORE = %d/10 (raw: %.3f)", scaledScore, rawScore)
+
+ fmt.Fprintf(w, ` <item>
+ <title>%s</title>
+ <link>%s</link>
+ <description>%s</description>
+ </item>
+ `, title, url, description)
+ }
+
+ fmt.Fprint(w, ` </channel>
+ </rss>`)
+}
+
+func (c *ServeCommand) handleHealth(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ modelLoaded := c.model != nil
+ c.modelMu.RUnlock()
+
+ status := "ok"
+ if !modelLoaded {
+ status = "model_not_loaded"
+ w.WriteHeader(http.StatusInternalServerError)
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ if err := json.NewEncoder(w).Encode(map[string]interface{}{
+ "status": status,
+ "model_loaded": modelLoaded,
+ "timestamp": time.Now().Unix(),
+ }); err != nil {
+ http.Error(w, "Failed to encode response", http.StatusInternalServerError)
+ }
+}
+
+// ============================================================================
+// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓
+// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓
+// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛
+// ============================================================================
+
+func displayTitle(custom string) string {
+ if custom != "" {
+ return custom
+ }
+ return "ScholScan"
+}
+
+// extractTitleFromURL fetches the content from a URL and extracts the title from the HTML.
+// Designed to be resilient: tries multiple title sources, handles various URL formats,
+// and provides meaningful error feedback if extraction fails.
+func extractTitleFromURL(rawURL string) (string, error) {
+ if rawURL == "" {
+ return "", fmt.Errorf("empty URL")
+ }
+
+ // Check if it's a DOI
+ if strings.HasPrefix(rawURL, "10.") {
+ // Convert DOI to URL
+ rawURL = fmt.Sprintf("https://doi.org/%s", rawURL)
+ } else if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
+ rawURL = "https://" + rawURL
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultContextTimeout)
+ defer cancel()
+
+ req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
+ if err != nil {
+ return "", fmt.Errorf("invalid URL: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+ req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+
+ resp, err := core.DoRequestWithRetry(ctx, core.DefaultHTTPClient, req)
+ if err != nil {
+ return "", fmt.Errorf("failed to fetch URL: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
+ }
+
+ doc, err := goquery.NewDocumentFromReader(resp.Body)
+ if err != nil {
+ return "", fmt.Errorf("failed to parse HTML: %w", err)
+ }
+
+ // Fallback chain: <title> → og:title → twitter:title → <h1>
+ // Different sites populate these differently; trying multiple increases success rate.
+ title := ""
+
+ if t := doc.Find("title").Text(); t != "" {
+ title = strings.TrimSpace(t)
+ }
+
+ if title == "" {
+ if t, exists := doc.Find(`meta[property="og:title"]`).Attr("content"); exists && t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ if t, exists := doc.Find(`meta[name="twitter:title"]`).Attr("content"); exists && t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ if t := doc.Find("h1").First().Text(); t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ return "", fmt.Errorf("could not extract title from page")
+ }
+
+	// Clean up common "Publisher | Title" patterns: drop a leading publisher
+	// prefix first (while the pipes still exist), then normalize any
+	// remaining pipe separators to single spaces.
+	rePub := regexp.MustCompile(`^[^|]*\|\s*`)
+	title = rePub.ReplaceAllString(title, "")
+
+	reClean := regexp.MustCompile(`\s*\|\s*`)
+	title = reClean.ReplaceAllString(title, " ")
+	title = strings.TrimSpace(title)
+
+ if len(title) < 10 {
+ return "", fmt.Errorf("extracted title too short: %q", title)
+ }
+
+ return title, nil
+}
+
+// validateTitle checks that a title is suitable for scoring.
+// Returns an error message string if invalid, empty string if valid.
+func (c *ServeCommand) validateTitle(title string) string {
+ if strings.TrimSpace(title) == "" {
+ return "Title cannot be empty"
+ }
+ if len(title) > 1000 {
+ return "Title too long (max 1000 characters)"
+ }
+ return ""
+}
+
+// renderResultsError renders the results template with an error message.
+func (c *ServeCommand) renderResultsError(w http.ResponseWriter, errMsg, title string) {
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScoreResult": true,
+ "Error": errMsg,
+ "Title": title,
+ "PageTitle": displayTitle(c.Title),
+ }
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+// validateFeedURL checks that a feed URL is non-empty and has valid format.
+// Returns an error message string if invalid, empty string if valid.
+func (c *ServeCommand) validateFeedURL(feedURL string) string {
+ if feedURL == "" {
+ return "Feed URL cannot be empty"
+ }
+ if _, err := url.Parse(feedURL); err != nil {
+ return "Invalid URL format"
+ }
+ return ""
+}
+
+// renderScanResultsError renders the results template with an error for scan operation.
+func (c *ServeCommand) renderScanResultsError(w http.ResponseWriter, errMsg, feedURL string) {
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScanResult": true,
+ "Error": errMsg,
+ "FeedURL": feedURL,
+ "PageTitle": displayTitle(c.Title),
+ }
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
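+// escapeXML escapes the five XML special characters (& < > " ') so titles
+// and URLs can be embedded safely in the generated RSS items.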
+func escapeXML(s string) string {
+ s = strings.ReplaceAll(s, "&", "&amp;")
+ s = strings.ReplaceAll(s, "<", "&lt;")
+ s = strings.ReplaceAll(s, ">", "&gt;")
+ s = strings.ReplaceAll(s, "\"", "&quot;")
+ s = strings.ReplaceAll(s, "'", "&apos;")
+ return s
+}
diff --git a/cmds/templates/live-feed.html b/cmds/templates/live-feed.html
new file mode 100644
index 0000000..1529ee1
--- /dev/null
+++ b/cmds/templates/live-feed.html
@@ -0,0 +1,158 @@
+{{define "live-feed"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.Title}} - Live Feed</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+       SUMMARY, RSS LINK & FEED LIST
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+ .rss-link {
+ background: #f9f9f9;
+ padding: 15px;
+ border: 1px solid #000;
+ margin-bottom: 20px;
+ }
+ .rss-link a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .feed-list {
+ max-height: 600px;
+ overflow-y: auto;
+ border: 1px solid #000;
+ padding: 10px;
+ }
+
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed" class="active">Live Feed</a>
+ <a href="/tools">Score & Scan</a>
+ </div>
+
+ <div class="rss-link">
+ <strong>Filtered RSS Feed:</strong>
+ <a href="/api/filtered/rss" target="_blank">Subscribe to filtered articles</a>
+ <span style="margin-left: 10px; color: #666; font-size: 0.9em;">(rss link for feed readers)</span>
+ <div style="margin-top: 10px; padding-top: 10px; border-top: 1px solid #ccc; color: #666; font-size: 0.9em;">
+ Last updated: <span id="feedTimestamp">{{if .UpdatedAt}}{{.UpdatedAt}}{{else}}—{{end}}</span>
+ </div>
+ </div>
+
+ <div style="margin-bottom: 20px;">
+ <strong>Filter by date:</strong>
+ <div style="margin-top: 8px; display: flex; gap: 10px;">
+ <a href="/live-feed?filter=day" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "day"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 24h</a>
+ <a href="/live-feed?filter=week" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "week"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 7 days</a>
+ <a href="/live-feed?filter=all" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "all"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">All</a>
+ </div>
+ </div>
+
+ <div class="feed-list">
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ {{else if .Articles}}
+ <div class="summary">
+ <strong>{{len .Articles}}</strong> articles (threshold: {{printf "%.2f" .Threshold}})
+ </div>
+ {{$threshold := .Threshold}}
+ {{range .Articles}}
+ {{$isGood := ge .Score $threshold}}
+ {{$bgColor := "white"}}
+ {{if $isGood}}
+ {{$bgColor = "#e8f5e9"}}
+ {{else}}
+ {{$bgColor = "#ffebee"}}
+ {{end}}
+ {{$indicator := "✗"}}
+ {{if $isGood}}
+ {{$indicator = "✓"}}
+ {{end}}
+ <div class="article" style="background-color: {{$bgColor}};">
+ <div style="font-weight: bold;">
+ <a href="{{.URL}}" target="_blank">{{.Title}}</a>
+ </div>
+ <div class="article-meta">
+ Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}}{{if .PublishedAt}} · {{.PublishedAt}}{{end}}
+ </div>
+ </div>
+ {{end}}
+ {{else}}
+ <p>No articles to display</p>
+ {{end}}
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/templates/results.html b/cmds/templates/results.html
new file mode 100644
index 0000000..13f68e0
--- /dev/null
+++ b/cmds/templates/results.html
@@ -0,0 +1,279 @@
+{{define "results"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.PageTitle}} - Results</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+ h2 {
+ font-size: 1em;
+ font-weight: bold;
+ margin-bottom: 15px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ LAYOUT (2-column grid for score-scan)
+ ======================================== */
+ .container {
+ max-width: 1200px;
+ margin: 0 auto;
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 30px;
+ }
+ .section {
+ border: 1px solid #000;
+ padding: 20px;
+ }
+
+ /* ========================================
+ FORMS (input, textarea, button)
+ ======================================== */
+ label {
+ display: block;
+ margin-top: 15px;
+ font-weight: bold;
+ }
+ input, textarea {
+ display: block;
+ width: 100%;
+ margin-top: 5px;
+ padding: 5px;
+ border: 1px solid #000;
+ font-family: monospace;
+ }
+ textarea {
+ resize: vertical;
+ min-height: 80px;
+ }
+ button {
+ margin-top: 15px;
+ padding: 5px 15px;
+ border: 1px solid #000;
+ background: #fff;
+ cursor: pointer;
+ font-family: monospace;
+ }
+ button:hover {
+ background: #000;
+ color: #fff;
+ }
+ button:active {
+ opacity: 0.8;
+ }
+
+ /* ========================================
+ RESULT BOXES
+ ======================================== */
+ .result {
+ margin-top: 20px;
+ padding: 15px;
+ border: 1px solid #000;
+ background: #f5f5f5;
+ }
+ .score {
+ font-size: 3em;
+ font-weight: bold;
+ text-align: center;
+ margin: 20px 0;
+ }
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+       SUMMARY & HELP TEXT
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+
+ small {
+ display: block;
+ margin-top: 5px;
+ color: #666;
+ }
+
+ /* ========================================
+ MOBILE
+ ======================================== */
+ @media (max-width: 960px) {
+ .container {
+ grid-template-columns: 1fr;
+ gap: 20px;
+ }
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.PageTitle}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed">Live Feed</a>
+ <a href="/tools" class="active">Score & Scan</a>
+ </div>
+
+ <div class="container">
+ {{if .IsScoreResult}}
+ <div class="section">
+ <h2>Score Article</h2>
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ <form method="POST" action="/score" style="margin-top: 20px;">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" value="{{.Title}}" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If URL is provided, title will be automatically extracted</small>
+ <button type="submit">Score</button>
+ </form>
+ {{else}}
+ <div class="result">
+ <div class="score">{{.Rating}}/10</div>
+ <p style="text-align: center; color: #666;">Score: {{printf "%.3f" .Score}}</p>
+ <p style="text-align: center; margin-top: 10px; font-size: 0.9em;">{{.Title}}</p>
+ </div>
+ <form method="POST" action="/score" style="margin-top: 20px;">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If URL is provided, title will be automatically extracted</small>
+ <button type="submit">Score Another</button>
+ </form>
+ {{end}}
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ <form method="POST" action="/scan">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan</button>
+ </form>
+ </div>
+
+ {{else if .IsScanResult}}
+ <div class="section">
+ <h2>Score Article</h2>
+ <form method="POST" action="/score">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If URL is provided, title will be automatically extracted</small>
+ <button type="submit">Score</button>
+ </form>
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ <form method="POST" action="/scan" style="margin-top: 20px;">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" value="{{.FeedURL}}" required />
+ <button type="submit">Try Again</button>
+ </form>
+ {{else}}
+ <div class="summary">
+ <strong>{{len .Articles}}</strong> articles from {{.FeedURL}} (threshold: {{printf "%.2f" .Threshold}})
+ </div>
+ <div style="max-height: 500px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+ {{$threshold := .Threshold}}
+ {{range .Articles}}
+ {{$isGood := ge .Score $threshold}}
+ {{$bgColor := "white"}}
+ {{if $isGood}}
+ {{$bgColor = "#e8f5e9"}}
+ {{else}}
+ {{$bgColor = "#ffebee"}}
+ {{end}}
+ {{$indicator := "✗"}}
+ {{if $isGood}}
+ {{$indicator = "✓"}}
+ {{end}}
+ <div class="article" style="background-color: {{$bgColor}};">
+ <div style="font-weight: bold;">
+ <a href="{{.URL}}" target="_blank">{{.Title}}</a>
+ </div>
+ <div class="article-meta">
+ Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}}
+ </div>
+ </div>
+ {{end}}
+ </div>
+ <form method="POST" action="/scan" style="margin-top: 20px;">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan Another</button>
+ </form>
+ {{end}}
+ </div>
+ {{end}}
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/templates/tools.html b/cmds/templates/tools.html
new file mode 100644
index 0000000..def04fe
--- /dev/null
+++ b/cmds/templates/tools.html
@@ -0,0 +1,202 @@
+{{define "tools"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.Title}} - Score & Scan</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+ h2 {
+ font-size: 1em;
+ font-weight: bold;
+ margin-bottom: 15px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ LAYOUT (2-column grid for score-scan)
+ ======================================== */
+ .container {
+ max-width: 1200px;
+ margin: 0 auto;
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 30px;
+ }
+ .section {
+ border: 1px solid #000;
+ padding: 20px;
+ }
+
+ /* ========================================
+ FORMS (input, textarea, button)
+ ======================================== */
+ label {
+ display: block;
+ margin-top: 15px;
+ font-weight: bold;
+ }
+ input, textarea {
+ display: block;
+ width: 100%;
+ margin-top: 5px;
+ padding: 5px;
+ border: 1px solid #000;
+ font-family: monospace;
+ }
+ textarea {
+ resize: vertical;
+ min-height: 80px;
+ }
+ button {
+ margin-top: 15px;
+ padding: 5px 15px;
+ border: 1px solid #000;
+ background: #fff;
+ cursor: pointer;
+ font-family: monospace;
+ }
+ button:hover {
+ background: #000;
+ color: #fff;
+ }
+ button:active {
+ opacity: 0.8;
+ }
+
+ /* ========================================
+ RESULT BOXES
+ ======================================== */
+ .result {
+ margin-top: 20px;
+ padding: 15px;
+ border: 1px solid #000;
+ background: #f5f5f5;
+ }
+ .score {
+ font-size: 3em;
+ font-weight: bold;
+ text-align: center;
+ margin: 20px 0;
+ }
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+       SUMMARY & HELP TEXT
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+
+ small {
+ display: block;
+ margin-top: 5px;
+ color: #666;
+ }
+
+ /* ========================================
+ MOBILE
+ ======================================== */
+ @media (max-width: 960px) {
+ .container {
+ grid-template-columns: 1fr;
+ gap: 20px;
+ }
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed">Live Feed</a>
+ <a href="/tools" class="active">Score & Scan</a>
+ </div>
+
+ <div class="container">
+ <div class="section">
+ <h2>Score Article</h2>
+ <form method="POST" action="/score">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL" style="margin-top: 10px;">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If a URL is provided, the title is extracted automatically</small>
+ <button type="submit">Score</button>
+ </form>
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ <form method="POST" action="/scan">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan</button>
+ </form>
+ </div>
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/train.go b/cmds/train.go
new file mode 100644
index 0000000..e7e8915
--- /dev/null
+++ b/cmds/train.go
@@ -0,0 +1,841 @@
+// Train command: learns a model from positive examples and RSS feeds.
+//
+// Loads positives, fetches RSS feed articles as negatives, excludes overlap,
+// then trains TF-IDF + logistic regression with 1:1 class balancing.
+// Outputs the model, with a validated threshold, to stdout.
+package cmds
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "math"
+ "math/rand"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strings"
+ "time"
+
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+// ============================================================================
+// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
+// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
+// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
+// ============================================================================
+
+// TrainCommand learns a TF-IDF + logistic regression model from positive
+// examples and RSS feeds, and writes the trained model JSON to stdout.
+type TrainCommand struct {
+ positivesFile string
+ rssFeedsFile string
+ verboseOutput bool
+ lambda float64
+ minDF int
+ maxDF float64
+ ngramMax int
+}
+
+func (c *TrainCommand) Name() string { return "train" }
+
+func (c *TrainCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+ fmt.Fprint(fs.Output(), `Usage: scholscan train POSITIVES_FILE --rss-feeds RSS_FEEDS_FILE > model.json
+
+Train a TF-IDF + logistic regression model from positive examples and RSS feeds.
+
+The training workflow:
+ 1. Load positive examples from POSITIVES_FILE
+ 2. Fetch articles from RSS feeds list
+ 3. Exclude any positive examples from RSS feed articles
+ 4. Train model with balanced classes
+ 5. Output trained model to stdout as JSON
+
+Flags:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Arguments:
+ POSITIVES_FILE Path to JSONL file with positive examples (required)
+
+Example:
+ scholscan train positives.jsonl --rss-feeds rss_world.txt > model.json
+`)
+ }
+
+ fs.StringVar(&c.rssFeedsFile, "rss-feeds", "", "Path to text file with RSS feed URLs (required)")
+ fs.BoolVar(&c.verboseOutput, "verbose", false, "Show progress information")
+ fs.Float64Var(&c.lambda, "lambda", 0.001, "L2 regularization parameter for logistic regression")
+ fs.IntVar(&c.minDF, "min-df", 2, "Minimum document frequency (absolute count)")
+ fs.Float64Var(&c.maxDF, "max-df", 0.8, "Maximum document frequency (ratio, 0-1)")
+ fs.IntVar(&c.ngramMax, "ngram-max", 2, "Maximum n-gram size (e.g., 1=unigrams, 2=unigrams+bigrams)")
+
+ // Check for help flag first
+ for _, arg := range args {
+ if arg == "--help" || arg == "-h" {
+ fs.Usage()
+ return flag.ErrHelp
+ }
+ }
+
+ // Extract positional argument (POSITIVES_FILE) before parsing flags
+ if len(args) == 0 {
+ return fmt.Errorf("POSITIVES_FILE argument is required")
+ }
+ // The first argument should be the positives file, the rest are flags
+ c.positivesFile = args[0]
+ flagArgs := args[1:]
+
+ if err := fs.Parse(flagArgs); err != nil {
+ return err
+ }
+
+ if c.rssFeedsFile == "" {
+ return fmt.Errorf("--rss-feeds flag is required")
+ }
+
+ // Validate paths are safe (prevent directory traversal)
+ if strings.Contains(filepath.Clean(c.positivesFile), "..") {
+ return fmt.Errorf("invalid positives file path: directory traversal not allowed")
+ }
+ if strings.Contains(filepath.Clean(c.rssFeedsFile), "..") {
+ return fmt.Errorf("invalid RSS feeds file path: directory traversal not allowed")
+ }
+
+ return nil
+}
+
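+// Run executes the full training workflow: load positives, fetch RSS
+// negatives, exclude overlap, train, and write the model JSON to stdout.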
+func (c *TrainCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ if c.verboseOutput {
+ log.SetOutput(os.Stderr)
+ log.Println("Starting training workflow...")
+ log.Printf("Positives: %s", c.positivesFile)
+ log.Printf("RSS feeds: %s", c.rssFeedsFile)
+ log.Printf("Loading positives from %s...", c.positivesFile)
+ }
+ positives, err := c.loadArticles(c.positivesFile)
+ if err != nil {
+ return fmt.Errorf("failed to load positives: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Loaded %d positive examples", len(positives))
+ }
+
+ if c.verboseOutput {
+ log.Printf("Loading RSS feeds from %s...", c.rssFeedsFile)
+ }
+ rssURLs, err := c.loadRSSURLs(c.rssFeedsFile)
+ if err != nil {
+ return fmt.Errorf("failed to load RSS feeds: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Found %d RSS feeds to fetch", len(rssURLs))
+ }
+
+ negatives, err := c.fetchFromRSSFeeds(rssURLs)
+ if err != nil {
+ return fmt.Errorf("failed to fetch from RSS feeds: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Fetched %d articles from RSS feeds", len(negatives))
+ }
+
+ negatives = c.excludePositives(negatives, positives)
+ if c.verboseOutput {
+ log.Printf("After exclusion: %d negative examples", len(negatives))
+ }
+
+ if len(positives) == 0 || len(negatives) == 0 {
+ return fmt.Errorf("need both positive (%d) and negative (%d) examples for training", len(positives), len(negatives))
+ }
+
+ if c.verboseOutput {
+ log.Println("Training model...")
+ }
+ model, err := c.trainModel(positives, negatives)
+ if err != nil {
+ return fmt.Errorf("failed to train model: %w", err)
+ }
+
+ // Output model
+ encoder := json.NewEncoder(stdout)
+ encoder.SetIndent("", " ")
+ if err := encoder.Encode(model); err != nil {
+ return fmt.Errorf("failed to write model: %w", err)
+ }
+
+ return nil
+}
+
+// ============================================================================
+// ╺┳┓┏━┓╺┳╸┏━┓ ╻ ┏━┓┏━┓╺┳┓╻┏┓╻┏━╸
+// ┃┃┣━┫ ┃ ┣━┫ ┃ ┃ ┃┣━┫ ┃┃┃┃┗┫┃╺┓
+// ╺┻┛╹ ╹ ╹ ╹ ╹ ┗━╸┗━┛╹ ╹╺┻┛╹╹ ╹┗━┛
+// ============================================================================
+
+// loadArticles reads Article JSONL from filename, skipping malformed lines.
+func (c *TrainCommand) loadArticles(filename string) ([]*core.Article, error) {
+ file, err := os.Open(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer file.Close()
+
+ var articles []*core.Article
+ scanner := bufio.NewScanner(file)
+ scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+ lineCount := 0
+ for scanner.Scan() {
+ lineCount++
+ line := strings.TrimSpace(scanner.Text())
+ if line == "" {
+ continue
+ }
+ var article core.Article
+ if err := json.Unmarshal([]byte(line), &article); err != nil {
+ // Skip malformed JSON lines instead of failing; a json.Decoder
+ // cannot resync after a decode error, so we scan line by line.
+ continue
+ }
+ articles = append(articles, &article)
+ if c.verboseOutput && lineCount%500 == 0 {
+ log.Printf(" Loaded %d articles so far", len(articles))
+ }
+ }
+ return articles, scanner.Err()
+}
+
+// loadRSSURLs loads RSS feed URLs from a text file
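+// Blank lines and lines beginning with '#' are skipped, e.g.:
+//
+// # physics blogs
+// https://example.com/feed.xml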
+func (c *TrainCommand) loadRSSURLs(filename string) ([]string, error) {
+ file, err := os.Open(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer file.Close()
+
+ var urls []string
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ urls = append(urls, line)
+ }
+ }
+ return urls, scanner.Err()
+}
+
+// fetchFromRSSFeeds fetches articles from multiple RSS feeds in parallel
+func (c *TrainCommand) fetchFromRSSFeeds(rssURLs []string) ([]*core.Article, error) {
+ client := core.DefaultHTTPClient
+ type result struct {
+ url string
+ articles []*core.Article
+ err error
+ }
+ resultChan := make(chan result, len(rssURLs))
+
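+ // One goroutine per feed; feed lists are assumed small enough that an
+ // unbounded fan-out is acceptable here.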
+ for _, rssURL := range rssURLs {
+ go func(feedURL string) {
+ articles, err := c.fetchRSSFeed(client, feedURL)
+ resultChan <- result{url: feedURL, articles: articles, err: err}
+ }(rssURL)
+ }
+
+ var allArticles []*core.Article
+ for i := 0; i < len(rssURLs); i++ {
+ res := <-resultChan
+ if res.err != nil {
+ if c.verboseOutput {
+ log.Printf("%s: failed to fetch", shortURL(res.url))
+ }
+ } else {
+ if c.verboseOutput {
+ log.Printf("%s: %d articles", shortURL(res.url), len(res.articles))
+ }
+ allArticles = append(allArticles, res.articles...)
+ }
+ }
+
+ return allArticles, nil
+}
+
+// ParseRSSFeed parses an RSS/Atom feed from the provided body into a slice of
+// Articles. baseURL is accepted for future relative-link resolution but is
+// currently unused.
+func ParseRSSFeed(body []byte, baseURL string) ([]*core.Article, error) {
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(bytes.NewReader(body))
+ if err != nil {
+ return nil, err
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ // Prefer explicit content; fall back to description.
+ content := strings.TrimSpace(item.Content)
+ if content == "" {
+ content = item.Description
+ }
+ // Also check custom content field (for <content> tags in RSS)
+ if content == "" && item.Custom != nil {
+ if c, ok := item.Custom["content"]; ok && c != "" {
+ content = c
+ }
+ }
+
+ // Clean and limit content length
+ content = core.CleanFeedContent(content)
+
+ articles = append(articles, &core.Article{
+ URL: item.Link,
+ Title: item.Title,
+ Content: content,
+ })
+ }
+ return articles, nil
+}
+
+// fetchRSSFeed fetches and parses a single RSS feed
+func (c *TrainCommand) fetchRSSFeed(client *http.Client, rssURL string) ([]*core.Article, error) {
+ var body []byte
+ var err error
+
+ // Handle file:// URLs locally
+ if strings.HasPrefix(rssURL, "file://") {
+ // Remove file:// prefix
+ filePath := strings.TrimPrefix(rssURL, "file://")
+ body, err = os.ReadFile(filePath)
+ if err != nil {
+ return nil, fmt.Errorf("error reading file %s: %w", filePath, err)
+ }
+ } else {
+ // Handle HTTP/HTTPS URLs normally
+ req, err := http.NewRequest("GET", rssURL, nil)
+ if err != nil {
+ return nil, fmt.Errorf("error building request: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+ // Make request with retry logic
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+
+ resp, err := core.DoRequestWithRetry(ctx, client, req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", rssURL, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, rssURL)
+ }
+
+ // Read response body
+ body, err = io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", rssURL, err)
+ }
+ }
+
+ // Parse RSS/Atom feed
+ return ParseRSSFeed(body, rssURL)
+}
+
+// ============================================================================
+// ╺┳┓┏━┓╺┳╸┏━┓ ┏━┓┏━┓┏━╸┏━┓
+// ┃┃┣━┫ ┃ ┣━┫ ┣━┛┣┳┛┣╸ ┣━┛
+// ╺┻┛╹ ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸╹
+// ============================================================================
+
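+// excludePositives drops any negative whose URL also appears in positives,
+// preventing label leakage from feeds that carry known-good articles.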
+func (c *TrainCommand) excludePositives(negatives, positives []*core.Article) []*core.Article {
+ // Build set of positive URLs for O(1) lookup
+ positiveURLs := make(map[string]bool)
+ for _, pos := range positives {
+ positiveURLs[pos.URL] = true
+ }
+
+ // Filter out positives
+ var filtered []*core.Article
+ for _, neg := range negatives {
+ if !positiveURLs[neg.URL] {
+ filtered = append(filtered, neg)
+ }
+ }
+
+ return filtered
+}
+
+// splitTrainingData performs a deterministic 80/20 train/validation split
+// (seed=42), making model training reproducible across runs.
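+// Note: the split is random but not stratified, so a very small or heavily
+// skewed dataset can leave the validation slice with few (or no) positives.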
+func (c *TrainCommand) splitTrainingData(documents []string, labels []float64) (
+ trainDocs, valDocs []string,
+ trainLabels, valLabels []float64,
+) {
+ const validationSplitRatio = 0.2
+ const splitSeed = 42
+
+ if len(documents) < 3 {
+ // Not enough data to split, use all for training.
+ // A split requires at least 2 training documents to avoid MaxDF issues
+ // and at least 1 validation document.
+ return documents, nil, labels, nil
+ }
+
+ // Create a reproducible random source and shuffle indices.
+ rng := rand.New(rand.NewSource(splitSeed))
+ indices := make([]int, len(documents))
+ for i := range indices {
+ indices[i] = i
+ }
+ rng.Shuffle(len(indices), func(i, j int) {
+ indices[i], indices[j] = indices[j], indices[i]
+ })
+
+ splitIndex := int(float64(len(documents)) * (1.0 - validationSplitRatio))
+ trainIndices := indices[:splitIndex]
+ valIndices := indices[splitIndex:]
+
+ trainDocs = make([]string, len(trainIndices))
+ trainLabels = make([]float64, len(trainIndices))
+ for i, idx := range trainIndices {
+ trainDocs[i] = documents[idx]
+ trainLabels[i] = labels[idx]
+ }
+
+ valDocs = make([]string, len(valIndices))
+ valLabels = make([]float64, len(valIndices))
+ for i, idx := range valIndices {
+ valDocs[i] = documents[idx]
+ valLabels[i] = labels[idx]
+ }
+
+ return trainDocs, valDocs, trainLabels, valLabels
+}
+
+// Downsample majority class to 1:1 ratio AFTER vectorizer.Fit() to preserve IDF values.
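+// E.g. 200 positives and 5,000 negatives become 200 of each, while IDF values
+// still reflect every training document because Fit() runs before this call.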
+func (c *TrainCommand) downsampleToBalance(docs []string, labels []float64) ([]string, []float64) {
+ // Count positives and negatives
+ var posDocs, negDocs []string
+ var posLabels, negLabels []float64
+
+ for i, label := range labels {
+ if label == 1.0 {
+ posDocs = append(posDocs, docs[i])
+ posLabels = append(posLabels, label)
+ } else {
+ negDocs = append(negDocs, docs[i])
+ negLabels = append(negLabels, label)
+ }
+ }
+
+ // If already balanced, return as-is
+ if len(posDocs) == len(negDocs) {
+ return docs, labels
+ }
+
+ // Determine which class is majority
+ var majorityDocs, minorityDocs []string
+ var majorityLabels, minorityLabels []float64
+
+ if len(negDocs) > len(posDocs) {
+ // Negatives are majority
+ majorityDocs, minorityDocs = negDocs, posDocs
+ majorityLabels, minorityLabels = negLabels, posLabels
+ } else {
+ // Positives are majority (unlikely but handle)
+ majorityDocs, minorityDocs = posDocs, negDocs
+ majorityLabels, minorityLabels = posLabels, negLabels
+ }
+
+ // Downsample majority to match minority size
+ minoritySize := len(minorityDocs)
+ rng := rand.New(rand.NewSource(42)) // Use fixed seed for reproducibility
+
+ // Create random indices for downsampling
+ indices := make([]int, len(majorityDocs))
+ for i := range indices {
+ indices[i] = i
+ }
+ rng.Shuffle(len(indices), func(i, j int) {
+ indices[i], indices[j] = indices[j], indices[i]
+ })
+
+ // Select downsampled majority
+ downsampledDocs := make([]string, 0, minoritySize*2)
+ downsampledLabels := make([]float64, 0, minoritySize*2)
+
+ // Add all minority samples
+ downsampledDocs = append(downsampledDocs, minorityDocs...)
+ downsampledLabels = append(downsampledLabels, minorityLabels...)
+
+ // Add downsampled majority
+ for i := 0; i < minoritySize; i++ {
+ idx := indices[i]
+ downsampledDocs = append(downsampledDocs, majorityDocs[idx])
+ downsampledLabels = append(downsampledLabels, majorityLabels[idx])
+ }
+
+ return downsampledDocs, downsampledLabels
+}
+
+// ============================================================================
+// ╺┳╸┏━┓┏━┓╻┏┓╻ ┏┳┓┏━┓╺┳┓┏━╸╻
+// ┃ ┣┳┛┣━┫┃┃┗┫ ┃┃┃┃ ┃ ┃┃┣╸ ┃
+// ╹ ╹┗╸╹ ╹╹╹ ╹ ╹ ╹┗━┛╺┻┛┗━╸┗━╸
+// ============================================================================
+
+// trainModel trains a TF-IDF + logistic regression model
+func (c *TrainCommand) trainModel(positives, negatives []*core.Article) (*core.ModelEnvelope, error) {
+ // Combine datasets and create labels
+ var documents []string
+ var labels []float64
+
+ // Process positives
+ for _, article := range positives {
+ // Skip articles with titles that are too short
+ if len(article.Title) < 15 {
+ continue
+ }
+ documents = append(documents, article.Title)
+ labels = append(labels, 1.0)
+ }
+
+ // Process negatives
+ for _, article := range negatives {
+ // Skip articles with titles that are too short
+ if len(article.Title) < 15 {
+ continue
+ }
+ documents = append(documents, article.Title)
+ labels = append(labels, 0.0)
+ }
+
+ // Vectorizer and model parameters come from CLI flags (defaults match the
+ // Julia implementation); the vocabulary cap is fixed.
+ const vocabCap = 50000
+
+ // Deterministic 80/20 split for train/validation
+ trainDocs, valDocs, trainLabels, valLabels := c.splitTrainingData(documents, labels)
+
+ // Create TF-IDF vectorizer with the specified parameters
+ vectorizer := &core.TFIDFVectorizer{
+ NgramMin: 1,
+ NgramMax: c.ngramMax,
+ MinDF: c.minDF,
+ MaxDF: c.maxDF,
+ VocabCap: vocabCap,
+ Vocabulary: make(map[string]float64),
+ }
+ // Fit vectorizer on UNBALANCED training data to match Julia implementation
+ // This preserves document frequencies properly
+ vectorizer.Fit(trainDocs)
+
+ // Downsample negatives to 1:1 ratio AFTER fitting (match Julia approach)
+ balancedTrainDocs, balancedTrainLabels := c.downsampleToBalance(trainDocs, trainLabels)
+
+ // Transform both training and validation sets
+ trainVectors := vectorizer.Transform(balancedTrainDocs)
+ valVectors := vectorizer.Transform(valDocs)
+
+ // Use uniform class weights since we've balanced the dataset
+ classWeights := map[float64]float64{
+ 1.0: 1.0,
+ 0.0: 1.0,
+ }
+
+ // Train logistic regression with the flag-specified lambda; learning rate,
+ // iteration cap, and tolerance are fixed.
+ lr := &core.LogisticRegression{
+ LearningRate: 0.5,
+ Lambda: c.lambda,
+ Iterations: 500,
+ Tolerance: 0.000001,
+ }
+ lr.Validate()
+ weights, err := lr.Fit(trainVectors, balancedTrainLabels, classWeights)
+ if err != nil {
+ return nil, fmt.Errorf("failed to train logistic regression model: %w", err)
+ }
+
+ // Find the best threshold on the validation set
+ recommendedThreshold, scoreDistributions := c.findBestThreshold(valVectors, valLabels, weights)
+
+ // Count classes for metadata
+ var posCount, negCount float64
+ for _, label := range labels {
+ if label == 1.0 {
+ posCount++
+ } else {
+ negCount++
+ }
+ }
+
+ // Create model envelope
+ model := &core.ModelEnvelope{
+ Algorithm: "tfidf-go",
+ Impl: "go",
+ Version: "1",
+ CreatedAt: time.Now().UTC(),
+ Meta: map[string]any{
+ "positives": len(positives),
+ "negatives": len(negatives),
+ "class_counts": map[string]int{
+ "pos": int(posCount),
+ "neg": int(negCount),
+ },
+ "vectorizer_params": map[string]any{
+ "ngram_min": vectorizer.NgramMin,
+ "ngram_max": vectorizer.NgramMax,
+ "min_df": vectorizer.MinDF,
+ "max_df": vectorizer.MaxDF,
+ "vocab_cap": vectorizer.VocabCap,
+ },
+ "model_params": map[string]any{
+ "learning_rate": lr.LearningRate,
+ "lambda": lr.Lambda,
+ "iterations": lr.Iterations,
+ "tolerance": lr.Tolerance,
+ },
+ "recommended_threshold": recommendedThreshold,
+ "score_distributions": scoreDistributions,
+ },
+ Vectorizer: vectorizer.Vocabulary,
+ OrderedVocab: vectorizer.OrderedVocab,
+ Weights: weights,
+ }
+
+ return model, nil
+}
+
+// ============================================================================
+// ┏┳┓┏━╸╺┳╸┏━┓╻┏━╸┏━┓
+// ┃┃┃┣╸ ┃ ┣┳┛┃┃ ┗━┓
+// ╹ ╹┗━╸ ╹ ╹┗╸╹┗━╸┗━┛
+// ============================================================================
+
+// ClassificationMetrics holds the evaluation metrics
+type ClassificationMetrics struct {
+ TruePositives int
+ TrueNegatives int
+ FalsePositives int
+ FalseNegatives int
+ Accuracy float64
+ Precision float64
+ Recall float64
+ F1Score float64
+}
+
+// Calculate computes the metrics from raw counts
+func (m *ClassificationMetrics) Calculate() {
+ total := m.TruePositives + m.TrueNegatives + m.FalsePositives + m.FalseNegatives
+
+ if total > 0 {
+ m.Accuracy = float64(m.TruePositives+m.TrueNegatives) / float64(total)
+ }
+
+ if m.TruePositives+m.FalsePositives > 0 {
+ m.Precision = float64(m.TruePositives) / float64(m.TruePositives+m.FalsePositives)
+ }
+
+ if m.TruePositives+m.FalseNegatives > 0 {
+ m.Recall = float64(m.TruePositives) / float64(m.TruePositives+m.FalseNegatives)
+ }
+
+ if m.Precision+m.Recall > 0 {
+ m.F1Score = 2 * (m.Precision * m.Recall) / (m.Precision + m.Recall)
+ }
+}
+
+// findBestThreshold sweeps a range of thresholds on a validation set to find
+// the one that maximizes Youden's J, and returns it together with
+// score-distribution diagnostics.
+func (c *TrainCommand) findBestThreshold(
+ validationVectors [][]float64,
+ validationLabels []float64,
+ weights []float64,
+) (float64, map[string]any) {
+ if len(validationVectors) == 0 {
+ return 0.5, nil // Default if no validation data
+ }
+
+ scores := make([]float64, len(validationVectors))
+ for i, vector := range validationVectors {
+ score, err := core.PredictScore(vector, weights)
+ if err != nil {
+ // This should not happen with valid data, but as a fallback:
+ return 0.5, nil
+ }
+ scores[i] = score
+ }
+
+ // Collect score distributions by label
+ var posScores, negScores []float64
+ for i, score := range scores {
+ if validationLabels[i] == 1.0 {
+ posScores = append(posScores, score)
+ } else {
+ negScores = append(negScores, score)
+ }
+ }
+
+ // Compute stats for each class
+ posStats := computeScoreStats(posScores)
+ negStats := computeScoreStats(negScores)
+
+ // Calculate Cohen's d (effect size) to measure class separation in the learned space
+ posMean := posStats["mean"]
+ negMean := negStats["mean"]
+ posStd := posStats["std"]
+ negStd := negStats["std"]
+
+ var cohensD float64
+ if posStd > 0 && negStd > 0 {
+ pooledStd := math.Sqrt((posStd*posStd + negStd*negStd) / 2)
+ cohensD = math.Abs(posMean-negMean) / pooledStd
+ }
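+ // E.g. means 0.8 vs 0.2 with std 0.1 each: pooled std = 0.1, so d = 6.0,
+ // i.e. near-perfect separation (d > 0.8 is conventionally "large").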
+
+ // Calculate separation ratio to understand how much the classes overlap on the score scale
+ totalRange := math.Max(posStats["max"], negStats["max"]) - math.Min(posStats["min"], negStats["min"])
+ overlapStart := math.Max(posStats["min"], negStats["min"])
+ overlapEnd := math.Min(posStats["max"], negStats["max"])
+ overlapRange := math.Max(0, overlapEnd-overlapStart)
+ separationRatio := 0.0
+ if totalRange > 0 {
+ separationRatio = (totalRange - overlapRange) / totalRange
+ }
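+ // E.g. positives in [0.6, 0.9] and negatives in [0.1, 0.7]: total range 0.8,
+ // overlap 0.1 (0.6..0.7), so separation ratio = (0.8 - 0.1) / 0.8 = 0.875.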
+
+ // Find threshold that balances false positives and false negatives using Youden's J.
+ // This metric (Sensitivity + Specificity - 1) equally weights both false positive
+ // and false negative rates. Why not F1? F1 biases toward precision when classes
+ // are imbalanced; a validation set of 10 positives and 1000 negatives would push
+ // the threshold too high. Youden's J treats both types of error equally, which
+ // better reflects real use: missing a relevant article (false negative) is as bad
+ // as showing an irrelevant one (false positive).
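+ // E.g. a threshold giving sensitivity 0.90 and specificity 0.85 scores
+ // J = 0.90 + 0.85 - 1 = 0.75; the sweep below keeps the largest J.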
+ bestYoudenJ := -1.0
+ bestThreshold := 0.5
+ var bestMetrics ClassificationMetrics
+
+ boolLabels := make([]bool, len(validationLabels))
+ for i, l := range validationLabels {
+ boolLabels[i] = l == 1.0
+ }
+
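+ // Sweep thresholds from 0.05 to 0.95 in steps of 0.01.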
+ for i := 5; i <= 95; i++ {
+ threshold := float64(i) / 100.0
+ metrics := computeMetrics(scores, boolLabels, threshold)
+
+ sensitivity := metrics.Recall // TPR: TP / (TP + FN)
+ specificity := 0.0
+ if metrics.TrueNegatives+metrics.FalsePositives > 0 {
+ specificity = float64(metrics.TrueNegatives) / float64(metrics.TrueNegatives+metrics.FalsePositives)
+ }
+ youdenJ := sensitivity + specificity - 1.0
+
+ if youdenJ > bestYoudenJ {
+ bestYoudenJ = youdenJ
+ bestThreshold = threshold
+ bestMetrics = metrics
+ }
+ }
+
+ distributions := map[string]any{
+ "positive": posStats,
+ "negative": negStats,
+ "cohens_d": cohensD,
+ "separation_ratio": separationRatio,
+ "best_f1": bestMetrics.F1Score,
+ "best_precision": bestMetrics.Precision,
+ "best_recall": bestMetrics.Recall,
+ }
+
+ return bestThreshold, distributions
+}
+
+// computeScoreStats computes min, max, mean, and std for a slice of scores
+func computeScoreStats(scores []float64) map[string]float64 {
+ if len(scores) == 0 {
+ return map[string]float64{
+ "min": 0.0,
+ "max": 0.0,
+ "mean": 0.0,
+ "std": 0.0,
+ }
+ }
+
+ min, max := scores[0], scores[0]
+ sum := 0.0
+
+ for _, score := range scores {
+ if score < min {
+ min = score
+ }
+ if score > max {
+ max = score
+ }
+ sum += score
+ }
+
+ mean := sum / float64(len(scores))
+
+ // Calculate the population standard deviation (divides by N, not N-1)
+ variance := 0.0
+ for _, score := range scores {
+ diff := score - mean
+ variance += diff * diff
+ }
+ variance /= float64(len(scores))
+ std := math.Sqrt(variance)
+
+ return map[string]float64{
+ "min": min,
+ "max": max,
+ "mean": mean,
+ "std": std,
+ }
+}
+
+// computeMetrics tallies the confusion matrix at the given threshold; scores
+// strictly above the threshold count as positive predictions.
+func computeMetrics(scores []float64, labels []bool, threshold float64) ClassificationMetrics {
+ var metrics ClassificationMetrics
+ for i, score := range scores {
+ predicted := score > threshold
+ actual := labels[i]
+
+ if predicted && actual {
+ metrics.TruePositives++
+ } else if predicted && !actual {
+ metrics.FalsePositives++
+ } else if !predicted && actual {
+ metrics.FalseNegatives++
+ } else {
+ metrics.TrueNegatives++
+ }
+ }
+ metrics.Calculate()
+ return metrics
+}
+
+// ============================================================================
+// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓
+// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓
+// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛
+// ============================================================================
+
+// shortURL formats a URL to be human-readable and not too long
+func shortURL(urlStr string) string {
+ u, err := url.Parse(urlStr)
+ if err != nil {
+ return urlStr
+ }
+
+ path := u.Path
+ if len(path) > 30 {
+ path = path[:30] + "..."
+ }
+
+ return u.Host + path
+}
diff --git a/cmds/train_test.go b/cmds/train_test.go
new file mode 100644
index 0000000..8298494
--- /dev/null
+++ b/cmds/train_test.go
@@ -0,0 +1,66 @@
+package cmds
+
+import (
+ "scholscan/core"
+ "strings"
+ "testing"
+)
+
+// test RSS parsing
+func TestParseRSSFeed(t *testing.T) {
+ rssXML := `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+<channel>
+<title>Test Feed</title>
+<item>
+<title>Test Article 1</title>
+<link>https://example.com/article1</link>
+<description>This is a test article with some content.</description>
+</item>
+<item>
+<title>Test Article 2</title>
+<link>https://example.com/article2</link>
+<content><![CDATA[<p>This is content with <b>HTML</b> tags.</p>]]></content>
+</item>
+</channel>
+</rss>`
+
+ articles, err := ParseRSSFeed([]byte(rssXML), "https://example.com/feed")
+ if err != nil {
+ t.Fatalf("Failed to parse RSS feed: %v", err)
+ }
+
+ if len(articles) != 2 {
+ t.Fatalf("Expected 2 articles, got %d", len(articles))
+ }
+
+ if articles[0].Title != "Test Article 1" {
+ t.Errorf("Expected title 'Test Article 1', got '%s'", articles[0].Title)
+ }
+ if articles[0].URL != "https://example.com/article1" {
+ t.Errorf("Expected URL 'https://example.com/article1', got '%s'", articles[0].URL)
+ }
+ if articles[0].Content != "This is a test article with some content." {
+ t.Errorf("Expected content 'This is a test article with some content.', got '%s'", articles[0].Content)
+ }
+
+ if articles[1].Title != "Test Article 2" {
+ t.Errorf("Expected title 'Test Article 2', got '%s'", articles[1].Title)
+ }
+ if articles[1].Content != "This is content with HTML tags." {
+ t.Errorf("Expected 'This is content with HTML tags.', got '%s'", articles[1].Content)
+ }
+}
+
+func TestCleanFeedContent(t *testing.T) {
+ longInput := strings.Repeat("test content ", 500) // 6,500 bytes
+ result := core.CleanFeedContent(longInput)
+
+ // Assert the content was actually shortened by truncation.
+ if len(result) >= len(longInput) {
+ t.Errorf("Expected content truncated below the %d-byte input, got %d bytes", len(longInput), len(result))
+ }
+
+ if !strings.HasSuffix(result, "...") {
+ t.Errorf("Expected truncated content to end with '...', got '%s'", result[len(result)-3:])
+ }
+}
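+
+// TestShortURL sanity-checks shortURL: long paths are truncated with an
+// ellipsis while the host is preserved.
+func TestShortURL(t *testing.T) {
+ long := "https://example.com/" + strings.Repeat("a", 40)
+ got := shortURL(long)
+ if !strings.HasPrefix(got, "example.com/") {
+ t.Errorf("Expected result to start with the host, got '%s'", got)
+ }
+ if !strings.HasSuffix(got, "...") {
+ t.Errorf("Expected truncated path to end with '...', got '%s'", got)
+ }
+}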