// Serve command: HTTP server for web UI and APIs.
//
// Two main flows: live-feed (cached + background refresh) and tools (on-demand scoring).
// Live-feed rescans all configured RSS feeds on a timer (default 24h), caches results,
// serves filtered articles via web UI and JSON/RSS APIs.
// Tools provides real-time /score (single title) and /scan (ad-hoc feed) endpoints.
// Background refresh continues despite individual feed failures; RWMutex allows
// many concurrent readers with exclusive writer updates.
// Templates are embedded for single-binary deployment.
package cmds
import (
"bufio"
"context"
"embed"
"encoding/json"
"flag"
"fmt"
"html/template"
"io"
"log"
"net/http"
"net/url"
"os"
"os/signal"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"syscall"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/mmcdole/gofeed"
"scholscan/core"
)
//go:embed templates/*.html
var templateFS embed.FS
// ============================================================================
// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
// ============================================================================
// ServeCommand holds the configuration and runtime state of the "serve"
// subcommand. The exported fields are populated from command-line flags by
// Init; the unexported fields are filled in by Init and Run.
type ServeCommand struct {
// Port is the TCP port the HTTP server listens on (--port).
Port int
// RSSWorldPath is the path of the file listing feed URLs, one per line (--rss-world).
RSSWorldPath string
// RefreshInterval is the raw duration string given via --refresh-interval.
RefreshInterval string
// ModelPath is the path of the trained model JSON file (--model).
ModelPath string
// Title is an optional custom title for the web interface (--title).
Title string
// refreshInterval is RefreshInterval parsed into a time.Duration by Init.
refreshInterval time.Duration
// model is the model loaded once at startup by Run; modelMu guards access to it.
model *core.ModelEnvelope
modelMu sync.RWMutex
// filteredResults caches the articles kept by the most recent feed scan and
// filteredResultsTime records when that cache was last written.
// resultsMu lets many HTTP handlers read concurrently while the refresh
// takes the lock exclusively to swap in new results.
filteredResults []*core.Article
filteredResultsTime time.Time
resultsMu sync.RWMutex
// tmpl holds the HTML templates parsed from the embedded template files at startup.
tmpl *template.Template
}
// Name reports the subcommand name; Init also uses it to label the flag set.
func (c *ServeCommand) Name() string {
	return "serve"
}
// Init parses and validates the command-line flags of the serve command.
//
// Recognised flags are --port, --rss-world, --refresh-interval, --model and
// --title; positional arguments are rejected. Validation performed:
//   - the port must lie in the range 0-65535;
//   - the refresh interval must parse as a strictly positive duration, because
//     it is later handed to time.NewTicker, which panics otherwise;
//   - the rss-world and model paths must not contain a ".." element once
//     cleaned.
//
// It returns the flag-parsing error or a descriptive validation error, and nil
// on success, in which case c.refreshInterval holds the parsed interval.
func (c *ServeCommand) Init(args []string) error {
	fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	fs.Usage = func() {
		fmt.Fprint(fs.Output(), `Usage: scholscan serve [options]
Start HTTP server for filtered RSS and scoring web UI.
Flags:
`)
		fs.PrintDefaults()
		fmt.Fprint(fs.Output(), `
Examples:
scholscan serve --port 8080 --rss-world rss_world.txt --model model.json
scholscan serve --refresh-interval 24h --model ./model.json --rss-world feeds.txt
`)
	}

	fs.IntVar(&c.Port, "port", 8080, "Port to listen on")
	fs.StringVar(&c.RSSWorldPath, "rss-world", "rss_world.txt", "Path to RSS world file (one feed URL per line)")
	fs.StringVar(&c.RefreshInterval, "refresh-interval", "24h", "Interval for background rescans (e.g., 24h, 1h)")
	fs.StringVar(&c.ModelPath, "model", "model.json", "Path to trained model JSON file")
	fs.StringVar(&c.Title, "title", "", "Custom title for the web interface")

	if err := fs.Parse(args); err != nil {
		return err
	}
	if fs.NArg() != 0 {
		return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
	}

	if c.Port < 0 || c.Port > 65535 {
		return fmt.Errorf("invalid port %d: must be between 0 and 65535", c.Port)
	}

	interval, err := time.ParseDuration(c.RefreshInterval)
	if err != nil {
		return fmt.Errorf("invalid refresh-interval %q: %w", c.RefreshInterval, err)
	}
	// A zero or negative interval would make time.NewTicker panic at startup.
	if interval <= 0 {
		return fmt.Errorf("invalid refresh-interval %q: must be positive", c.RefreshInterval)
	}
	c.refreshInterval = interval

	if hasParentDirElement(c.RSSWorldPath) {
		return fmt.Errorf("invalid rss-world path: directory traversal not allowed")
	}
	if hasParentDirElement(c.ModelPath) {
		return fmt.Errorf("invalid model path: directory traversal not allowed")
	}
	return nil
}

// hasParentDirElement reports whether the cleaned form of path still contains
// a ".." path element. Only whole elements are compared, so file names that
// merely contain two dots (for example "feeds..txt") are accepted.
func hasParentDirElement(path string) bool {
	cleaned := filepath.Clean(path)
	for _, elem := range strings.Split(cleaned, string(filepath.Separator)) {
		if elem == ".." {
			return true
		}
	}
	return false
}
// Run loads the templates and the model, starts the initial and periodic feed
// scans in background goroutines, and serves HTTP until the listener fails or
// SIGINT/SIGTERM is received.
//
// stdin and stdout are not used by this command.
//
// On a shutdown signal the server is stopped with server.Shutdown, bounded by
// core.DefaultShutdownTimeout, and Run returns only after Shutdown has
// finished so in-flight requests are not cut off by the process exiting.
// Handlers are registered on a private ServeMux rather than the global
// http.DefaultServeMux, so Run can be invoked more than once per process and
// does not expose handlers registered elsewhere.
//
// It returns an error if the refresh interval is not positive, if templates or
// the model cannot be loaded, or if the server stops for any reason other than
// a requested shutdown.
func (c *ServeCommand) Run(stdin io.Reader, stdout io.Writer) error {
	// time.NewTicker panics on a non-positive duration; fail cleanly instead
	// (this also covers Run being called without a successful Init).
	if c.refreshInterval <= 0 {
		return fmt.Errorf("invalid refresh interval %v: must be positive", c.refreshInterval)
	}

	log.Printf("Starting scholscan server on port %d", c.Port)

	// Until the first scan completes, report the server start time as the
	// "last updated" timestamp.
	c.resultsMu.Lock()
	c.filteredResultsTime = time.Now()
	c.resultsMu.Unlock()

	// Load templates at startup.
	tmpl, err := template.ParseFS(templateFS, "templates/*.html")
	if err != nil {
		return fmt.Errorf("failed to parse templates: %w", err)
	}
	c.tmpl = tmpl
	log.Printf("Templates loaded successfully")

	// Load model at startup.
	model, err := c.loadModel()
	if err != nil {
		return fmt.Errorf("failed to load model at startup: %w", err)
	}
	c.modelMu.Lock()
	c.model = model
	c.modelMu.Unlock()
	log.Printf("Model loaded successfully")

	// Periodic refresh; the ticker is released on every return path.
	ticker := time.NewTicker(c.refreshInterval)
	defer ticker.Stop()
	go c.backgroundRefresh(ticker)

	// The initial scan runs asynchronously so the server can start listening
	// immediately.
	go func() {
		log.Println("Starting initial feed scan...")
		if err := c.refreshFilteredResults(); err != nil {
			log.Printf("Warning: initial scan failed: %v", err)
			return
		}
		c.resultsMu.RLock()
		count := len(c.filteredResults)
		c.resultsMu.RUnlock()
		log.Printf("Initial scan complete, %d articles filtered", count)
	}()

	mux := http.NewServeMux()
	mux.HandleFunc("/", c.handleRoot)
	mux.HandleFunc("/live-feed", c.handleLiveFeed)
	mux.HandleFunc("/tools", c.handleTools)
	mux.HandleFunc("/score", c.handleScore)
	mux.HandleFunc("/scan", c.handleScan)
	mux.HandleFunc("/api/filtered/feed", c.handleFilteredFeed)
	mux.HandleFunc("/api/filtered/rss", c.handleFilteredRSS)
	mux.HandleFunc("/api/health", c.handleHealth)

	server := &http.Server{
		Addr:         fmt.Sprintf(":%d", c.Port),
		Handler:      mux,
		ReadTimeout:  core.DefaultReadTimeout,
		WriteTimeout: core.DefaultWriteTimeout,
		IdleTimeout:  core.DefaultIdleTimeout,
	}

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	defer signal.Stop(sigChan)

	// shutdownDone is closed once the graceful shutdown has fully completed.
	shutdownDone := make(chan struct{})
	go func() {
		defer close(shutdownDone)
		<-sigChan
		log.Println("Shutdown signal received")
		ticker.Stop()
		ctx, cancel := context.WithTimeout(context.Background(), core.DefaultShutdownTimeout)
		defer cancel()
		if err := server.Shutdown(ctx); err != nil {
			log.Printf("Server shutdown error: %v", err)
		}
	}()

	log.Printf("Server listening on http://localhost:%d", c.Port)
	if err := server.ListenAndServe(); err != http.ErrServerClosed {
		if err != nil {
			return fmt.Errorf("server error: %w", err)
		}
		return nil
	}

	// ListenAndServe returns ErrServerClosed as soon as Shutdown is called;
	// wait for Shutdown itself to return before letting the caller exit.
	<-shutdownDone
	return nil
}
// ============================================================================
// ┏━╸┏━┓┏━┓┏━╸ ╻ ┏━┓┏━╸╻┏━╸
// ┃ ┃ ┃┣┳┛┣╸ ┃ ┃ ┃┃╺┓┃┃
// ┗━╸┗━┛╹┗╸┗━╸ ┗━╸┗━┛┗━┛╹┗━╸
// ============================================================================
// loadModel opens the file at c.ModelPath and JSON-decodes it into a
// core.ModelEnvelope. It returns a wrapped error when the file cannot be
// opened or its contents cannot be decoded.
func (c *ServeCommand) loadModel() (*core.ModelEnvelope, error) {
	file, openErr := os.Open(c.ModelPath)
	if openErr != nil {
		return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, openErr)
	}
	defer file.Close()

	envelope := new(core.ModelEnvelope)
	decodeErr := json.NewDecoder(file).Decode(envelope)
	if decodeErr != nil {
		return nil, fmt.Errorf("failed to decode model: %w", decodeErr)
	}
	return envelope, nil
}
// scoreArticle vectorizes the article's trimmed title and scores the resulting
// vector against model.Weights using core.PredictScore.
//
// It returns 0.0 when the vectorizer yields no vector (or an empty one) and
// also when PredictScore fails; the failure is logged rather than propagated,
// so a single bad article never aborts the caller.
func (c *ServeCommand) scoreArticle(article *core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope) float64 {
	title := strings.TrimSpace(article.Title)
	vectors := vectorizer.Transform([]string{title})
	if len(vectors) < 1 || len(vectors[0]) < 1 {
		return 0.0
	}

	score, err := core.PredictScore(vectors[0], model.Weights)
	if err == nil {
		return score
	}
	log.Printf("Error scoring article: %v", err)
	return 0.0
}
// getThreshold returns the score threshold to apply for model: the float64
// stored under the "recommended_threshold" key of model.Meta when present,
// otherwise core.DefaultScoreThreshold. The returned error is always nil.
func (c *ServeCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
	if model.Meta == nil {
		return core.DefaultScoreThreshold, nil
	}
	recommended, isFloat := model.Meta["recommended_threshold"].(float64)
	if !isFloat {
		return core.DefaultScoreThreshold, nil
	}
	return recommended, nil
}
// scoreAndFormatArticles scores every article and returns one template-ready
// map per article, in input order, with the keys "Title", "URL", "Source",
// "Rating" and "Score". Rating is the score mapped through core.ScoreToScale
// for the given threshold. The result is never nil, even for empty input.
func (c *ServeCommand) scoreAndFormatArticles(articles []*core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope, threshold float64) []map[string]interface{} {
	rows := make([]map[string]interface{}, 0, len(articles))
	for _, entry := range articles {
		value := c.scoreArticle(entry, vectorizer, model)
		rows = append(rows, map[string]interface{}{
			"Title":  entry.Title,
			"URL":    entry.URL,
			"Source": entry.Source,
			"Rating": core.ScoreToScale(value, threshold),
			"Score":  value,
		})
	}
	return rows
}
// ============================================================================
// ┏━┓┏━┓┏━┓ ┏━┓╺┳╸╻ ╻┏━╸┏━╸
// ┣┳┛┗━┓┗━┓ ┗━┓ ┃ ┃ ┃┣╸ ┣╸
// ╹┗╸┗━┛┗━┛ ┗━┛ ╹ ┗━┛╹ ╹
// ============================================================================
// readRSSWorldFeeds reads c.RSSWorldPath line by line and returns the feed
// entries it contains. Each line is trimmed of surrounding whitespace; blank
// lines and lines starting with "#" are skipped. It returns a wrapped error
// when the file cannot be opened or read.
func (c *ServeCommand) readRSSWorldFeeds() ([]string, error) {
	file, err := os.Open(c.RSSWorldPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open rss_world file %s: %w", c.RSSWorldPath, err)
	}
	defer file.Close()

	var urls []string
	lines := bufio.NewScanner(file)
	for lines.Scan() {
		entry := strings.TrimSpace(lines.Text())
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		urls = append(urls, entry)
	}
	if scanErr := lines.Err(); scanErr != nil {
		return nil, fmt.Errorf("error reading rss_world file: %w", scanErr)
	}
	return urls, nil
}
// refreshFilteredResults rescans every configured feed, scores the fetched
// articles and replaces the cached result set with those whose score reaches
// the model threshold.
//
// A feed that fails to fetch is logged and skipped so the remaining feeds are
// still processed. An error is returned only when the feed list cannot be
// read, no model is loaded, or the threshold lookup fails; in those cases the
// cache is left untouched.
func (c *ServeCommand) refreshFilteredResults() error {
	feedURLs, err := c.readRSSWorldFeeds()
	if err != nil {
		return err
	}

	c.modelMu.RLock()
	model := c.model
	c.modelMu.RUnlock()
	if model == nil {
		return fmt.Errorf("model not loaded")
	}

	// Collect articles from all feeds, tolerating individual failures.
	var fetched []*core.Article
	for i := range feedURLs {
		items, fetchErr := c.fetchRSSFeed(feedURLs[i])
		if fetchErr == nil {
			fetched = append(fetched, items...)
			continue
		}
		log.Printf("Warning: failed to fetch feed %s: %v", feedURLs[i], fetchErr)
	}

	threshold, err := c.getThreshold(model)
	if err != nil {
		return err
	}
	vectorizer := core.CreateVectorizerFromModel(model)

	kept := make([]*core.Article, 0, len(fetched))
	for _, candidate := range fetched {
		score := c.scoreArticle(candidate, vectorizer, model)
		if score >= threshold {
			// Store a private copy carrying its own score pointer so the
			// cached entry does not alias the fetched article.
			scoredCopy := *candidate
			scoredCopy.Score = &score
			kept = append(kept, &scoredCopy)
		}
	}

	c.resultsMu.Lock()
	c.filteredResults = kept
	c.filteredResultsTime = time.Now()
	c.resultsMu.Unlock()
	return nil
}
// backgroundRefresh is meant to run in its own goroutine: on every tick it
// triggers a full rescan via refreshFilteredResults and logs the outcome.
// A failed refresh is logged and the loop simply waits for the next tick.
func (c *ServeCommand) backgroundRefresh(ticker *time.Ticker) {
	for range ticker.C {
		log.Println("Background refresh started")

		err := c.refreshFilteredResults()
		if err != nil {
			log.Printf("Background refresh error (continuing): %v", err)
			continue
		}

		c.resultsMu.RLock()
		kept := len(c.filteredResults)
		c.resultsMu.RUnlock()
		log.Printf("Background refresh complete, %d articles filtered", kept)
	}
}
// fetchRSSFeed downloads the feed at feedURL, parses it with gofeed and
// returns one article per item whose trimmed title is at least
// core.MinTitleLength bytes long. Each article carries the item link, the
// trimmed title, the feed title as Source and the parsed publication time
// (nil when the item has none).
//
// The whole request, including reading the body, is bounded by
// core.DefaultHTTPTimeout. The body is streamed into the parser and capped at
// maxFeedBytes, so an oversized or endless response (the URL may come from a
// user via /scan) cannot exhaust memory.
//
// It returns an error when the request cannot be built or sent, the status is
// not 200 OK, the body exceeds the size cap, or the feed cannot be parsed.
func (c *ServeCommand) fetchRSSFeed(feedURL string) ([]*core.Article, error) {
	// Upper bound on the number of body bytes handed to the parser.
	const maxFeedBytes = 16 << 20

	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, fmt.Errorf("error building request: %w", err)
	}
	req.Header.Set("User-Agent", core.PoliteUserAgent)

	// The context deadline above already bounds the whole exchange, so the
	// shared default client is sufficient.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("error fetching %s: %w", feedURL, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, feedURL)
	}

	// Allow one byte beyond the cap: if it gets consumed, the body was too big.
	body := &io.LimitedReader{R: resp.Body, N: maxFeedBytes + 1}
	feed, err := gofeed.NewParser().Parse(body)
	if body.N <= 0 {
		return nil, fmt.Errorf("feed from %s exceeds %d bytes", feedURL, maxFeedBytes)
	}
	if err != nil {
		return nil, fmt.Errorf("error parsing feed from %s: %w", feedURL, err)
	}

	articles := make([]*core.Article, 0, len(feed.Items))
	for _, item := range feed.Items {
		title := strings.TrimSpace(item.Title)
		if len(title) < core.MinTitleLength {
			continue
		}
		articles = append(articles, &core.Article{
			URL:         item.Link,
			Title:       title,
			Source:      feed.Title,
			PublishedAt: item.PublishedParsed,
		})
	}
	return articles, nil
}
// ============================================================================
// ╻ ╻┏━╸┏┓ ╻ ╻╻
// ┃╻┃┣╸ ┣┻┓ ┃ ┃┃
// ┗┻┛┗━╸┗━┛ ┗━┛╹
// ============================================================================
// handleRoot serves the "/" pattern, which also catches every path no other
// handler matches: the exact root path is permanently redirected to
// /live-feed and anything else gets a 404.
func (c *ServeCommand) handleRoot(w http.ResponseWriter, r *http.Request) {
	if r.URL.Path == "/" {
		http.Redirect(w, r, "/live-feed", http.StatusMovedPermanently)
		return
	}
	http.NotFound(w, r)
}
// handleLiveFeed renders the cached, already-filtered articles as the
// "live-feed" HTML page (GET only).
//
// The optional "filter" query parameter narrows the list by publication date:
// "day" keeps the last 24 hours, "week" the last 7 days, and any other value
// (default "all") keeps everything. Articles without a publication date are
// always shown. Articles are listed by descending score.
func (c *ServeCommand) handleLiveFeed(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Snapshot the cache under the read lock; the slice itself is never
	// mutated below.
	c.resultsMu.RLock()
	cached, cachedAt := c.filteredResults, c.filteredResultsTime
	c.resultsMu.RUnlock()

	c.modelMu.RLock()
	model := c.model
	c.modelMu.RUnlock()
	if model == nil {
		http.Error(w, "Model not loaded", http.StatusInternalServerError)
		return
	}
	threshold, _ := c.getThreshold(model)

	filter := r.URL.Query().Get("filter")
	if filter == "" {
		filter = "all"
	}

	// Translate the filter into a look-back window; zero means "no filtering".
	var window time.Duration
	switch filter {
	case "day":
		window = 24 * time.Hour
	case "week":
		window = 7 * 24 * time.Hour
	}

	visible := cached
	if window > 0 {
		cutoff := time.Now().Add(-window)
		visible = make([]*core.Article, 0, len(cached))
		for _, entry := range cached {
			// Drop only articles that have a date and are not newer than the cutoff.
			if entry.PublishedAt != nil && !entry.PublishedAt.After(cutoff) {
				continue
			}
			visible = append(visible, entry)
		}
	}

	// Flatten articles into the shape the template expects.
	type TemplateArticle struct {
		Title       string
		URL         string
		Source      string
		Rating      int
		Score       float64
		PublishedAt string
	}
	rows := make([]TemplateArticle, 0, len(visible))
	for _, entry := range visible {
		row := TemplateArticle{
			Title:  entry.Title,
			URL:    entry.URL,
			Source: entry.Source,
		}
		if entry.Score != nil {
			row.Score = *entry.Score
		}
		row.Rating = core.ScoreToScale(row.Score, threshold)
		if entry.PublishedAt != nil {
			row.PublishedAt = entry.PublishedAt.Format("2006-01-02")
		}
		rows = append(rows, row)
	}

	// Highest score first.
	sort.Slice(rows, func(a, b int) bool {
		return rows[a].Score > rows[b].Score
	})

	data := map[string]interface{}{
		"Page":      "live-feed",
		"Articles":  rows,
		"Threshold": threshold,
		"UpdatedAt": cachedAt.Format("2006-01-02 15:04:05"),
		"Filter":    filter,
		"Title":     displayTitle(c.Title),
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	err := c.tmpl.ExecuteTemplate(w, "live-feed", data)
	if err != nil {
		http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
	}
}
// handleTools renders the "tools" HTML page (GET only). Any other method is
// answered with 405; a template failure is reported as a 500.
func (c *ServeCommand) handleTools(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	page := map[string]interface{}{
		"Page":  "tools",
		"Title": displayTitle(c.Title),
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	renderErr := c.tmpl.ExecuteTemplate(w, "tools", page)
	if renderErr != nil {
		http.Error(w, fmt.Sprintf("Template error: %v", renderErr), http.StatusInternalServerError)
	}
}
// handleScore scores a single title on demand.
//
// GET shows the tools page. POST reads the form fields "title" and "url";
// when "url" is non-empty the title is extracted from that page and replaces
// the submitted one. The title is validated, scored against the loaded model
// and rendered through the "results" template. Extraction and validation
// failures are rendered with renderResultsError; other methods get a 405.
func (c *ServeCommand) handleScore(w http.ResponseWriter, r *http.Request) {
	switch r.Method {
	case http.MethodGet:
		c.handleTools(w, r)
		return
	case http.MethodPost:
		// Handled below.
	default:
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	c.modelMu.RLock()
	model := c.model
	c.modelMu.RUnlock()
	if model == nil {
		http.Error(w, "Model not loaded", http.StatusInternalServerError)
		return
	}

	if err := r.ParseForm(); err != nil {
		http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
		return
	}
	title := strings.TrimSpace(r.FormValue("title"))
	pageURL := strings.TrimSpace(r.FormValue("url"))

	// A supplied URL takes precedence over a supplied title.
	if pageURL != "" {
		fetchedTitle, err := extractTitleFromURL(pageURL)
		if err != nil {
			c.renderResultsError(w, fmt.Sprintf("Failed to extract title from URL: %v", err), title)
			return
		}
		title = fetchedTitle
	}

	if problem := c.validateTitle(title); problem != "" {
		c.renderResultsError(w, problem, title)
		return
	}

	score := c.scoreArticle(&core.Article{Title: title}, core.CreateVectorizerFromModel(model), model)
	threshold, _ := c.getThreshold(model)

	data := map[string]interface{}{
		"Page":          "tools",
		"IsScoreResult": true,
		"Title":         title,
		"Rating":        core.ScoreToScale(score, threshold),
		"Score":         score,
		"Threshold":     threshold,
		"PageTitle":     displayTitle(c.Title),
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	renderErr := c.tmpl.ExecuteTemplate(w, "results", data)
	if renderErr != nil {
		http.Error(w, fmt.Sprintf("Template error: %v", renderErr), http.StatusInternalServerError)
	}
}
// handleScan fetches and scores an ad-hoc feed on demand.
//
// GET shows the tools page. POST reads the form field "feed_url", validates
// it, fetches the feed, scores every article and renders them by descending
// score through the "results" template. Validation and fetch failures are
// rendered with renderScanResultsError; other methods get a 405.
func (c *ServeCommand) handleScan(w http.ResponseWriter, r *http.Request) {
	switch r.Method {
	case http.MethodGet:
		c.handleTools(w, r)
		return
	case http.MethodPost:
		// Handled below.
	default:
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	c.modelMu.RLock()
	model := c.model
	c.modelMu.RUnlock()
	if model == nil {
		http.Error(w, "Model not loaded", http.StatusInternalServerError)
		return
	}

	if err := r.ParseForm(); err != nil {
		http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
		return
	}
	feedURL := strings.TrimSpace(r.FormValue("feed_url"))

	if problem := c.validateFeedURL(feedURL); problem != "" {
		c.renderScanResultsError(w, problem, feedURL)
		return
	}
	fetched, err := c.fetchRSSFeed(feedURL)
	if err != nil {
		c.renderScanResultsError(w, fmt.Sprintf("Failed to fetch feed: %v", err), feedURL)
		return
	}

	threshold, _ := c.getThreshold(model)
	rows := c.scoreAndFormatArticles(fetched, core.CreateVectorizerFromModel(model), model, threshold)

	// Highest score first.
	sort.Slice(rows, func(a, b int) bool {
		return rows[a]["Score"].(float64) > rows[b]["Score"].(float64)
	})

	data := map[string]interface{}{
		"Page":         "tools",
		"IsScanResult": true,
		"FeedURL":      feedURL,
		"Articles":     rows,
		"Threshold":    threshold,
		"PageTitle":    displayTitle(c.Title),
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	renderErr := c.tmpl.ExecuteTemplate(w, "results", data)
	if renderErr != nil {
		http.Error(w, fmt.Sprintf("Template error: %v", renderErr), http.StatusInternalServerError)
	}
}
// ============================================================================
// ┏━┓┏━┓╻ ┏━╸┏┓╻╺┳┓┏━┓┏━┓╻┏┓╻╺┳╸┏━┓
// ┣━┫┣━┛┃ ┣╸ ┃┗┫ ┃┃┣━┛┃ ┃┃┃┗┫ ┃ ┗━┓
// ╹ ╹╹ ╹ ┗━╸╹ ╹╺┻┛╹ ┗━┛╹╹ ╹ ╹ ┗━┛
// ============================================================================
// handleFilteredFeed serves the cached filtered articles as JSON (GET only).
//
// The response object has the keys "total" (number of cached articles),
// "threshold", "updated_at" (time of the last cache write) and "articles",
// a list of objects with title, url, source (omitted when empty), rating and
// score. Responses are marked non-cacheable.
//
// Like the other handlers, it answers 500 "Model not loaded" when no model is
// available instead of dereferencing a nil model in getThreshold.
func (c *ServeCommand) handleFilteredFeed(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	c.resultsMu.RLock()
	articles := c.filteredResults
	resultsTime := c.filteredResultsTime
	c.resultsMu.RUnlock()

	c.modelMu.RLock()
	model := c.model
	c.modelMu.RUnlock()
	if model == nil {
		http.Error(w, "Model not loaded", http.StatusInternalServerError)
		return
	}
	threshold, _ := c.getThreshold(model)

	type ArticleResponse struct {
		Title  string  `json:"title"`
		URL    string  `json:"url"`
		Source string  `json:"source,omitempty"`
		Rating int     `json:"rating"`
		Score  float64 `json:"score"`
	}
	scored := make([]ArticleResponse, 0, len(articles))
	for _, article := range articles {
		// Cached articles normally carry a score; treat a missing one as 0.
		score := 0.0
		if article.Score != nil {
			score = *article.Score
		}
		scored = append(scored, ArticleResponse{
			Title:  article.Title,
			URL:    article.URL,
			Source: article.Source,
			Rating: core.ScoreToScale(score, threshold),
			Score:  score,
		})
	}

	w.Header().Set("Content-Type", "application/json")
	w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate")
	if err := json.NewEncoder(w).Encode(map[string]interface{}{
		"total":      len(articles),
		"threshold":  threshold,
		"updated_at": resultsTime,
		"articles":   scored,
	}); err != nil {
		http.Error(w, "Failed to encode response", http.StatusInternalServerError)
	}
}
func (c *ServeCommand) handleFilteredRSS(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
c.resultsMu.RLock()
articles := c.filteredResults
c.resultsMu.RUnlock()
c.modelMu.RLock()
model := c.model
c.modelMu.RUnlock()
w.Header().Set("Content-Type", "application/rss+xml")
w.Header().Set("Cache-Control", "public, max-age=3600")
// Generate RSS feed
fmt.Fprintf(w, `