diff options
Diffstat (limited to 'cmds/serve.go')
| -rw-r--r-- | cmds/serve.go | 1010 |
1 files changed, 1010 insertions, 0 deletions
diff --git a/cmds/serve.go b/cmds/serve.go new file mode 100644 index 0000000..92aa64c --- /dev/null +++ b/cmds/serve.go @@ -0,0 +1,1010 @@ +// Serve command: HTTP server for web UI and APIs. +// +// Two main flows: live-feed (cached + background refresh) and tools (on-demand scoring). +// Live-feed rescans all configured RSS feeds on a timer (default 24h), caches results, +// serves filtered articles via web UI and JSON/RSS APIs. +// Tools provides real-time /score (single title) and /scan (ad-hoc feed) endpoints. +// Background refresh continues despite individual feed failures; RWMutex allows +// many concurrent readers with exclusive writer updates. +// Templates are embedded for single-binary deployment. +package cmds + +import ( + "bufio" + "context" + "embed" + "encoding/json" + "flag" + "fmt" + "html/template" + "io" + "log" + "net/http" + "net/url" + "os" + "os/signal" + "path/filepath" + "regexp" + "sort" + "strings" + "sync" + "syscall" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/mmcdole/gofeed" + "scholscan/core" +) + +//go:embed templates/*.html +var templateFS embed.FS + +// ============================================================================ +// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓ +// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃ +// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛ +// ============================================================================ + +type ServeCommand struct { + Port int + RSSWorldPath string + RefreshInterval string + ModelPath string + Title string + + // Parsed interval + refreshInterval time.Duration + // Loaded model (cached) + model *core.ModelEnvelope + modelMu sync.RWMutex + // Cached filtered RSS results and timestamp. + // RWMutex allows many concurrent readers (HTTP handlers) with exclusive writer (background refresh). + filteredResults []*core.Article + filteredResultsTime time.Time + resultsMu sync.RWMutex + // Loaded templates + tmpl *template.Template +} + +func (c *ServeCommand) Name() string { return "serve" } + +// Init configures the serve command with robust input validation. +// Prevents directory traversal, validates paths, and sets sensible defaults. +// Ensures only one configuration is possible to reduce runtime complexity. +func (c *ServeCommand) Init(args []string) error { + fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError) + fs.Usage = func() { + fmt.Fprint(fs.Output(), `Usage: scholscan serve [options] + + Start HTTP server for filtered RSS and scoring web UI. + + Flags: + `) + fs.PrintDefaults() + fmt.Fprint(fs.Output(), ` + Examples: + scholscan serve --port 8080 --rss-world rss_world.txt --model model.json + scholscan serve --refresh-interval 24h --model ./model.json --rss-world feeds.txt + `) + } + + fs.IntVar(&c.Port, "port", 8080, "Port to listen on") + fs.StringVar(&c.RSSWorldPath, "rss-world", "rss_world.txt", "Path to RSS world file (one feed URL per line)") + fs.StringVar(&c.RefreshInterval, "refresh-interval", "24h", "Interval for background rescans (e.g., 24h, 1h)") + fs.StringVar(&c.ModelPath, "model", "model.json", "Path to trained model JSON file") + fs.StringVar(&c.Title, "title", "", "Custom title for the web interface") + + if err := fs.Parse(args); err != nil { + return err + } + + if fs.NArg() != 0 { + return fmt.Errorf("unexpected arguments provided: %v", fs.Args()) + } + + // Parse refresh interval + interval, err := time.ParseDuration(c.RefreshInterval) + if err != nil { + return fmt.Errorf("invalid refresh-interval %q: %w", c.RefreshInterval, err) + } + c.refreshInterval = interval + + if strings.Contains(filepath.Clean(c.RSSWorldPath), "..") { + return fmt.Errorf("invalid rss-world path: directory traversal not allowed") + } + if strings.Contains(filepath.Clean(c.ModelPath), "..") { + return fmt.Errorf("invalid model path: directory traversal not allowed") + } + + return nil +} + +func (c *ServeCommand) Run(stdin io.Reader, stdout io.Writer) error { + log.Printf("Starting scholscan server on port %d", c.Port) + + // Initialize filteredResultsTime to server start time + c.resultsMu.Lock() + c.filteredResultsTime = time.Now() + c.resultsMu.Unlock() + + // Load templates at startup + tmpl, err := template.ParseFS(templateFS, "templates/*.html") + if err != nil { + return fmt.Errorf("failed to parse templates: %w", err) + } + c.tmpl = tmpl + log.Printf("Templates loaded successfully") + + // Load model at startup + model, err := c.loadModel() + if err != nil { + return fmt.Errorf("failed to load model at startup: %w", err) + } + c.modelMu.Lock() + c.model = model + c.modelMu.Unlock() + + log.Printf("Model loaded successfully") + + // Start background ticker for periodic refresh + ticker := time.NewTicker(c.refreshInterval) + go c.backgroundRefresh(ticker) + + // Perform initial scan asynchronously + go func() { + log.Println("Starting initial feed scan...") + if err := c.refreshFilteredResults(); err != nil { + log.Printf("Warning: initial scan failed: %v", err) + } else { + c.resultsMu.RLock() + count := len(c.filteredResults) + c.resultsMu.RUnlock() + log.Printf("Initial scan complete, %d articles filtered", count) + } + }() + + // Setup HTTP handlers + http.HandleFunc("/", c.handleRoot) + http.HandleFunc("/live-feed", c.handleLiveFeed) + http.HandleFunc("/tools", c.handleTools) + http.HandleFunc("/score", c.handleScore) + http.HandleFunc("/scan", c.handleScan) + http.HandleFunc("/api/filtered/feed", c.handleFilteredFeed) + http.HandleFunc("/api/filtered/rss", c.handleFilteredRSS) + http.HandleFunc("/api/health", c.handleHealth) + + // Setup server with graceful shutdown + server := &http.Server{ + Addr: fmt.Sprintf(":%d", c.Port), + Handler: http.DefaultServeMux, + ReadTimeout: core.DefaultReadTimeout, + WriteTimeout: core.DefaultWriteTimeout, + IdleTimeout: core.DefaultIdleTimeout, + } + + // Handle shutdown signals + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + go func() { + <-sigChan + log.Println("Shutdown signal received") + ticker.Stop() + ctx, cancel := context.WithTimeout(context.Background(), core.DefaultShutdownTimeout) + defer cancel() + if err := server.Shutdown(ctx); err != nil { + log.Printf("Server shutdown error: %v", err) + } + }() + + log.Printf("Server listening on http://localhost:%d", c.Port) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + return fmt.Errorf("server error: %w", err) + } + + return nil +} + +// ============================================================================ +// ┏━╸┏━┓┏━┓┏━╸ ╻ ┏━┓┏━╸╻┏━╸ +// ┃ ┃ ┃┣┳┛┣╸ ┃ ┃ ┃┃╺┓┃┃ +// ┗━╸┗━┛╹┗╸┗━╸ ┗━╸┗━┛┗━┛╹┗━╸ +// ============================================================================ + +func (c *ServeCommand) loadModel() (*core.ModelEnvelope, error) { + f, err := os.Open(c.ModelPath) + if err != nil { + return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err) + } + defer f.Close() + + var model core.ModelEnvelope + if err := json.NewDecoder(f).Decode(&model); err != nil { + return nil, fmt.Errorf("failed to decode model: %w", err) + } + + return &model, nil +} + +func (c *ServeCommand) scoreArticle(article *core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope) float64 { + docs := []string{strings.TrimSpace(article.Title)} + vectors := vectorizer.Transform(docs) + + if len(vectors) == 0 || len(vectors[0]) == 0 { + return 0.0 + } + + score, err := core.PredictScore(vectors[0], model.Weights) + if err != nil { + // Return 0.0 on error (below threshold). Malformed articles don't break the display, + // they just get filtered out. Log the error for diagnostics. + log.Printf("Error scoring article: %v", err) + return 0.0 + } + + return score +} + +func (c *ServeCommand) getThreshold(model *core.ModelEnvelope) (float64, error) { + if model.Meta != nil { + if threshold, ok := model.Meta["recommended_threshold"].(float64); ok { + return threshold, nil + } + } + return core.DefaultScoreThreshold, nil +} + +// scoreAndFormatArticles scores a list of articles and returns them formatted for templates. +// Articles are scored using the model and vectorizer, then returned with human-readable ratings. +func (c *ServeCommand) scoreAndFormatArticles(articles []*core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope, threshold float64) []map[string]interface{} { + type ArticleResponse struct { + Title string `json:"title"` + URL string `json:"url"` + Source string `json:"source,omitempty"` + Rating int `json:"rating"` + Score float64 `json:"score"` + } + + scored := make([]ArticleResponse, 0, len(articles)) + for _, article := range articles { + score := c.scoreArticle(article, vectorizer, model) + rating := core.ScoreToScale(score, threshold) + + scored = append(scored, ArticleResponse{ + Title: article.Title, + URL: article.URL, + Source: article.Source, + Rating: rating, + Score: score, + }) + } + + result := make([]map[string]interface{}, len(scored)) + for i, a := range scored { + result[i] = map[string]interface{}{ + "Title": a.Title, + "URL": a.URL, + "Source": a.Source, + "Rating": a.Rating, + "Score": a.Score, + } + } + return result +} + +// ============================================================================ +// ┏━┓┏━┓┏━┓ ┏━┓╺┳╸╻ ╻┏━╸┏━╸ +// ┣┳┛┗━┓┗━┓ ┗━┓ ┃ ┃ ┃┣╸ ┣╸ +// ╹┗╸┗━┛┗━┛ ┗━┛ ╹ ┗━┛╹ ╹ +// ============================================================================ + +func (c *ServeCommand) readRSSWorldFeeds() ([]string, error) { + f, err := os.Open(c.RSSWorldPath) + if err != nil { + return nil, fmt.Errorf("failed to open rss_world file %s: %w", c.RSSWorldPath, err) + } + defer f.Close() + + var feeds []string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" && !strings.HasPrefix(line, "#") { + feeds = append(feeds, line) + } + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading rss_world file: %w", err) + } + + return feeds, nil +} + +func (c *ServeCommand) refreshFilteredResults() error { + feeds, err := c.readRSSWorldFeeds() + if err != nil { + return err + } + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + return fmt.Errorf("model not loaded") + } + + // Scan all feeds. Continue on individual feed failures to maximize results. + // RSS feeds are often flaky; one down shouldn't prevent others from being processed. + var allArticles []*core.Article + for _, feed := range feeds { + articles, err := c.fetchRSSFeed(feed) + if err != nil { + log.Printf("Warning: failed to fetch feed %s: %v", feed, err) + continue + } + allArticles = append(allArticles, articles...) + } + + // Score and filter articles + threshold, err := c.getThreshold(model) + if err != nil { + return err + } + + vectorizer := core.CreateVectorizerFromModel(model) + + filtered := make([]*core.Article, 0, len(allArticles)) + for _, article := range allArticles { + score := c.scoreArticle(article, vectorizer, model) + if score >= threshold { + // Create a copy with score to avoid reference issues + articleCopy := *article + articleCopy.Score = &score + filtered = append(filtered, &articleCopy) + } + } + + c.resultsMu.Lock() + c.filteredResults = filtered + c.filteredResultsTime = time.Now() + c.resultsMu.Unlock() + + return nil +} + +// backgroundRefresh runs in a goroutine, rescanning all RSS feeds on interval. +// Failures in individual feeds don't affect others - we log and continue. +func (c *ServeCommand) backgroundRefresh(ticker *time.Ticker) { + for range ticker.C { + log.Println("Background refresh started") + if err := c.refreshFilteredResults(); err != nil { + log.Printf("Background refresh error (continuing): %v", err) + } else { + c.resultsMu.RLock() + count := len(c.filteredResults) + c.resultsMu.RUnlock() + log.Printf("Background refresh complete, %d articles filtered", count) + } + } +} + +func (c *ServeCommand) fetchRSSFeed(url string) ([]*core.Article, error) { + client := &http.Client{Timeout: core.DefaultHTTPTimeout} + + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, fmt.Errorf("error building request: %w", err) + } + req.Header.Set("User-Agent", core.PoliteUserAgent) + + ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout) + defer cancel() + + resp, err := client.Do(req.WithContext(ctx)) + if err != nil { + return nil, fmt.Errorf("error fetching %s: %w", url, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("error reading response from %s: %w", url, err) + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(strings.NewReader(string(body))) + if err != nil { + return nil, fmt.Errorf("error parsing feed from %s: %w", url, err) + } + + var articles []*core.Article + for _, item := range feed.Items { + article := &core.Article{ + URL: item.Link, + Title: strings.TrimSpace(item.Title), + Source: feed.Title, + } + + if item.PublishedParsed != nil { + article.PublishedAt = item.PublishedParsed + } + + if len(article.Title) >= core.MinTitleLength { + articles = append(articles, article) + } + } + + return articles, nil +} + +// ============================================================================ +// ╻ ╻┏━╸┏┓ ╻ ╻╻ +// ┃╻┃┣╸ ┣┻┓ ┃ ┃┃ +// ┗┻┛┗━╸┗━┛ ┗━┛╹ +// ============================================================================ + +func (c *ServeCommand) handleRoot(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + + // Redirect to live feed + http.Redirect(w, r, "/live-feed", http.StatusMovedPermanently) +} + +func (c *ServeCommand) handleLiveFeed(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.resultsMu.RLock() + articles := c.filteredResults + resultsTime := c.filteredResultsTime + c.resultsMu.RUnlock() + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + http.Error(w, "Model not loaded", http.StatusInternalServerError) + return + } + + threshold, _ := c.getThreshold(model) + + // Parse filter parameter (day, week, or all) + filter := r.URL.Query().Get("filter") + if filter == "" { + filter = "all" + } + + // Filter articles by date if needed + now := time.Now() + filtered := articles + if filter == "day" || filter == "week" { + var cutoff time.Time + if filter == "day" { + cutoff = now.Add(-24 * time.Hour) + } else if filter == "week" { + cutoff = now.Add(-7 * 24 * time.Hour) + } + + filtered = make([]*core.Article, 0, len(articles)) + for _, article := range articles { + // Always include articles without PublishedAt + if article.PublishedAt == nil || article.PublishedAt.After(cutoff) { + filtered = append(filtered, article) + } + } + } + + // Convert articles to template format + type TemplateArticle struct { + Title string + URL string + Source string + Rating int + Score float64 + PublishedAt string + } + + templateArticles := make([]TemplateArticle, 0, len(filtered)) + for _, article := range filtered { + score := 0.0 + if article.Score != nil { + score = *article.Score + } + rating := core.ScoreToScale(score, threshold) + + publishedAt := "" + if article.PublishedAt != nil { + publishedAt = article.PublishedAt.Format("2006-01-02") + } + + templateArticles = append(templateArticles, TemplateArticle{ + Title: article.Title, + URL: article.URL, + Source: article.Source, + Rating: rating, + Score: score, + PublishedAt: publishedAt, + }) + } + + // Sort articles by score (highest first) + sort.Slice(templateArticles, func(i, j int) bool { + return templateArticles[i].Score > templateArticles[j].Score + }) + + data := map[string]interface{}{ + "Page": "live-feed", + "Articles": templateArticles, + "Threshold": threshold, + "UpdatedAt": resultsTime.Format("2006-01-02 15:04:05"), + "Filter": filter, + "Title": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "live-feed", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleTools(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + data := map[string]interface{}{ + "Page": "tools", + "Title": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "tools", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleScore(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet { + c.handleTools(w, r) + return + } + + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + http.Error(w, "Model not loaded", http.StatusInternalServerError) + return + } + + if err := r.ParseForm(); err != nil { + http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest) + return + } + + title := strings.TrimSpace(r.FormValue("title")) + url := strings.TrimSpace(r.FormValue("url")) + + // If URL provided, fetch and extract title from it; otherwise use provided title. + if url != "" { + extractedTitle, err := extractTitleFromURL(url) + if err != nil { + c.renderResultsError(w, fmt.Sprintf("Failed to extract title from URL: %v", err), title) + return + } + title = extractedTitle + } + + // Validate input before scoring + if valErr := c.validateTitle(title); valErr != "" { + c.renderResultsError(w, valErr, title) + return + } + + vectorizer := core.CreateVectorizerFromModel(model) + article := &core.Article{Title: title} + score := c.scoreArticle(article, vectorizer, model) + + threshold, _ := c.getThreshold(model) + rating := core.ScoreToScale(score, threshold) + + data := map[string]interface{}{ + "Page": "tools", + "IsScoreResult": true, + "Title": title, + "Rating": rating, + "Score": score, + "Threshold": threshold, + "PageTitle": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleScan(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet { + c.handleTools(w, r) + return + } + + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + http.Error(w, "Model not loaded", http.StatusInternalServerError) + return + } + + if err := r.ParseForm(); err != nil { + http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest) + return + } + + feedURL := strings.TrimSpace(r.FormValue("feed_url")) + + // Validate and fetch the feed + if valErr := c.validateFeedURL(feedURL); valErr != "" { + c.renderScanResultsError(w, valErr, feedURL) + return + } + + articles, err := c.fetchRSSFeed(feedURL) + if err != nil { + c.renderScanResultsError(w, fmt.Sprintf("Failed to fetch feed: %v", err), feedURL) + return + } + + // Score articles + threshold, _ := c.getThreshold(model) + vectorizer := core.CreateVectorizerFromModel(model) + scored := c.scoreAndFormatArticles(articles, vectorizer, model, threshold) + + sort.Slice(scored, func(i, j int) bool { + iScore := scored[i]["Score"].(float64) + jScore := scored[j]["Score"].(float64) + return iScore > jScore + }) + + data := map[string]interface{}{ + "Page": "tools", + "IsScanResult": true, + "FeedURL": feedURL, + "Articles": scored, + "Threshold": threshold, + "PageTitle": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +// ============================================================================ +// ┏━┓┏━┓╻ ┏━╸┏┓╻╺┳┓┏━┓┏━┓╻┏┓╻╺┳╸┏━┓ +// ┣━┫┣━┛┃ ┣╸ ┃┗┫ ┃┃┣━┛┃ ┃┃┃┗┫ ┃ ┗━┓ +// ╹ ╹╹ ╹ ┗━╸╹ ╹╺┻┛╹ ┗━┛╹╹ ╹ ╹ ┗━┛ +// ============================================================================ + +func (c *ServeCommand) handleFilteredFeed(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.resultsMu.RLock() + articles := c.filteredResults + resultsTime := c.filteredResultsTime + c.resultsMu.RUnlock() + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + threshold, _ := c.getThreshold(model) + + type ArticleResponse struct { + Title string `json:"title"` + URL string `json:"url"` + Source string `json:"source,omitempty"` + Rating int `json:"rating"` + Score float64 `json:"score"` + } + + scored := make([]ArticleResponse, 0, len(articles)) + for _, article := range articles { + score := 0.0 + if article.Score != nil { + score = *article.Score + } + rating := core.ScoreToScale(score, threshold) + + scored = append(scored, ArticleResponse{ + Title: article.Title, + URL: article.URL, + Source: article.Source, + Rating: rating, + Score: score, + }) + } + + w.Header().Set("Content-Type", "application/json") + w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate") + + if err := json.NewEncoder(w).Encode(map[string]interface{}{ + "total": len(articles), + "threshold": threshold, + "updated_at": resultsTime, + "articles": scored, + }); err != nil { + http.Error(w, "Failed to encode response", http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleFilteredRSS(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.resultsMu.RLock() + articles := c.filteredResults + c.resultsMu.RUnlock() + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + w.Header().Set("Content-Type", "application/rss+xml") + w.Header().Set("Cache-Control", "public, max-age=3600") + + // Generate RSS feed + fmt.Fprintf(w, `<?xml version="1.0" encoding="UTF-8"?> + <rss version="2.0"> + <channel> + <title>%s - Filtered Articles</title> + <link>http://scholscan.local</link> + <description>Articles filtered by your learned preferences (scored 1-10)</description> + `, displayTitle(c.Title)) + + for _, article := range articles { + rawScore := 0.0 + if article.Score != nil { + rawScore = *article.Score + } + + threshold, _ := c.getThreshold(model) + scaledScore := core.ScoreToScale(rawScore, threshold) + + title := escapeXML(article.Title) + url := escapeXML(article.URL) + description := fmt.Sprintf("SCHOLSCAN SCORE = %d/10 (raw: %.3f)", scaledScore, rawScore) + + fmt.Fprintf(w, ` <item> + <title>%s</title> + <link>%s</link> + <description>%s</description> + </item> + `, title, url, description) + } + + fmt.Fprint(w, ` </channel> + </rss>`) +} + +func (c *ServeCommand) handleHealth(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.modelMu.RLock() + modelLoaded := c.model != nil + c.modelMu.RUnlock() + + status := "ok" + if !modelLoaded { + status = "model_not_loaded" + w.WriteHeader(http.StatusInternalServerError) + } + + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(map[string]interface{}{ + "status": status, + "model_loaded": modelLoaded, + "timestamp": time.Now().Unix(), + }); err != nil { + http.Error(w, "Failed to encode response", http.StatusInternalServerError) + } +} + +// ============================================================================ +// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓ +// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓ +// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛ +// ============================================================================ + +func displayTitle(custom string) string { + if custom != "" { + return custom + } + return "ScholScan" +} + +// extractTitleFromURL fetches the content from a URL and extracts the title from the HTML. +// Designed to be resilient: tries multiple title sources, handles various URL formats, +// and provides meaningful error feedback if extraction fails. +func extractTitleFromURL(rawURL string) (string, error) { + if rawURL == "" { + return "", fmt.Errorf("empty URL") + } + + // Check if it's a DOI + if strings.HasPrefix(rawURL, "10.") { + // Convert DOI to URL + rawURL = fmt.Sprintf("https://doi.org/%s", rawURL) + } else if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") { + rawURL = "https://" + rawURL + } + + ctx, cancel := context.WithTimeout(context.Background(), core.DefaultContextTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil) + if err != nil { + return "", fmt.Errorf("invalid URL: %w", err) + } + req.Header.Set("User-Agent", core.PoliteUserAgent) + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + + resp, err := core.DoRequestWithRetry(ctx, core.DefaultHTTPClient, req) + if err != nil { + return "", fmt.Errorf("failed to fetch URL: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) + } + + doc, err := goquery.NewDocumentFromReader(resp.Body) + if err != nil { + return "", fmt.Errorf("failed to parse HTML: %w", err) + } + + // Fallback chain: <title> → og:title → twitter:title → <h1> + // Different sites populate these differently; trying multiple increases success rate. + title := "" + + if t := doc.Find("title").Text(); t != "" { + title = strings.TrimSpace(t) + } + + if title == "" { + if t, exists := doc.Find(`meta[property="og:title"]`).Attr("content"); exists && t != "" { + title = strings.TrimSpace(t) + } + } + + if title == "" { + if t, exists := doc.Find(`meta[name="twitter:title"]`).Attr("content"); exists && t != "" { + title = strings.TrimSpace(t) + } + } + + if title == "" { + if t := doc.Find("h1").First().Text(); t != "" { + title = strings.TrimSpace(t) + } + } + + if title == "" { + return "", fmt.Errorf("could not extract title from page") + } + + // Clean up common title patterns + reClean := regexp.MustCompile(`\s*\|\s*`) + title = reClean.ReplaceAllString(title, "") + + rePub := regexp.MustCompile(`^[^|]*\|\s*`) + title = rePub.ReplaceAllString(title, "") + title = strings.TrimSpace(title) + + if len(title) < 10 { + return "", fmt.Errorf("extracted title too short: %q", title) + } + + return title, nil +} + +// validateTitle checks that a title is suitable for scoring. +// Returns an error message string if invalid, empty string if valid. +func (c *ServeCommand) validateTitle(title string) string { + if strings.TrimSpace(title) == "" { + return "Title cannot be empty" + } + if len(title) > 1000 { + return "Title too long (max 1000 characters)" + } + return "" +} + +// renderResultsError renders the results template with an error message. +func (c *ServeCommand) renderResultsError(w http.ResponseWriter, errMsg, title string) { + data := map[string]interface{}{ + "Page": "tools", + "IsScoreResult": true, + "Error": errMsg, + "Title": title, + "PageTitle": displayTitle(c.Title), + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +// validateFeedURL checks that a feed URL is non-empty and has valid format. +// Returns an error message string if invalid, empty string if valid. +func (c *ServeCommand) validateFeedURL(feedURL string) string { + if feedURL == "" { + return "Feed URL cannot be empty" + } + if _, err := url.Parse(feedURL); err != nil { + return "Invalid URL format" + } + return "" +} + +// renderScanResultsError renders the results template with an error for scan operation. +func (c *ServeCommand) renderScanResultsError(w http.ResponseWriter, errMsg, feedURL string) { + data := map[string]interface{}{ + "Page": "tools", + "IsScanResult": true, + "Error": errMsg, + "FeedURL": feedURL, + "PageTitle": displayTitle(c.Title), + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +func escapeXML(s string) string { + s = strings.ReplaceAll(s, "&", "&") + s = strings.ReplaceAll(s, "<", "<") + s = strings.ReplaceAll(s, ">", ">") + s = strings.ReplaceAll(s, "\"", """) + s = strings.ReplaceAll(s, "'", "'") + return s +} |
