// Serve command: HTTP server for web UI and APIs.
//
// Two main flows: live-feed (cached + background refresh) and tools (on-demand scoring).
// Live-feed rescans all configured RSS feeds on a timer (default 24h), caches results,
// serves filtered articles via web UI and JSON/RSS APIs.
// Tools provides real-time /score (single title) and /scan (ad-hoc feed) endpoints.
// Background refresh continues despite individual feed failures; RWMutex allows
// many concurrent readers with exclusive writer updates.
// Templates are embedded for single-binary deployment.
package cmds

import (
    "bufio"
    "context"
    "embed"
    "encoding/json"
    "flag"
    "fmt"
    "html/template"
    "io"
    "log"
    "net/http"
    "net/url"
    "os"
    "os/signal"
    "path/filepath"
    "regexp"
    "sort"
    "strings"
    "sync"
    "syscall"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/mmcdole/gofeed"

    "scholscan/core"
)

//go:embed templates/*.html
var templateFS embed.FS

// ============================================================================
// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
// ┃  ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
// ============================================================================

type ServeCommand struct {
    Port            int
    RSSWorldPath    string
    RefreshInterval string
    ModelPath       string
    Title           string

    // Parsed interval
    refreshInterval time.Duration

    // Loaded model (cached)
    model   *core.ModelEnvelope
    modelMu sync.RWMutex

    // Cached filtered RSS results and timestamp.
    // RWMutex allows many concurrent readers (HTTP handlers) with exclusive writer (background refresh).
    filteredResults     []*core.Article
    filteredResultsTime time.Time
    resultsMu           sync.RWMutex

    // Loaded templates
    tmpl *template.Template
}

func (c *ServeCommand) Name() string { return "serve" }
// Init configures the serve command with robust input validation.
// It prevents directory traversal, validates paths, and sets sensible defaults.
// Unexpected positional arguments are rejected, so configuration is flag-driven only.
func (c *ServeCommand) Init(args []string) error {
    fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
    fs.Usage = func() {
        fmt.Fprint(fs.Output(), `Usage: scholscan serve [options]

Start HTTP server for filtered RSS and scoring web UI.

Flags:
`)
        fs.PrintDefaults()
        fmt.Fprint(fs.Output(), `
Examples:
  scholscan serve --port 8080 --rss-world rss_world.txt --model model.json
  scholscan serve --refresh-interval 24h --model ./model.json --rss-world feeds.txt
`)
    }
    fs.IntVar(&c.Port, "port", 8080, "Port to listen on")
    fs.StringVar(&c.RSSWorldPath, "rss-world", "rss_world.txt", "Path to RSS world file (one feed URL per line)")
    fs.StringVar(&c.RefreshInterval, "refresh-interval", "24h", "Interval for background rescans (e.g., 24h, 1h)")
    fs.StringVar(&c.ModelPath, "model", "model.json", "Path to trained model JSON file")
    fs.StringVar(&c.Title, "title", "", "Custom title for the web interface")

    if err := fs.Parse(args); err != nil {
        return err
    }
    if fs.NArg() != 0 {
        return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
    }

    // Parse refresh interval
    interval, err := time.ParseDuration(c.RefreshInterval)
    if err != nil {
        return fmt.Errorf("invalid refresh-interval %q: %w", c.RefreshInterval, err)
    }
    c.refreshInterval = interval

    if strings.Contains(filepath.Clean(c.RSSWorldPath), "..") {
        return fmt.Errorf("invalid rss-world path: directory traversal not allowed")
    }
    if strings.Contains(filepath.Clean(c.ModelPath), "..") {
        return fmt.Errorf("invalid model path: directory traversal not allowed")
    }
    return nil
}

func (c *ServeCommand) Run(stdin io.Reader, stdout io.Writer) error {
    log.Printf("Starting scholscan server on port %d", c.Port)

    // Initialize filteredResultsTime to server start time
    c.resultsMu.Lock()
    c.filteredResultsTime = time.Now()
    c.resultsMu.Unlock()

    // Load templates at startup
    tmpl, err := template.ParseFS(templateFS, "templates/*.html")
    if err != nil {
        return fmt.Errorf("failed to parse templates: %w", err)
    }
    c.tmpl = tmpl
    log.Printf("Templates loaded successfully")

    // Load model at startup
    model, err := c.loadModel()
    if err != nil {
        return fmt.Errorf("failed to load model at startup: %w", err)
    }
    c.modelMu.Lock()
    c.model = model
    c.modelMu.Unlock()
    log.Printf("Model loaded successfully")

    // Start background ticker for periodic refresh
    ticker := time.NewTicker(c.refreshInterval)
    go c.backgroundRefresh(ticker)

    // Perform initial scan asynchronously
    go func() {
        log.Println("Starting initial feed scan...")
        if err := c.refreshFilteredResults(); err != nil {
            log.Printf("Warning: initial scan failed: %v", err)
        } else {
            c.resultsMu.RLock()
            count := len(c.filteredResults)
            c.resultsMu.RUnlock()
            log.Printf("Initial scan complete, %d articles filtered", count)
        }
    }()

    // Setup HTTP handlers
    http.HandleFunc("/", c.handleRoot)
    http.HandleFunc("/live-feed", c.handleLiveFeed)
    http.HandleFunc("/tools", c.handleTools)
    http.HandleFunc("/score", c.handleScore)
    http.HandleFunc("/scan", c.handleScan)
    http.HandleFunc("/api/filtered/feed", c.handleFilteredFeed)
    http.HandleFunc("/api/filtered/rss", c.handleFilteredRSS)
    http.HandleFunc("/api/health", c.handleHealth)

    // Setup server with graceful shutdown
    server := &http.Server{
        Addr:         fmt.Sprintf(":%d", c.Port),
        Handler:      http.DefaultServeMux,
        ReadTimeout:  core.DefaultReadTimeout,
        WriteTimeout: core.DefaultWriteTimeout,
        IdleTimeout:  core.DefaultIdleTimeout,
    }

    // Handle shutdown signals
    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
    go func() {
        <-sigChan
        log.Println("Shutdown signal received")
        ticker.Stop()
        ctx, cancel := context.WithTimeout(context.Background(), core.DefaultShutdownTimeout)
        defer cancel()
        if err := server.Shutdown(ctx); err != nil {
            log.Printf("Server shutdown error: %v", err)
        }
    }()

    log.Printf("Server listening on http://localhost:%d", c.Port)
    if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
        return fmt.Errorf("server error: %w", err)
    }
    return nil
}
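// Once running, the HTTP surface registered in Run can be exercised from a shell.
// These requests are illustrative only; the port matches the --port default and the
// paths match the handlers registered above.
//
//    curl http://localhost:8080/api/health
//    curl http://localhost:8080/api/filtered/feed
//    curl http://localhost:8080/api/filtered/rss
//    curl "http://localhost:8080/live-feed?filter=week"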
// ============================================================================
// ┏━╸┏━┓┏━┓┏━╸ ╻  ┏━┓┏━╸╻┏━╸
// ┃  ┃ ┃┣┳┛┣╸  ┃  ┃ ┃┃╺┓┃┃
// ┗━╸┗━┛╹┗╸┗━╸ ┗━╸┗━┛┗━┛╹┗━╸
// ============================================================================

func (c *ServeCommand) loadModel() (*core.ModelEnvelope, error) {
    f, err := os.Open(c.ModelPath)
    if err != nil {
        return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
    }
    defer f.Close()

    var model core.ModelEnvelope
    if err := json.NewDecoder(f).Decode(&model); err != nil {
        return nil, fmt.Errorf("failed to decode model: %w", err)
    }
    return &model, nil
}

func (c *ServeCommand) scoreArticle(article *core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope) float64 {
    docs := []string{strings.TrimSpace(article.Title)}
    vectors := vectorizer.Transform(docs)
    if len(vectors) == 0 || len(vectors[0]) == 0 {
        return 0.0
    }
    score, err := core.PredictScore(vectors[0], model.Weights)
    if err != nil {
        // Return 0.0 on error (below threshold). Malformed articles don't break the display,
        // they just get filtered out. Log the error for diagnostics.
        log.Printf("Error scoring article: %v", err)
        return 0.0
    }
    return score
}

func (c *ServeCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
    if model.Meta != nil {
        if threshold, ok := model.Meta["recommended_threshold"].(float64); ok {
            return threshold, nil
        }
    }
    return core.DefaultScoreThreshold, nil
}

// scoreAndFormatArticles scores a list of articles and returns them formatted for templates.
// Articles are scored using the model and vectorizer, then returned with human-readable ratings.
func (c *ServeCommand) scoreAndFormatArticles(articles []*core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope, threshold float64) []map[string]interface{} {
    type ArticleResponse struct {
        Title  string  `json:"title"`
        URL    string  `json:"url"`
        Source string  `json:"source,omitempty"`
        Rating int     `json:"rating"`
        Score  float64 `json:"score"`
    }

    scored := make([]ArticleResponse, 0, len(articles))
    for _, article := range articles {
        score := c.scoreArticle(article, vectorizer, model)
        rating := core.ScoreToScale(score, threshold)
        scored = append(scored, ArticleResponse{
            Title:  article.Title,
            URL:    article.URL,
            Source: article.Source,
            Rating: rating,
            Score:  score,
        })
    }

    result := make([]map[string]interface{}, len(scored))
    for i, a := range scored {
        result[i] = map[string]interface{}{
            "Title":  a.Title,
            "URL":    a.URL,
            "Source": a.Source,
            "Rating": a.Rating,
            "Score":  a.Score,
        }
    }
    return result
}

// ============================================================================
// ┏━┓┏━┓┏━┓ ┏━┓╺┳╸╻ ╻┏━╸┏━╸
// ┣┳┛┗━┓┗━┓ ┗━┓ ┃ ┃ ┃┣╸ ┣╸
// ╹┗╸┗━┛┗━┛ ┗━┛ ╹ ┗━┛╹  ╹
// ============================================================================

func (c *ServeCommand) readRSSWorldFeeds() ([]string, error) {
    f, err := os.Open(c.RSSWorldPath)
    if err != nil {
        return nil, fmt.Errorf("failed to open rss_world file %s: %w", c.RSSWorldPath, err)
    }
    defer f.Close()

    var feeds []string
    scanner := bufio.NewScanner(f)
    for scanner.Scan() {
        line := strings.TrimSpace(scanner.Text())
        if line != "" && !strings.HasPrefix(line, "#") {
            feeds = append(feeds, line)
        }
    }
    if err := scanner.Err(); err != nil {
        return nil, fmt.Errorf("error reading rss_world file: %w", err)
    }
    return feeds, nil
}
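// An illustrative rss_world.txt as readRSSWorldFeeds expects it: one feed URL per
// line, with blank lines and lines starting with '#' ignored. The URLs below are
// placeholders, not real feeds.
//
//    # preprint and journal feeds
//    https://example.org/rss/latest.xml
//    https://example.com/feeds/new-articles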
func (c *ServeCommand) refreshFilteredResults() error {
    feeds, err := c.readRSSWorldFeeds()
    if err != nil {
        return err
    }

    c.modelMu.RLock()
    model := c.model
    c.modelMu.RUnlock()
    if model == nil {
        return fmt.Errorf("model not loaded")
    }

    // Scan all feeds. Continue on individual feed failures to maximize results.
    // RSS feeds are often flaky; one down shouldn't prevent others from being processed.
    var allArticles []*core.Article
    for _, feed := range feeds {
        articles, err := c.fetchRSSFeed(feed)
        if err != nil {
            log.Printf("Warning: failed to fetch feed %s: %v", feed, err)
            continue
        }
        allArticles = append(allArticles, articles...)
    }

    // Score and filter articles
    threshold, err := c.getThreshold(model)
    if err != nil {
        return err
    }
    vectorizer := core.CreateVectorizerFromModel(model)

    filtered := make([]*core.Article, 0, len(allArticles))
    for _, article := range allArticles {
        score := c.scoreArticle(article, vectorizer, model)
        if score >= threshold {
            // Create a copy with score to avoid reference issues
            articleCopy := *article
            articleCopy.Score = &score
            filtered = append(filtered, &articleCopy)
        }
    }

    c.resultsMu.Lock()
    c.filteredResults = filtered
    c.filteredResultsTime = time.Now()
    c.resultsMu.Unlock()
    return nil
}

// backgroundRefresh runs in a goroutine, rescanning all RSS feeds on interval.
// Failures in individual feeds don't affect others - we log and continue.
func (c *ServeCommand) backgroundRefresh(ticker *time.Ticker) {
    for range ticker.C {
        log.Println("Background refresh started")
        if err := c.refreshFilteredResults(); err != nil {
            log.Printf("Background refresh error (continuing): %v", err)
        } else {
            c.resultsMu.RLock()
            count := len(c.filteredResults)
            c.resultsMu.RUnlock()
            log.Printf("Background refresh complete, %d articles filtered", count)
        }
    }
}

func (c *ServeCommand) fetchRSSFeed(url string) ([]*core.Article, error) {
    client := &http.Client{Timeout: core.DefaultHTTPTimeout}
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return nil, fmt.Errorf("error building request: %w", err)
    }
    req.Header.Set("User-Agent", core.PoliteUserAgent)

    ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
    defer cancel()

    resp, err := client.Do(req.WithContext(ctx))
    if err != nil {
        return nil, fmt.Errorf("error fetching %s: %w", url, err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
    }

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, fmt.Errorf("error reading response from %s: %w", url, err)
    }

    fp := gofeed.NewParser()
    feed, err := fp.Parse(strings.NewReader(string(body)))
    if err != nil {
        return nil, fmt.Errorf("error parsing feed from %s: %w", url, err)
    }

    var articles []*core.Article
    for _, item := range feed.Items {
        article := &core.Article{
            URL:    item.Link,
            Title:  strings.TrimSpace(item.Title),
            Source: feed.Title,
        }
        if item.PublishedParsed != nil {
            article.PublishedAt = item.PublishedParsed
        }
        if len(article.Title) >= core.MinTitleLength {
            articles = append(articles, article)
        }
    }
    return articles, nil
}

// ============================================================================
// ╻ ╻┏━╸┏┓  ╻ ╻╻
// ┃╻┃┣╸ ┣┻┓ ┃ ┃┃
// ┗┻┛┗━╸┗━┛ ┗━┛╹
// ============================================================================

func (c *ServeCommand) handleRoot(w http.ResponseWriter, r *http.Request) {
    if r.URL.Path != "/" {
        http.NotFound(w, r)
        return
    }
    // Redirect to live feed
    http.Redirect(w, r, "/live-feed", http.StatusMovedPermanently)
}
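// handleLiveFeed renders the cached, filtered articles as HTML. An optional ?filter=
// query parameter narrows the list to the last day ("day") or week ("week"); any other
// value, or none, shows everything, e.g. /live-feed?filter=day.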
func (c *ServeCommand) handleLiveFeed(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodGet {
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
        return
    }

    c.resultsMu.RLock()
    articles := c.filteredResults
    resultsTime := c.filteredResultsTime
    c.resultsMu.RUnlock()

    c.modelMu.RLock()
    model := c.model
    c.modelMu.RUnlock()
    if model == nil {
        http.Error(w, "Model not loaded", http.StatusInternalServerError)
        return
    }
    threshold, _ := c.getThreshold(model)

    // Parse filter parameter (day, week, or all)
    filter := r.URL.Query().Get("filter")
    if filter == "" {
        filter = "all"
    }

    // Filter articles by date if needed
    now := time.Now()
    filtered := articles
    if filter == "day" || filter == "week" {
        var cutoff time.Time
        if filter == "day" {
            cutoff = now.Add(-24 * time.Hour)
        } else if filter == "week" {
            cutoff = now.Add(-7 * 24 * time.Hour)
        }
        filtered = make([]*core.Article, 0, len(articles))
        for _, article := range articles {
            // Always include articles without PublishedAt
            if article.PublishedAt == nil || article.PublishedAt.After(cutoff) {
                filtered = append(filtered, article)
            }
        }
    }

    // Convert articles to template format
    type TemplateArticle struct {
        Title       string
        URL         string
        Source      string
        Rating      int
        Score       float64
        PublishedAt string
    }
    templateArticles := make([]TemplateArticle, 0, len(filtered))
    for _, article := range filtered {
        score := 0.0
        if article.Score != nil {
            score = *article.Score
        }
        rating := core.ScoreToScale(score, threshold)
        publishedAt := ""
        if article.PublishedAt != nil {
            publishedAt = article.PublishedAt.Format("2006-01-02")
        }
        templateArticles = append(templateArticles, TemplateArticle{
            Title:       article.Title,
            URL:         article.URL,
            Source:      article.Source,
            Rating:      rating,
            Score:       score,
            PublishedAt: publishedAt,
        })
    }

    // Sort articles by score (highest first)
    sort.Slice(templateArticles, func(i, j int) bool {
        return templateArticles[i].Score > templateArticles[j].Score
    })

    data := map[string]interface{}{
        "Page":      "live-feed",
        "Articles":  templateArticles,
        "Threshold": threshold,
        "UpdatedAt": resultsTime.Format("2006-01-02 15:04:05"),
        "Filter":    filter,
        "Title":     displayTitle(c.Title),
    }
    w.Header().Set("Content-Type", "text/html; charset=utf-8")
    if err := c.tmpl.ExecuteTemplate(w, "live-feed", data); err != nil {
        http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
    }
}

func (c *ServeCommand) handleTools(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodGet {
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
        return
    }
    data := map[string]interface{}{
        "Page":  "tools",
        "Title": displayTitle(c.Title),
    }
    w.Header().Set("Content-Type", "text/html; charset=utf-8")
    if err := c.tmpl.ExecuteTemplate(w, "tools", data); err != nil {
        http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
    }
}
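// handleScore accepts the tools form and scores a single title; if a "url" field is
// supplied instead, the title is first extracted from that page (see extractTitleFromURL).
// An equivalent ad-hoc request, purely for illustration with placeholder values:
//
//    curl -X POST http://localhost:8080/score -d 'title=Some candidate article title'
//    curl -X POST http://localhost:8080/score -d 'url=example.org/some-article'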
func (c *ServeCommand) handleScore(w http.ResponseWriter, r *http.Request) {
    if r.Method == http.MethodGet {
        c.handleTools(w, r)
        return
    }
    if r.Method != http.MethodPost {
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
        return
    }

    c.modelMu.RLock()
    model := c.model
    c.modelMu.RUnlock()
    if model == nil {
        http.Error(w, "Model not loaded", http.StatusInternalServerError)
        return
    }

    if err := r.ParseForm(); err != nil {
        http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
        return
    }
    title := strings.TrimSpace(r.FormValue("title"))
    url := strings.TrimSpace(r.FormValue("url"))

    // If URL provided, fetch and extract title from it; otherwise use provided title.
    if url != "" {
        extractedTitle, err := extractTitleFromURL(url)
        if err != nil {
            c.renderResultsError(w, fmt.Sprintf("Failed to extract title from URL: %v", err), title)
            return
        }
        title = extractedTitle
    }

    // Validate input before scoring
    if valErr := c.validateTitle(title); valErr != "" {
        c.renderResultsError(w, valErr, title)
        return
    }

    vectorizer := core.CreateVectorizerFromModel(model)
    article := &core.Article{Title: title}
    score := c.scoreArticle(article, vectorizer, model)
    threshold, _ := c.getThreshold(model)
    rating := core.ScoreToScale(score, threshold)

    data := map[string]interface{}{
        "Page":          "tools",
        "IsScoreResult": true,
        "Title":         title,
        "Rating":        rating,
        "Score":         score,
        "Threshold":     threshold,
        "PageTitle":     displayTitle(c.Title),
    }
    w.Header().Set("Content-Type", "text/html; charset=utf-8")
    if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
        http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
    }
}

func (c *ServeCommand) handleScan(w http.ResponseWriter, r *http.Request) {
    if r.Method == http.MethodGet {
        c.handleTools(w, r)
        return
    }
    if r.Method != http.MethodPost {
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
        return
    }

    c.modelMu.RLock()
    model := c.model
    c.modelMu.RUnlock()
    if model == nil {
        http.Error(w, "Model not loaded", http.StatusInternalServerError)
        return
    }

    if err := r.ParseForm(); err != nil {
        http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
        return
    }
    feedURL := strings.TrimSpace(r.FormValue("feed_url"))

    // Validate and fetch the feed
    if valErr := c.validateFeedURL(feedURL); valErr != "" {
        c.renderScanResultsError(w, valErr, feedURL)
        return
    }
    articles, err := c.fetchRSSFeed(feedURL)
    if err != nil {
        c.renderScanResultsError(w, fmt.Sprintf("Failed to fetch feed: %v", err), feedURL)
        return
    }

    // Score articles
    threshold, _ := c.getThreshold(model)
    vectorizer := core.CreateVectorizerFromModel(model)
    scored := c.scoreAndFormatArticles(articles, vectorizer, model, threshold)
    sort.Slice(scored, func(i, j int) bool {
        iScore := scored[i]["Score"].(float64)
        jScore := scored[j]["Score"].(float64)
        return iScore > jScore
    })

    data := map[string]interface{}{
        "Page":         "tools",
        "IsScanResult": true,
        "FeedURL":      feedURL,
        "Articles":     scored,
        "Threshold":    threshold,
        "PageTitle":    displayTitle(c.Title),
    }
    w.Header().Set("Content-Type", "text/html; charset=utf-8")
    if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
        http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
    }
}
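// handleScan above reads a single "feed_url" form field, fetches that feed, and scores
// every item in it. An illustrative request (the feed URL is a placeholder):
//
//    curl -X POST http://localhost:8080/scan -d 'feed_url=https://example.org/rss/latest.xml'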
// ============================================================================
// ┏━┓┏━┓╻ ┏━╸┏┓╻╺┳┓┏━┓┏━┓╻┏┓╻╺┳╸┏━┓
// ┣━┫┣━┛┃ ┣╸ ┃┗┫ ┃┃┣━┛┃ ┃┃┃┗┫ ┃ ┗━┓
// ╹ ╹╹  ╹ ┗━╸╹ ╹╺┻┛╹  ┗━┛╹╹ ╹ ╹ ┗━┛
// ============================================================================

func (c *ServeCommand) handleFilteredFeed(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodGet {
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
        return
    }

    c.resultsMu.RLock()
    articles := c.filteredResults
    resultsTime := c.filteredResultsTime
    c.resultsMu.RUnlock()

    c.modelMu.RLock()
    model := c.model
    c.modelMu.RUnlock()
    threshold, _ := c.getThreshold(model)

    type ArticleResponse struct {
        Title  string  `json:"title"`
        URL    string  `json:"url"`
        Source string  `json:"source,omitempty"`
        Rating int     `json:"rating"`
        Score  float64 `json:"score"`
    }
    scored := make([]ArticleResponse, 0, len(articles))
    for _, article := range articles {
        score := 0.0
        if article.Score != nil {
            score = *article.Score
        }
        rating := core.ScoreToScale(score, threshold)
        scored = append(scored, ArticleResponse{
            Title:  article.Title,
            URL:    article.URL,
            Source: article.Source,
            Rating: rating,
            Score:  score,
        })
    }

    w.Header().Set("Content-Type", "application/json")
    w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate")
    if err := json.NewEncoder(w).Encode(map[string]interface{}{
        "total":      len(articles),
        "threshold":  threshold,
        "updated_at": resultsTime,
        "articles":   scored,
    }); err != nil {
        http.Error(w, "Failed to encode response", http.StatusInternalServerError)
    }
}

func (c *ServeCommand) handleFilteredRSS(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodGet {
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
        return
    }

    c.resultsMu.RLock()
    articles := c.filteredResults
    c.resultsMu.RUnlock()

    c.modelMu.RLock()
    model := c.model
    c.modelMu.RUnlock()

    w.Header().Set("Content-Type", "application/rss+xml")
    w.Header().Set("Cache-Control", "public, max-age=3600")

    // Generate RSS feed
    fmt.Fprintf(w, `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>%s - Filtered Articles</title>
<link>http://scholscan.local</link>
<description>Articles filtered by your learned preferences (scored 1-10)</description>
`, displayTitle(c.Title))

    for _, article := range articles {
        rawScore := 0.0
        if article.Score != nil {
            rawScore = *article.Score
        }
        threshold, _ := c.getThreshold(model)
        scaledScore := core.ScoreToScale(rawScore, threshold)
        title := escapeXML(article.Title)
        url := escapeXML(article.URL)
        description := fmt.Sprintf("SCHOLSCAN SCORE = %d/10 (raw: %.3f)", scaledScore, rawScore)
        fmt.Fprintf(w, `<item>
<title>%s</title>
<link>%s</link>
<description>%s</description>
</item>
`, title, url, description)
    }

    fmt.Fprint(w, `</channel>
</rss>
`)
}

func (c *ServeCommand) handleHealth(w http.ResponseWriter, r *http.Request) {
    if r.Method != http.MethodGet {
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
        return
    }

    c.modelMu.RLock()
    modelLoaded := c.model != nil
    c.modelMu.RUnlock()

    // Set Content-Type before any WriteHeader call so the header isn't dropped.
    w.Header().Set("Content-Type", "application/json")
    status := "ok"
    if !modelLoaded {
        status = "model_not_loaded"
        w.WriteHeader(http.StatusInternalServerError)
    }
    if err := json.NewEncoder(w).Encode(map[string]interface{}{
        "status":       status,
        "model_loaded": modelLoaded,
        "timestamp":    time.Now().Unix(),
    }); err != nil {
        http.Error(w, "Failed to encode response", http.StatusInternalServerError)
    }
}
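// For reference, the JSON endpoints above produce payloads shaped like the following.
// Field names come from the structs and maps encoded in the handlers; the values are
// made up for illustration.
//
//    GET /api/health        {"status":"ok","model_loaded":true,"timestamp":1700000000}
//    GET /api/filtered/feed {"total":1,"threshold":0.5,"updated_at":"2024-01-02T15:04:05Z",
//                            "articles":[{"title":"...","url":"...","source":"...","rating":7,"score":0.83}]}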
// ============================================================================
// ╻ ╻┏━╸╻  ┏━┓┏━╸┏━┓┏━┓
// ┣━┫┣╸ ┃  ┣━┛┣╸ ┣┳┛┗━┓
// ╹ ╹┗━╸┗━╸╹  ┗━╸╹┗╸┗━┛
// ============================================================================

func displayTitle(custom string) string {
    if custom != "" {
        return custom
    }
    return "ScholScan"
}

// extractTitleFromURL fetches the content from a URL and extracts the title from the HTML.
// Designed to be resilient: tries multiple title sources, handles various URL formats,
// and provides meaningful error feedback if extraction fails.
func extractTitleFromURL(rawURL string) (string, error) {
    if rawURL == "" {
        return "", fmt.Errorf("empty URL")
    }

    // Check if it's a DOI
    if strings.HasPrefix(rawURL, "10.") {
        // Convert DOI to URL
        rawURL = fmt.Sprintf("https://doi.org/%s", rawURL)
    } else if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
        rawURL = "https://" + rawURL
    }

    ctx, cancel := context.WithTimeout(context.Background(), core.DefaultContextTimeout)
    defer cancel()

    req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
    if err != nil {
        return "", fmt.Errorf("invalid URL: %w", err)
    }
    req.Header.Set("User-Agent", core.PoliteUserAgent)
    req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

    resp, err := core.DoRequestWithRetry(ctx, core.DefaultHTTPClient, req)
    if err != nil {
        return "", fmt.Errorf("failed to fetch URL: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
    }

    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return "", fmt.Errorf("failed to parse HTML: %w", err)
    }

    // Fallback chain: <title> → og:title → twitter:title → <h1>
    // Different sites populate these differently; trying multiple increases success rate.
    title := ""
    if t := doc.Find("title").Text(); t != "" {
        title = strings.TrimSpace(t)
    }
    if title == "" {
        if t, exists := doc.Find(`meta[property="og:title"]`).Attr("content"); exists && t != "" {
            title = strings.TrimSpace(t)
        }
    }
    if title == "" {
        if t, exists := doc.Find(`meta[name="twitter:title"]`).Attr("content"); exists && t != "" {
            title = strings.TrimSpace(t)
        }
    }
    if title == "" {
        if t := doc.Find("h1").First().Text(); t != "" {
            title = strings.TrimSpace(t)
        }
    }
    if title == "" {
        return "", fmt.Errorf("could not extract title from page")
    }

    // Clean up common title patterns
    reClean := regexp.MustCompile(`\s*\|\s*`)
    title = reClean.ReplaceAllString(title, "")
    rePub := regexp.MustCompile(`^[^|]*\|\s*`)
    title = rePub.ReplaceAllString(title, "")
    title = strings.TrimSpace(title)

    if len(title) < 10 {
        return "", fmt.Errorf("extracted title too short: %q", title)
    }
    return title, nil
}

// validateTitle checks that a title is suitable for scoring.
// Returns an error message string if invalid, empty string if valid.
func (c *ServeCommand) validateTitle(title string) string {
    if strings.TrimSpace(title) == "" {
        return "Title cannot be empty"
    }
    if len(title) > 1000 {
        return "Title too long (max 1000 characters)"
    }
    return ""
}

// renderResultsError renders the results template with an error message.
func (c *ServeCommand) renderResultsError(w http.ResponseWriter, errMsg, title string) {
    data := map[string]interface{}{
        "Page":          "tools",
        "IsScoreResult": true,
        "Error":         errMsg,
        "Title":         title,
        "PageTitle":     displayTitle(c.Title),
    }
    w.Header().Set("Content-Type", "text/html; charset=utf-8")
    if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
        http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
    }
}

// validateFeedURL checks that a feed URL is non-empty and has a valid format.
// Returns an error message string if invalid, empty string if valid.
func (c *ServeCommand) validateFeedURL(feedURL string) string {
    if feedURL == "" {
        return "Feed URL cannot be empty"
    }
    if _, err := url.Parse(feedURL); err != nil {
        return "Invalid URL format"
    }
    return ""
}

// renderScanResultsError renders the results template with an error for the scan operation.
func (c *ServeCommand) renderScanResultsError(w http.ResponseWriter, errMsg, feedURL string) {
    data := map[string]interface{}{
        "Page":         "tools",
        "IsScanResult": true,
        "Error":        errMsg,
        "FeedURL":      feedURL,
        "PageTitle":    displayTitle(c.Title),
    }
    w.Header().Set("Content-Type", "text/html; charset=utf-8")
    if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
        http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
    }
}

// escapeXML replaces the five predefined XML entities so titles and URLs can be
// embedded safely in the generated RSS. The ampersand must be replaced first.
func escapeXML(s string) string {
    s = strings.ReplaceAll(s, "&", "&amp;")
    s = strings.ReplaceAll(s, "<", "&lt;")
    s = strings.ReplaceAll(s, ">", "&gt;")
    s = strings.ReplaceAll(s, "\"", "&quot;")
    s = strings.ReplaceAll(s, "'", "&apos;")
    return s
}