-rw-r--r--  .gitignore                        33
-rw-r--r--  Containerfile                     11
-rw-r--r--  DESIGN.md                         81
-rw-r--r--  README.md                         37
-rw-r--r--  cmds/scan.go                     416
-rw-r--r--  cmds/serve.go                   1010
-rw-r--r--  cmds/templates/live-feed.html    158
-rw-r--r--  cmds/templates/results.html      279
-rw-r--r--  cmds/templates/tools.html        202
-rw-r--r--  cmds/train.go                    841
-rw-r--r--  cmds/train_test.go                66
-rw-r--r--  core/constants.go                 21
-rw-r--r--  core/http.go                     196
-rw-r--r--  core/ml.go                       427
-rw-r--r--  core/model.go                     20
-rw-r--r--  core/scoring.go                   14
-rw-r--r--  core/text.go                      36
-rw-r--r--  core/types.go                     84
-rw-r--r--  go.mod                            19
-rw-r--r--  go.sum                            96
-rw-r--r--  justfile                          39
-rw-r--r--  main.go                           83
22 files changed, 4169 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b57a04a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,33 @@
+scholscan
+
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+*.test
+
+*.out
+
+go.work
+
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+.DS_Store
+Thumbs.db
+
+data/
+!README.md
+
+*.log
+
+.env*
+config.*
+secrets.*
+
+# RSS world data file
+rss_world.txt
+*.kate-swp
diff --git a/Containerfile b/Containerfile
new file mode 100644
index 0000000..58f011f
--- /dev/null
+++ b/Containerfile
@@ -0,0 +1,11 @@
+# Copy & customize: mount model.json and rss_world.txt, set --title flag as needed
+FROM golang:1.25-alpine AS builder
+RUN apk add --no-cache git
+WORKDIR /build
+RUN git clone https://your-git-repo-url/scholscan.git .
+RUN go build -o scholscan .
+
+FROM alpine:latest
+COPY --from=builder /build/scholscan /app/scholscan
+WORKDIR /app
+ENTRYPOINT ["/app/scholscan"]
diff --git a/DESIGN.md b/DESIGN.md
new file mode 100644
index 0000000..dba3394
--- /dev/null
+++ b/DESIGN.md
@@ -0,0 +1,81 @@
+Scholscan Design
+=================
+
+Article filter that learns from positive examples, then filters RSS feeds automatically. Classifier uses TF-IDF on article titles plus logistic regression: fast, with no content scraping needed.
+
+Code Structure
+---------------
+
+main.go - Entry point, validates commands, dispatches
+
+cmds/
+ train.go - Load positive articles, fetch RSS as negatives, train model, output JSON
+ scan.go - Fetch articles from RSS, score with model, output filtered results
+ serve.go - HTTP server with background feed refresh, embedded web UI, RSS output
+
+core/
+ types.go - Article struct holds article data, Config struct for app settings, Command interface
+ ml.go - TF-IDF implementation with n-gram support, logistic regression classifier (sketched below)
+ model.go - ModelEnvelope for serialized models, model save/load functions
+ scoring.go - Score conversion from raw 0-1 to display 1-10 scale
+ text.go - HTML content extraction, word tokenization, text cleaning
+ http.go - HTTP client with retries, timeouts, user agents
+ constants.go - Default timeouts, thresholds, chunk sizes
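+
+For reference, the TF-IDF weighting in ml.go follows the standard form. A minimal
+sketch (hypothetical names, not the actual ml.go API):
+
+```
+package main
+
+import (
+	"fmt"
+	"math"
+)
+
+// tfidf sketches the standard weighting: term frequency in the document,
+// damped by how many documents contain the term. docFreq >= 1 is assumed
+// (the MinDF filter guarantees it).
+func tfidf(termCount, docLen, numDocs, docFreq int) float64 {
+	tf := float64(termCount) / float64(docLen)
+	idf := math.Log(float64(numDocs) / float64(docFreq))
+	return tf * idf
+}
+
+func main() {
+	// "graph" appears twice in a 10-token title, in 3 of 1000 documents.
+	fmt.Printf("%.4f\n", tfidf(2, 10, 1000, 3))
+}
+```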
+
+Training Flow
+-------------
+
+Command loads positive examples from JSONL file. Reads RSS URLs from text file (one per line, # comments allowed). Fetches RSS feeds in parallel, removes any articles matching positive URLs. Trains TF-IDF vectorizer then logistic regression on balanced dataset. Finds optimal threshold on validation split using Youden's J metric. Outputs complete model JSON to stdout.
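+
+For reference, the threshold search can be sketched like this (hypothetical names,
+not the actual train.go code):
+
+```
+package main
+
+import "fmt"
+
+// bestThreshold sketches Youden's J selection: scan candidate thresholds and
+// keep the one maximizing sensitivity + specificity - 1 on validation data.
+func bestThreshold(scores []float64, labels []bool) float64 {
+	best, bestJ := 0.5, -1.0
+	for t := 0.05; t <= 0.95; t += 0.05 {
+		var tp, fp, fn, tn float64
+		for i, s := range scores {
+			switch {
+			case s >= t && labels[i]:
+				tp++
+			case s >= t && !labels[i]:
+				fp++
+			case labels[i]:
+				fn++
+			default:
+				tn++
+			}
+		}
+		if tp+fn == 0 || fp+tn == 0 {
+			continue // degenerate split: no positives or no negatives
+		}
+		if j := tp/(tp+fn) - fp/(fp+tn); j > bestJ {
+			bestJ, best = j, t
+		}
+	}
+	return best
+}
+
+func main() {
+	scores := []float64{0.9, 0.8, 0.3, 0.2}
+	labels := []bool{true, true, false, false}
+	fmt.Printf("threshold = %.2f\n", bestThreshold(scores, labels))
+}
+```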
+
+Scanning Flow
+-------------
+
+Command fetches the specified RSS feed, scores each article with the trained model. Articles scoring at or above the threshold are output as JSON Lines (same format as input). Includes enrichment metadata if available. Verbose mode shows fetch and scoring progress on stderr.
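+
+An output line might look like this (field names are illustrative; the actual
+JSON tags live on the Article struct in core/types.go):
+
+```
+{"title":"Sparse Attention for Long Documents","url":"https://example.org/paper","score":0.87}
+```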
+
+Server Flow
+-----------
+
+Server loads model and RSS world feed list on startup. A single background goroutine walks the feed list on a configurable interval (default 24h), skipping feeds that fail. Results cached in memory with RWMutex. HTTP handlers serve both HTML UI and JSON/RSS API endpoints.
+
+API Endpoints
+-------------
+
+### HTML Pages
+- GET `/` - Redirect to /live-feed
+- GET `/live-feed` - Filtered articles web interface (server-rendered)
+- GET `/tools` - Manual article scoring interface (server-rendered)
+
+### HTTP Handlers
+- GET `/api/filtered/feed` - Articles as JSON array (for external consumption)
+- GET `/api/health` - Health check; JSON with status, model_loaded, timestamp
+- POST `/score` - Score single article via form post
+- POST `/scan` - Scan RSS feed via form post
+
+### RSS Output
+- GET `/api/filtered/rss` - Scored articles as RSS feed
+
+Model Details
+-------------
+
+Vectorizer uses unigrams plus bigrams. Minimum document frequency 2 (removes typos), maximum 80% (removes stopwords). Vocabulary capped at 50000 terms. Logistic regression with L2 regularization lambda=0.001, learning rate 0.5, 500 iterations. Validation split 80/20 with seed 42 for reproducible results. Threshold selected using Youden's J to balance false positives against false negatives.
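+
+Prediction itself is a dot product through a sigmoid. A minimal sketch of the
+shape of core.PredictScore (assumption: dense, equal-length feature and weight
+vectors; the real function may handle bias and sparsity differently):
+
+```
+package main
+
+import (
+	"fmt"
+	"math"
+)
+
+// predict sketches logistic-regression scoring: a weighted sum of TF-IDF
+// features squashed into (0,1) by the logistic function.
+func predict(features, weights []float64) float64 {
+	var z float64
+	for i, x := range features {
+		z += x * weights[i]
+	}
+	return 1.0 / (1.0 + math.Exp(-z))
+}
+
+func main() {
+	fmt.Printf("%.3f\n", predict([]float64{0.4, 0.1}, []float64{2.0, -1.5}))
+}
+```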
+
+Server Implementation
+---------------------
+
+HTML templates embedded in binary using embed.FS. All rendering is server-side with no JavaScript. Tools page uses standard HTML forms with POST submissions. Live feed displays cached background results with server-side rendering. Background refresh runs in a single goroutine that fetches feeds sequentially, logging and skipping failures. Cached results carry the time of the last refresh. RSS output repackages filtered articles into RSS format for consumption.
+
+Key Implementation Notes
+------------------------
+
+- Articles processed in 50-item chunks for memory efficiency
+- File paths validated against directory traversal attacks
+- HTTP requests use custom polite user agent with email contact
+- RSS parsing handles both RSS and Atom via gofeed library
+- TF-IDF vectorizer stores vocabulary as sorted string array for deterministic ordering
+- Model version field allows future format changes
+- Background refresh errors logged but don't crash server
+
+External Dependencies
+---------------------
+
+github.com/mmcdole/gofeed for RSS/Atom parsing and github.com/PuerkitoBio/goquery for HTML title extraction. All other functionality uses the Go standard library.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..870bf34
--- /dev/null
+++ b/README.md
@@ -0,0 +1,37 @@
+# Scholscan
+
+Filters academic articles using TF-IDF on titles plus logistic regression.
+
+## Build
+```
+go build -o scholscan .
+```
+
+## Usage
+```
+# Train model from articles you like
+./scholscan train positives.jsonl --rss-feeds feeds.txt > model.json
+
+# Score new RSS feed
+./scholscan scan --url RSS_URL --model model.json > results.jsonl
+
+# Run web server
+./scholscan serve --port 8080 --model model.json --rss-world rss_world.txt
+```
+
+## Endpoints
+
+- GET `/` - redirect to live feed
+- GET `/live-feed` - filtered articles web UI
+- GET `/tools` - score individual articles
+- POST `/score` - API for scoring titles
+- POST `/scan` - API for scanning RSS
+- GET `/api/filtered/feed` - JSON feed
+- GET `/api/filtered/rss` - RSS feed
+- GET `/api/health` - health check
+
+## Model settings
+
+- TF-IDF: unigrams + bigrams, MinDF=2, MaxDF=0.8
+- Logistic regression: λ=0.001, L2 regularization
+- Class balancing: downsample majority to 1:1 ratio \ No newline at end of file
diff --git a/cmds/scan.go b/cmds/scan.go
new file mode 100644
index 0000000..789157c
--- /dev/null
+++ b/cmds/scan.go
@@ -0,0 +1,416 @@
+// Scan command: filters articles using trained model.
+//
+// Takes articles from an RSS feed, plain text, or JSONL. Scores them and
+// outputs those passing the threshold, in batches (default 50) so results stream.
+package cmds
+
+import (
+ "bufio"
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strings"
+
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+
+// ============================================================================
+// ┏━╸┏━┓┏┳┓┏┳┓┏━┓┏┓╻╺┳┓
+// ┃ ┃ ┃┃┃┃┃┃┃┣━┫┃┗┫ ┃┃
+// ┗━╸┗━┛╹ ╹╹ ╹╹ ╹╹ ╹╺┻┛
+// ============================================================================
+
+
+// ScanCommand scores articles with a trained model and outputs those at or above the threshold.
+type ScanCommand struct {
+ URL string
+ FromText bool
+ FromArticles bool
+
+ ModelPath string
+ Threshold string
+
+ MinTitleLength int
+ ChunkSize int
+
+ EventsOut string
+ MetricsOut string
+ Verbose bool
+}
+
+func (c *ScanCommand) Name() string { return "scan" }
+
+func (c *ScanCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+ fmt.Fprint(fs.Output(), `Usage: scholscan scan [options]
+
+Fetches articles, scores with model, outputs matched (>= threshold) ones.
+
+Source options (exactly one required):
+ --url <feed_url> Fetch articles from RSS/Atom feed
+ --from-text Extract URLs from text on stdin
+ --from-articles Use Article JSONL from stdin directly
+
+Model and filtering:
+ --model <path> Path to trained model JSON file (required)
+ --threshold <float> Score threshold (if not provided, uses model's recommended threshold)
+
+Other options:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Examples:
+ scholscan scan --url "http://some.blog/rss.xml" --model model.json > interesting.jsonl
+ echo "see https://example.com" | scholscan scan --from-text --model model.json
+ cat articles.jsonl | scholscan scan --from-articles --model model.json
+`)
+ }
+
+ fs.StringVar(&c.URL, "url", "", "RSS/Atom feed URL to fetch")
+ fs.BoolVar(&c.FromText, "from-text", false, "Extract URLs from text on stdin")
+ fs.BoolVar(&c.FromArticles, "from-articles", false, "Use Article JSONL from stdin")
+ fs.StringVar(&c.ModelPath, "model", "", "Path to trained model JSON file (required)")
+ fs.StringVar(&c.Threshold, "threshold", "", "Score threshold for filtering (if not provided, uses model's recommended threshold)")
+ fs.IntVar(&c.MinTitleLength, "min-title-length", core.MinTitleLength, "Minimum title length to consider valid")
+ fs.IntVar(&c.ChunkSize, "chunk-size", core.DefaultChunkSize, "Number of articles to process in each batch")
+ fs.StringVar(&c.EventsOut, "events-out", "events.jsonl", "Write per-article events to a JSONL file")
+ fs.StringVar(&c.MetricsOut, "metrics-out", "metrics.json", "Write summary metrics to a JSON file")
+ fs.BoolVar(&c.Verbose, "verbose", false, "Show progress information")
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ }
+
+ if fs.NArg() != 0 {
+ return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
+ }
+
+ // one src opt required
+ sourceCount := 0
+ if c.URL != "" {
+ sourceCount++
+ }
+ if c.FromText {
+ sourceCount++
+ }
+ if c.FromArticles {
+ sourceCount++
+ }
+
+ if sourceCount == 0 {
+ return fmt.Errorf("exactly one source option must be specified: --url, --from-text, or --from-articles")
+ }
+ if sourceCount > 1 {
+ return fmt.Errorf("only one source option may be specified: --url, --from-text, or --from-articles")
+ }
+
+ if c.ModelPath == "" {
+ return fmt.Errorf("--model flag is required")
+ }
+
+ // prevent dir traversal
+ if strings.Contains(filepath.Clean(c.ModelPath), "..") {
+ return fmt.Errorf("invalid model path: directory traversal not allowed")
+ }
+
+ if c.URL != "" {
+ if _, err := url.Parse(c.URL); err != nil {
+ return fmt.Errorf("invalid URL format: %w", err)
+ }
+ }
+
+ return nil
+}
+
+// Run runs the scan: load the model, decide on a threshold, get articles, then score them in chunks.
+// We bail out early on config problems but try to keep going even if some articles fail to fetch.
+func (c *ScanCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ if c.Verbose {
+ log.SetOutput(os.Stderr)
+ log.Println("Starting scan workflow...")
+ log.Printf("Source: %v", c.getSourceDescription())
+ log.Printf("Model: %s", c.ModelPath)
+ }
+
+ model, err := c.loadModel()
+ if err != nil {
+ return fmt.Errorf("failed to load model: %w", err)
+ }
+
+ threshold, err := c.getThreshold(model)
+ if err != nil {
+ return fmt.Errorf("failed to determine threshold: %w", err)
+ }
+
+ if c.Verbose {
+ log.Printf("Using threshold: %.3f", threshold)
+ }
+
+ var articles []*core.Article
+ if c.FromArticles {
+ articles, err = c.readArticlesFromStdin(stdin)
+ } else {
+ articles, err = c.fetchArticles(stdin)
+ }
+ if err != nil {
+ return fmt.Errorf("failed to get articles: %w", err)
+ }
+
+ if c.Verbose {
+ log.Printf("Processing %d articles", len(articles))
+ }
+
+ // process articles in chunks
+ return c.processArticles(articles, model, threshold, stdout)
+}
+
+
+// ============================================================================
+// ┏┳┓┏━┓╺┳┓┏━╸╻ ┏┓ ┏━╸┏━┓┏┓╻┏━╸╻┏━╸
+// ┃┃┃┃ ┃ ┃┃┣╸ ┃ ┃╺╋╸ ┃ ┃ ┃┃┗┫┣╸ ┃┃╺┓
+// ╹ ╹┗━┛╺┻┛┗━╸┗━╸ ┗━┛ ┗━╸┗━┛╹ ╹╹ ╹┗━┛
+// ============================================================================
+
+
+
+func (c *ScanCommand) getSourceDescription() string {
+ if c.URL != "" {
+ return fmt.Sprintf("RSS feed: %s", c.URL)
+ }
+ if c.FromText {
+ return "text from stdin"
+ }
+ if c.FromArticles {
+ return "articles from stdin"
+ }
+ return "unknown"
+}
+
+// loadModel reads and parses the model JSON file.
+// The envelope contains weights, vocabulary, and optionally a recommended threshold.
+func (c *ScanCommand) loadModel() (*core.ModelEnvelope, error) {
+ f, err := os.Open(c.ModelPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
+ }
+ defer f.Close()
+
+ var model core.ModelEnvelope
+ if err := json.NewDecoder(f).Decode(&model); err != nil {
+ return nil, fmt.Errorf("failed to decode model: %w", err)
+ }
+
+ return &model, nil
+}
+
+func (c *ScanCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
+ if c.Threshold != "" {
+ threshold, err := strconv.ParseFloat(c.Threshold, 64)
+ if err != nil {
+ return 0, fmt.Errorf("invalid threshold %q: %w", c.Threshold, err)
+ }
+ return threshold, nil
+ }
+
+ if model.Meta != nil {
+ if threshold, ok := model.Meta["recommended_threshold"].(float64); ok {
+ return threshold, nil
+ }
+ }
+
+ return core.DefaultScoreThreshold, nil
+}
+
+// ============================================================================
+// ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸ ┏━┓┏━┓┏━╸┏━┓
+// ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓┣┳┛┃ ┗━┓
+// ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸ ┗━┛╹┗╸┗━╸┗━┛
+// ============================================================================
+
+
+func (c *ScanCommand) fetchArticles(stdin io.Reader) ([]*core.Article, error) {
+ if c.FromText {
+ return c.extractURLsFromText(stdin)
+ }
+ if c.URL != "" {
+ return c.fetchRSSFeed(c.URL)
+ }
+ return nil, fmt.Errorf("no valid source specified")
+}
+
+// extractURLsFromText pulls URLs from plain text on stdin.
+// Articles get placeholder titles built from the URL; since the model scores
+// titles, these placeholders are what actually gets scored.
+func (c *ScanCommand) extractURLsFromText(stdin io.Reader) ([]*core.Article, error) {
+ var urls []string
+ s := bufio.NewScanner(stdin)
+ for s.Scan() {
+ line := s.Text()
+ // url extraction
+ fields := strings.Fields(line)
+ for _, field := range fields {
+ if strings.HasPrefix(field, "http://") || strings.HasPrefix(field, "https://") {
+ urls = append(urls, field)
+ }
+ }
+ }
+
+ // create Article objs for URLs (u, not url, to avoid shadowing the net/url package)
+ articles := make([]*core.Article, len(urls))
+ for i, u := range urls {
+ articles[i] = &core.Article{
+ URL: u,
+ Title: fmt.Sprintf("Article from %s", u),
+ Content: "",
+ }
+ }
+
+ return articles, s.Err()
+}
+
+// fetchRSSFeed fetches and parses a single RSS feed, bounded by core.DefaultHTTPTimeout.
+// We skip articles with short titles since they're usually noise or truncated.
+func (c *ScanCommand) fetchRSSFeed(url string) ([]*core.Article, error) {
+ client := &http.Client{Timeout: core.DefaultHTTPTimeout}
+
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
+ defer cancel()
+
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+ if err != nil {
+ return nil, fmt.Errorf("error building request: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+ resp, err := client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", url, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", url, err)
+ }
+
+ // parse feed
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(strings.NewReader(string(body)))
+ if err != nil {
+ return nil, fmt.Errorf("error parsing feed from %s: %w", url, err)
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ article := &core.Article{
+ URL: item.Link,
+ Title: strings.TrimSpace(item.Title),
+ }
+
+ if len(article.Title) >= c.MinTitleLength {
+ articles = append(articles, article)
+ }
+ }
+
+ return articles, nil
+}
+
+// readArticlesFromStdin reads Article objects from JSONL on stdin.
+// Malformed lines are skipped to allow partial processing of corrupted input;
+// a json.Decoder would stall on the first bad line, so we scan line by line.
+func (c *ScanCommand) readArticlesFromStdin(stdin io.Reader) ([]*core.Article, error) {
+ var articles []*core.Article
+ scanner := bufio.NewScanner(stdin)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line == "" {
+ continue
+ }
+ var article core.Article
+ if err := json.Unmarshal([]byte(line), &article); err != nil {
+ continue // skip malformed line
+ }
+ if len(article.Title) >= c.MinTitleLength {
+ articles = append(articles, &article)
+ }
+ }
+ return articles, scanner.Err()
+}
+
+
+
+// ============================================================================
+// ┏━┓┏━┓┏━┓┏━╸┏━╸┏━┓┏━┓ ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸┏━┓
+// ┣━┛┣┳┛┃ ┃┃ ┣╸ ┗━┓┗━┓ ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓
+// ╹ ╹┗╸┗━┛┗━╸┗━╸┗━┛┗━┛ ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸┗━┛
+// ============================================================================
+
+
+// processArticles handles scoring and filtering in batches to keep memory usage predictable.
+// Scoring errors don't crash the process - we log them and continue with the next article.
+func (c *ScanCommand) processArticles(articles []*core.Article, model *core.ModelEnvelope, threshold float64, stdout io.Writer) error {
+ vectorizer := core.CreateVectorizerFromModel(model)
+
+ encoder := json.NewEncoder(stdout)
+
+ // process each batch
+ for i := 0; i < len(articles); i += c.ChunkSize {
+ end := i + c.ChunkSize
+ if end > len(articles) {
+ end = len(articles)
+ }
+
+ chunk := articles[i:end]
+ if c.Verbose {
+ log.Printf("Processing chunk %d-%d of %d articles", i+1, end, len(articles))
+ }
+
+ // calc score for batch
+ docs := make([]string, len(chunk))
+ for j, article := range chunk {
+ docs[j] = strings.TrimSpace(article.Title)
+ }
+
+ vectors := vectorizer.Transform(docs)
+ scores := make([]float64, len(chunk))
+
+ for j, vector := range vectors {
+ score, err := core.PredictScore(vector, model.Weights)
+ if err != nil {
+ log.Printf("Error computing score for article %d: %v", i+j, err)
+ scores[j] = 0.0
+ } else {
+ scores[j] = score
+ }
+ }
+
+ for j, article := range chunk {
+ score := scores[j]
+ article.Score = &score
+
+ if score >= threshold {
+ if err := encoder.Encode(article); err != nil {
+ log.Printf("Error encoding article: %v", err)
+ }
+ }
+ }
+ }
+
+ if c.Verbose {
+ log.Println("Scan complete")
+ }
+
+ return nil
+}
diff --git a/cmds/serve.go b/cmds/serve.go
new file mode 100644
index 0000000..92aa64c
--- /dev/null
+++ b/cmds/serve.go
@@ -0,0 +1,1010 @@
+// Serve command: HTTP server for web UI and APIs.
+//
+// Two main flows: live-feed (cached + background refresh) and tools (on-demand scoring).
+// Live-feed rescans all configured RSS feeds on a timer (default 24h), caches results,
+// serves filtered articles via web UI and JSON/RSS APIs.
+// Tools provides real-time /score (single title) and /scan (ad-hoc feed) endpoints.
+// Background refresh continues despite individual feed failures; RWMutex allows
+// many concurrent readers with exclusive writer updates.
+// Templates are embedded for single-binary deployment.
+package cmds
+
+import (
+ "bufio"
+ "context"
+ "embed"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "html/template"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+ "os/signal"
+ "path/filepath"
+ "regexp"
+ "sort"
+ "strings"
+ "sync"
+ "syscall"
+ "time"
+
+ "github.com/PuerkitoBio/goquery"
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+//go:embed templates/*.html
+var templateFS embed.FS
+
+// ============================================================================
+// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
+// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
+// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
+// ============================================================================
+
+type ServeCommand struct {
+ Port int
+ RSSWorldPath string
+ RefreshInterval string
+ ModelPath string
+ Title string
+
+ // Parsed interval
+ refreshInterval time.Duration
+ // Loaded model (cached)
+ model *core.ModelEnvelope
+ modelMu sync.RWMutex
+ // Cached filtered RSS results and timestamp.
+ // RWMutex allows many concurrent readers (HTTP handlers) with exclusive writer (background refresh).
+ filteredResults []*core.Article
+ filteredResultsTime time.Time
+ resultsMu sync.RWMutex
+ // Loaded templates
+ tmpl *template.Template
+}
+
+func (c *ServeCommand) Name() string { return "serve" }
+
+// Init configures the serve command with robust input validation.
+// Prevents directory traversal, validates paths, and sets sensible defaults.
+// Fails fast on bad input so the server starts from a known-good configuration.
+func (c *ServeCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+ fmt.Fprint(fs.Output(), `Usage: scholscan serve [options]
+
+ Start HTTP server for filtered RSS and scoring web UI.
+
+ Flags:
+ `)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+ Examples:
+ scholscan serve --port 8080 --rss-world rss_world.txt --model model.json
+ scholscan serve --refresh-interval 24h --model ./model.json --rss-world feeds.txt
+ `)
+ }
+
+ fs.IntVar(&c.Port, "port", 8080, "Port to listen on")
+ fs.StringVar(&c.RSSWorldPath, "rss-world", "rss_world.txt", "Path to RSS world file (one feed URL per line)")
+ fs.StringVar(&c.RefreshInterval, "refresh-interval", "24h", "Interval for background rescans (e.g., 24h, 1h)")
+ fs.StringVar(&c.ModelPath, "model", "model.json", "Path to trained model JSON file")
+ fs.StringVar(&c.Title, "title", "", "Custom title for the web interface")
+
+ if err := fs.Parse(args); err != nil {
+ return err
+ }
+
+ if fs.NArg() != 0 {
+ return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
+ }
+
+ // Parse refresh interval
+ interval, err := time.ParseDuration(c.RefreshInterval)
+ if err != nil {
+ return fmt.Errorf("invalid refresh-interval %q: %w", c.RefreshInterval, err)
+ }
+ c.refreshInterval = interval
+
+ if strings.Contains(filepath.Clean(c.RSSWorldPath), "..") {
+ return fmt.Errorf("invalid rss-world path: directory traversal not allowed")
+ }
+ if strings.Contains(filepath.Clean(c.ModelPath), "..") {
+ return fmt.Errorf("invalid model path: directory traversal not allowed")
+ }
+
+ return nil
+}
+
+func (c *ServeCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ log.Printf("Starting scholscan server on port %d", c.Port)
+
+ // Initialize filteredResultsTime to server start time
+ c.resultsMu.Lock()
+ c.filteredResultsTime = time.Now()
+ c.resultsMu.Unlock()
+
+ // Load templates at startup
+ tmpl, err := template.ParseFS(templateFS, "templates/*.html")
+ if err != nil {
+ return fmt.Errorf("failed to parse templates: %w", err)
+ }
+ c.tmpl = tmpl
+ log.Printf("Templates loaded successfully")
+
+ // Load model at startup
+ model, err := c.loadModel()
+ if err != nil {
+ return fmt.Errorf("failed to load model at startup: %w", err)
+ }
+ c.modelMu.Lock()
+ c.model = model
+ c.modelMu.Unlock()
+
+ log.Printf("Model loaded successfully")
+
+ // Start background ticker for periodic refresh
+ ticker := time.NewTicker(c.refreshInterval)
+ go c.backgroundRefresh(ticker)
+
+ // Perform initial scan asynchronously
+ go func() {
+ log.Println("Starting initial feed scan...")
+ if err := c.refreshFilteredResults(); err != nil {
+ log.Printf("Warning: initial scan failed: %v", err)
+ } else {
+ c.resultsMu.RLock()
+ count := len(c.filteredResults)
+ c.resultsMu.RUnlock()
+ log.Printf("Initial scan complete, %d articles filtered", count)
+ }
+ }()
+
+ // Setup HTTP handlers
+ http.HandleFunc("/", c.handleRoot)
+ http.HandleFunc("/live-feed", c.handleLiveFeed)
+ http.HandleFunc("/tools", c.handleTools)
+ http.HandleFunc("/score", c.handleScore)
+ http.HandleFunc("/scan", c.handleScan)
+ http.HandleFunc("/api/filtered/feed", c.handleFilteredFeed)
+ http.HandleFunc("/api/filtered/rss", c.handleFilteredRSS)
+ http.HandleFunc("/api/health", c.handleHealth)
+
+ // Setup server with graceful shutdown
+ server := &http.Server{
+ Addr: fmt.Sprintf(":%d", c.Port),
+ Handler: http.DefaultServeMux,
+ ReadTimeout: core.DefaultReadTimeout,
+ WriteTimeout: core.DefaultWriteTimeout,
+ IdleTimeout: core.DefaultIdleTimeout,
+ }
+
+ // Handle shutdown signals
+ sigChan := make(chan os.Signal, 1)
+ signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+
+ go func() {
+ <-sigChan
+ log.Println("Shutdown signal received")
+ ticker.Stop()
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultShutdownTimeout)
+ defer cancel()
+ if err := server.Shutdown(ctx); err != nil {
+ log.Printf("Server shutdown error: %v", err)
+ }
+ }()
+
+ log.Printf("Server listening on http://localhost:%d", c.Port)
+ if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+ return fmt.Errorf("server error: %w", err)
+ }
+
+ return nil
+}
+
+// ============================================================================
+// ┏━╸┏━┓┏━┓┏━╸ ╻ ┏━┓┏━╸╻┏━╸
+// ┃ ┃ ┃┣┳┛┣╸ ┃ ┃ ┃┃╺┓┃┃
+// ┗━╸┗━┛╹┗╸┗━╸ ┗━╸┗━┛┗━┛╹┗━╸
+// ============================================================================
+
+func (c *ServeCommand) loadModel() (*core.ModelEnvelope, error) {
+ f, err := os.Open(c.ModelPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
+ }
+ defer f.Close()
+
+ var model core.ModelEnvelope
+ if err := json.NewDecoder(f).Decode(&model); err != nil {
+ return nil, fmt.Errorf("failed to decode model: %w", err)
+ }
+
+ return &model, nil
+}
+
+func (c *ServeCommand) scoreArticle(article *core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope) float64 {
+ docs := []string{strings.TrimSpace(article.Title)}
+ vectors := vectorizer.Transform(docs)
+
+ if len(vectors) == 0 || len(vectors[0]) == 0 {
+ return 0.0
+ }
+
+ score, err := core.PredictScore(vectors[0], model.Weights)
+ if err != nil {
+ // Return 0.0 on error (below threshold). Malformed articles don't break the display,
+ // they just get filtered out. Log the error for diagnostics.
+ log.Printf("Error scoring article: %v", err)
+ return 0.0
+ }
+
+ return score
+}
+
+func (c *ServeCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
+ if model.Meta != nil {
+ if threshold, ok := model.Meta["recommended_threshold"].(float64); ok {
+ return threshold, nil
+ }
+ }
+ return core.DefaultScoreThreshold, nil
+}
+
+// scoreAndFormatArticles scores a list of articles and returns them formatted for templates.
+// Articles are scored using the model and vectorizer, then returned with human-readable ratings.
+func (c *ServeCommand) scoreAndFormatArticles(articles []*core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope, threshold float64) []map[string]interface{} {
+ result := make([]map[string]interface{}, 0, len(articles))
+ for _, article := range articles {
+ score := c.scoreArticle(article, vectorizer, model)
+ rating := core.ScoreToScale(score, threshold)
+
+ result = append(result, map[string]interface{}{
+ "Title": article.Title,
+ "URL": article.URL,
+ "Source": article.Source,
+ "Rating": rating,
+ "Score": score,
+ })
+ }
+ return result
+}
+
+// ============================================================================
+// ┏━┓┏━┓┏━┓ ┏━┓╺┳╸╻ ╻┏━╸┏━╸
+// ┣┳┛┗━┓┗━┓ ┗━┓ ┃ ┃ ┃┣╸ ┣╸
+// ╹┗╸┗━┛┗━┛ ┗━┛ ╹ ┗━┛╹ ╹
+// ============================================================================
+
+func (c *ServeCommand) readRSSWorldFeeds() ([]string, error) {
+ f, err := os.Open(c.RSSWorldPath)
+ if err != nil {
+ return nil, fmt.Errorf("failed to open rss_world file %s: %w", c.RSSWorldPath, err)
+ }
+ defer f.Close()
+
+ var feeds []string
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ feeds = append(feeds, line)
+ }
+ }
+
+ if err := scanner.Err(); err != nil {
+ return nil, fmt.Errorf("error reading rss_world file: %w", err)
+ }
+
+ return feeds, nil
+}
+
+func (c *ServeCommand) refreshFilteredResults() error {
+ feeds, err := c.readRSSWorldFeeds()
+ if err != nil {
+ return err
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ return fmt.Errorf("model not loaded")
+ }
+
+ // Scan all feeds. Continue on individual feed failures to maximize results.
+ // RSS feeds are often flaky; one down shouldn't prevent others from being processed.
+ var allArticles []*core.Article
+ for _, feed := range feeds {
+ articles, err := c.fetchRSSFeed(feed)
+ if err != nil {
+ log.Printf("Warning: failed to fetch feed %s: %v", feed, err)
+ continue
+ }
+ allArticles = append(allArticles, articles...)
+ }
+
+ // Score and filter articles
+ threshold, err := c.getThreshold(model)
+ if err != nil {
+ return err
+ }
+
+ vectorizer := core.CreateVectorizerFromModel(model)
+
+ filtered := make([]*core.Article, 0, len(allArticles))
+ for _, article := range allArticles {
+ score := c.scoreArticle(article, vectorizer, model)
+ if score >= threshold {
+ // Create a copy with score to avoid reference issues
+ articleCopy := *article
+ articleCopy.Score = &score
+ filtered = append(filtered, &articleCopy)
+ }
+ }
+
+ c.resultsMu.Lock()
+ c.filteredResults = filtered
+ c.filteredResultsTime = time.Now()
+ c.resultsMu.Unlock()
+
+ return nil
+}
+
+// backgroundRefresh runs in a goroutine, rescanning all RSS feeds on interval.
+// Failures in individual feeds don't affect others - we log and continue.
+func (c *ServeCommand) backgroundRefresh(ticker *time.Ticker) {
+ for range ticker.C {
+ log.Println("Background refresh started")
+ if err := c.refreshFilteredResults(); err != nil {
+ log.Printf("Background refresh error (continuing): %v", err)
+ } else {
+ c.resultsMu.RLock()
+ count := len(c.filteredResults)
+ c.resultsMu.RUnlock()
+ log.Printf("Background refresh complete, %d articles filtered", count)
+ }
+ }
+}
+
+func (c *ServeCommand) fetchRSSFeed(url string) ([]*core.Article, error) {
+ client := &http.Client{Timeout: core.DefaultHTTPTimeout}
+
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
+ defer cancel()
+
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+ if err != nil {
+ return nil, fmt.Errorf("error building request: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+ resp, err := client.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", url, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", url, err)
+ }
+
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(strings.NewReader(string(body)))
+ if err != nil {
+ return nil, fmt.Errorf("error parsing feed from %s: %w", url, err)
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ article := &core.Article{
+ URL: item.Link,
+ Title: strings.TrimSpace(item.Title),
+ Source: feed.Title,
+ }
+
+ if item.PublishedParsed != nil {
+ article.PublishedAt = item.PublishedParsed
+ }
+
+ if len(article.Title) >= core.MinTitleLength {
+ articles = append(articles, article)
+ }
+ }
+
+ return articles, nil
+}
+
+// ============================================================================
+// ╻ ╻┏━╸┏┓ ╻ ╻╻
+// ┃╻┃┣╸ ┣┻┓ ┃ ┃┃
+// ┗┻┛┗━╸┗━┛ ┗━┛╹
+// ============================================================================
+
+func (c *ServeCommand) handleRoot(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/" {
+ http.NotFound(w, r)
+ return
+ }
+
+ // Redirect to live feed
+ http.Redirect(w, r, "/live-feed", http.StatusMovedPermanently)
+}
+
+func (c *ServeCommand) handleLiveFeed(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ resultsTime := c.filteredResultsTime
+ c.resultsMu.RUnlock()
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ threshold, _ := c.getThreshold(model)
+
+ // Parse filter parameter (day, week, or all)
+ filter := r.URL.Query().Get("filter")
+ if filter == "" {
+ filter = "all"
+ }
+
+ // Filter articles by date if needed
+ now := time.Now()
+ filtered := articles
+ if filter == "day" || filter == "week" {
+ var cutoff time.Time
+ if filter == "day" {
+ cutoff = now.Add(-24 * time.Hour)
+ } else if filter == "week" {
+ cutoff = now.Add(-7 * 24 * time.Hour)
+ }
+
+ filtered = make([]*core.Article, 0, len(articles))
+ for _, article := range articles {
+ // Always include articles without PublishedAt
+ if article.PublishedAt == nil || article.PublishedAt.After(cutoff) {
+ filtered = append(filtered, article)
+ }
+ }
+ }
+
+ // Convert articles to template format
+ type TemplateArticle struct {
+ Title string
+ URL string
+ Source string
+ Rating int
+ Score float64
+ PublishedAt string
+ }
+
+ templateArticles := make([]TemplateArticle, 0, len(filtered))
+ for _, article := range filtered {
+ score := 0.0
+ if article.Score != nil {
+ score = *article.Score
+ }
+ rating := core.ScoreToScale(score, threshold)
+
+ publishedAt := ""
+ if article.PublishedAt != nil {
+ publishedAt = article.PublishedAt.Format("2006-01-02")
+ }
+
+ templateArticles = append(templateArticles, TemplateArticle{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ PublishedAt: publishedAt,
+ })
+ }
+
+ // Sort articles by score (highest first)
+ sort.Slice(templateArticles, func(i, j int) bool {
+ return templateArticles[i].Score > templateArticles[j].Score
+ })
+
+ data := map[string]interface{}{
+ "Page": "live-feed",
+ "Articles": templateArticles,
+ "Threshold": threshold,
+ "UpdatedAt": resultsTime.Format("2006-01-02 15:04:05"),
+ "Filter": filter,
+ "Title": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "live-feed", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleTools(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "Title": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "tools", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleScore(w http.ResponseWriter, r *http.Request) {
+ if r.Method == http.MethodGet {
+ c.handleTools(w, r)
+ return
+ }
+
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ if err := r.ParseForm(); err != nil {
+ http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
+ return
+ }
+
+ title := strings.TrimSpace(r.FormValue("title"))
+ url := strings.TrimSpace(r.FormValue("url"))
+
+ // If URL provided, fetch and extract title from it; otherwise use provided title.
+ if url != "" {
+ extractedTitle, err := extractTitleFromURL(url)
+ if err != nil {
+ c.renderResultsError(w, fmt.Sprintf("Failed to extract title from URL: %v", err), title)
+ return
+ }
+ title = extractedTitle
+ }
+
+ // Validate input before scoring
+ if valErr := c.validateTitle(title); valErr != "" {
+ c.renderResultsError(w, valErr, title)
+ return
+ }
+
+ vectorizer := core.CreateVectorizerFromModel(model)
+ article := &core.Article{Title: title}
+ score := c.scoreArticle(article, vectorizer, model)
+
+ threshold, _ := c.getThreshold(model)
+ rating := core.ScoreToScale(score, threshold)
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScoreResult": true,
+ "Title": title,
+ "Rating": rating,
+ "Score": score,
+ "Threshold": threshold,
+ "PageTitle": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleScan(w http.ResponseWriter, r *http.Request) {
+ if r.Method == http.MethodGet {
+ c.handleTools(w, r)
+ return
+ }
+
+ if r.Method != http.MethodPost {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ if model == nil {
+ http.Error(w, "Model not loaded", http.StatusInternalServerError)
+ return
+ }
+
+ if err := r.ParseForm(); err != nil {
+ http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest)
+ return
+ }
+
+ feedURL := strings.TrimSpace(r.FormValue("feed_url"))
+
+ // Validate and fetch the feed
+ if valErr := c.validateFeedURL(feedURL); valErr != "" {
+ c.renderScanResultsError(w, valErr, feedURL)
+ return
+ }
+
+ articles, err := c.fetchRSSFeed(feedURL)
+ if err != nil {
+ c.renderScanResultsError(w, fmt.Sprintf("Failed to fetch feed: %v", err), feedURL)
+ return
+ }
+
+ // Score articles
+ threshold, _ := c.getThreshold(model)
+ vectorizer := core.CreateVectorizerFromModel(model)
+ scored := c.scoreAndFormatArticles(articles, vectorizer, model, threshold)
+
+ sort.Slice(scored, func(i, j int) bool {
+ iScore := scored[i]["Score"].(float64)
+ jScore := scored[j]["Score"].(float64)
+ return iScore > jScore
+ })
+
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScanResult": true,
+ "FeedURL": feedURL,
+ "Articles": scored,
+ "Threshold": threshold,
+ "PageTitle": displayTitle(c.Title),
+ }
+
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+// ============================================================================
+// ┏━┓┏━┓╻ ┏━╸┏┓╻╺┳┓┏━┓┏━┓╻┏┓╻╺┳╸┏━┓
+// ┣━┫┣━┛┃ ┣╸ ┃┗┫ ┃┃┣━┛┃ ┃┃┃┗┫ ┃ ┗━┓
+// ╹ ╹╹ ╹ ┗━╸╹ ╹╺┻┛╹ ┗━┛╹╹ ╹ ╹ ┗━┛
+// ============================================================================
+
+func (c *ServeCommand) handleFilteredFeed(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ resultsTime := c.filteredResultsTime
+ c.resultsMu.RUnlock()
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ threshold, _ := c.getThreshold(model)
+
+ type ArticleResponse struct {
+ Title string `json:"title"`
+ URL string `json:"url"`
+ Source string `json:"source,omitempty"`
+ Rating int `json:"rating"`
+ Score float64 `json:"score"`
+ }
+
+ scored := make([]ArticleResponse, 0, len(articles))
+ for _, article := range articles {
+ score := 0.0
+ if article.Score != nil {
+ score = *article.Score
+ }
+ rating := core.ScoreToScale(score, threshold)
+
+ scored = append(scored, ArticleResponse{
+ Title: article.Title,
+ URL: article.URL,
+ Source: article.Source,
+ Rating: rating,
+ Score: score,
+ })
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate")
+
+ if err := json.NewEncoder(w).Encode(map[string]interface{}{
+ "total": len(articles),
+ "threshold": threshold,
+ "updated_at": resultsTime,
+ "articles": scored,
+ }); err != nil {
+ http.Error(w, "Failed to encode response", http.StatusInternalServerError)
+ }
+}
+
+func (c *ServeCommand) handleFilteredRSS(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.resultsMu.RLock()
+ articles := c.filteredResults
+ c.resultsMu.RUnlock()
+
+ c.modelMu.RLock()
+ model := c.model
+ c.modelMu.RUnlock()
+
+ w.Header().Set("Content-Type", "application/rss+xml")
+ w.Header().Set("Cache-Control", "public, max-age=3600")
+
+ // Generate RSS feed
+ fmt.Fprintf(w, `<?xml version="1.0" encoding="UTF-8"?>
+ <rss version="2.0">
+ <channel>
+ <title>%s - Filtered Articles</title>
+ <link>http://scholscan.local</link>
+ <description>Articles filtered by your learned preferences (scored 1-10)</description>
+ `, displayTitle(c.Title))
+
+ // Threshold is constant per request; compute it once rather than per item.
+ threshold, _ := c.getThreshold(model)
+
+ for _, article := range articles {
+ rawScore := 0.0
+ if article.Score != nil {
+ rawScore = *article.Score
+ }
+
+ scaledScore := core.ScoreToScale(rawScore, threshold)
+
+ title := escapeXML(article.Title)
+ url := escapeXML(article.URL)
+ description := fmt.Sprintf("SCHOLSCAN SCORE = %d/10 (raw: %.3f)", scaledScore, rawScore)
+
+ fmt.Fprintf(w, ` <item>
+ <title>%s</title>
+ <link>%s</link>
+ <description>%s</description>
+ </item>
+ `, title, url, description)
+ }
+
+ fmt.Fprint(w, ` </channel>
+ </rss>`)
+}
+
+func (c *ServeCommand) handleHealth(w http.ResponseWriter, r *http.Request) {
+ if r.Method != http.MethodGet {
+ http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+ return
+ }
+
+ c.modelMu.RLock()
+ modelLoaded := c.model != nil
+ c.modelMu.RUnlock()
+
+ status := "ok"
+ if !modelLoaded {
+ status = "model_not_loaded"
+ w.WriteHeader(http.StatusInternalServerError)
+ }
+
+ w.Header().Set("Content-Type", "application/json")
+ if err := json.NewEncoder(w).Encode(map[string]interface{}{
+ "status": status,
+ "model_loaded": modelLoaded,
+ "timestamp": time.Now().Unix(),
+ }); err != nil {
+ http.Error(w, "Failed to encode response", http.StatusInternalServerError)
+ }
+}
+
+// ============================================================================
+// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓
+// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓
+// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛
+// ============================================================================
+
+func displayTitle(custom string) string {
+ if custom != "" {
+ return custom
+ }
+ return "ScholScan"
+}
+
+// extractTitleFromURL fetches the content from a URL and extracts the title from the HTML.
+// Designed to be resilient: tries multiple title sources, handles various URL formats,
+// and provides meaningful error feedback if extraction fails.
+func extractTitleFromURL(rawURL string) (string, error) {
+ if rawURL == "" {
+ return "", fmt.Errorf("empty URL")
+ }
+
+ // Check if it's a DOI
+ if strings.HasPrefix(rawURL, "10.") {
+ // Convert DOI to URL
+ rawURL = fmt.Sprintf("https://doi.org/%s", rawURL)
+ } else if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
+ rawURL = "https://" + rawURL
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), core.DefaultContextTimeout)
+ defer cancel()
+
+ req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
+ if err != nil {
+ return "", fmt.Errorf("invalid URL: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+ req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+
+ resp, err := core.DoRequestWithRetry(ctx, core.DefaultHTTPClient, req)
+ if err != nil {
+ return "", fmt.Errorf("failed to fetch URL: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
+ }
+
+ doc, err := goquery.NewDocumentFromReader(resp.Body)
+ if err != nil {
+ return "", fmt.Errorf("failed to parse HTML: %w", err)
+ }
+
+ // Fallback chain: <title> → og:title → twitter:title → <h1>
+ // Different sites populate these differently; trying multiple increases success rate.
+ title := ""
+
+ if t := doc.Find("title").Text(); t != "" {
+ title = strings.TrimSpace(t)
+ }
+
+ if title == "" {
+ if t, exists := doc.Find(`meta[property="og:title"]`).Attr("content"); exists && t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ if t, exists := doc.Find(`meta[name="twitter:title"]`).Attr("content"); exists && t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ if t := doc.Find("h1").First().Text(); t != "" {
+ title = strings.TrimSpace(t)
+ }
+ }
+
+ if title == "" {
+ return "", fmt.Errorf("could not extract title from page")
+ }
+
+ // Strip common "Article Title | Publisher" suffixes: drop everything from
+ // the first pipe onward (heuristic; trailing site names are the common case).
+ reSite := regexp.MustCompile(`\s*\|.*$`)
+ title = strings.TrimSpace(reSite.ReplaceAllString(title, ""))
+
+ if len(title) < 10 {
+ return "", fmt.Errorf("extracted title too short: %q", title)
+ }
+
+ return title, nil
+}
+
+// validateTitle checks that a title is suitable for scoring.
+// Returns an error message string if invalid, empty string if valid.
+func (c *ServeCommand) validateTitle(title string) string {
+ if strings.TrimSpace(title) == "" {
+ return "Title cannot be empty"
+ }
+ if len(title) > 1000 {
+ return "Title too long (max 1000 characters)"
+ }
+ return ""
+}
+
+// renderResultsError renders the results template with an error message.
+func (c *ServeCommand) renderResultsError(w http.ResponseWriter, errMsg, title string) {
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScoreResult": true,
+ "Error": errMsg,
+ "Title": title,
+ "PageTitle": displayTitle(c.Title),
+ }
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+// validateFeedURL checks that a feed URL is non-empty and has valid format.
+// Returns an error message string if invalid, empty string if valid.
+func (c *ServeCommand) validateFeedURL(feedURL string) string {
+ if feedURL == "" {
+ return "Feed URL cannot be empty"
+ }
+ if _, err := url.Parse(feedURL); err != nil {
+ return "Invalid URL format"
+ }
+ return ""
+}
+
+// renderScanResultsError renders the results template with an error for scan operation.
+func (c *ServeCommand) renderScanResultsError(w http.ResponseWriter, errMsg, feedURL string) {
+ data := map[string]interface{}{
+ "Page": "tools",
+ "IsScanResult": true,
+ "Error": errMsg,
+ "FeedURL": feedURL,
+ "PageTitle": displayTitle(c.Title),
+ }
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+ http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+ }
+}
+
+func escapeXML(s string) string {
+ s = strings.ReplaceAll(s, "&", "&amp;")
+ s = strings.ReplaceAll(s, "<", "&lt;")
+ s = strings.ReplaceAll(s, ">", "&gt;")
+ s = strings.ReplaceAll(s, "\"", "&quot;")
+ s = strings.ReplaceAll(s, "'", "&apos;")
+ return s
+}
diff --git a/cmds/templates/live-feed.html b/cmds/templates/live-feed.html
new file mode 100644
index 0000000..1529ee1
--- /dev/null
+++ b/cmds/templates/live-feed.html
@@ -0,0 +1,158 @@
+{{define "live-feed"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.Title}} - Live Feed</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+ SUMMARY, RSS LINK & FEED LIST
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+ .rss-link {
+ background: #f9f9f9;
+ padding: 15px;
+ border: 1px solid #000;
+ margin-bottom: 20px;
+ }
+ .rss-link a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .feed-list {
+ max-height: 600px;
+ overflow-y: auto;
+ border: 1px solid #000;
+ padding: 10px;
+ }
+
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed" class="active">Live Feed</a>
+ <a href="/tools">Score & Scan</a>
+ </div>
+
+ <div class="rss-link">
+ <strong>Filtered RSS Feed:</strong>
+ <a href="/api/filtered/rss" target="_blank">Subscribe to filtered articles</a>
+ <span style="margin-left: 10px; color: #666; font-size: 0.9em;">(rss link for feed readers)</span>
+ <div style="margin-top: 10px; padding-top: 10px; border-top: 1px solid #ccc; color: #666; font-size: 0.9em;">
+ Last updated: <span id="feedTimestamp">{{if .UpdatedAt}}{{.UpdatedAt}}{{else}}—{{end}}</span>
+ </div>
+ </div>
+
+ <div style="margin-bottom: 20px;">
+ <strong>Filter by date:</strong>
+ <div style="margin-top: 8px; display: flex; gap: 10px;">
+ <a href="/live-feed?filter=day" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "day"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 24h</a>
+ <a href="/live-feed?filter=week" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "week"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 7 days</a>
+ <a href="/live-feed?filter=all" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "all"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">All</a>
+ </div>
+ </div>
+
+ <div class="feed-list">
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ {{else if .Articles}}
+ <div class="summary">
+ <strong>{{len .Articles}}</strong> articles (threshold: {{printf "%.2f" .Threshold}})
+ </div>
+ {{$threshold := .Threshold}}
+ {{range .Articles}}
+ {{$isGood := ge .Score $threshold}}
+ {{$bgColor := "white"}}
+ {{if $isGood}}
+ {{$bgColor = "#e8f5e9"}}
+ {{else}}
+ {{$bgColor = "#ffebee"}}
+ {{end}}
+ {{$indicator := "✗"}}
+ {{if $isGood}}
+ {{$indicator = "✓"}}
+ {{end}}
+ <div class="article" style="background-color: {{$bgColor}};">
+ <div style="font-weight: bold;">
+ <a href="{{.URL}}" target="_blank">{{.Title}}</a>
+ </div>
+ <div class="article-meta">
+ Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}}{{if .PublishedAt}} · {{.PublishedAt}}{{end}}
+ </div>
+ </div>
+ {{end}}
+ {{else}}
+ <p>No articles to display</p>
+ {{end}}
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/templates/results.html b/cmds/templates/results.html
new file mode 100644
index 0000000..13f68e0
--- /dev/null
+++ b/cmds/templates/results.html
@@ -0,0 +1,279 @@
+{{define "results"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.PageTitle}} - Results</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+ h2 {
+ font-size: 1em;
+ font-weight: bold;
+ margin-bottom: 15px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ LAYOUT (2-column grid for score-scan)
+ ======================================== */
+ .container {
+ max-width: 1200px;
+ margin: 0 auto;
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 30px;
+ }
+ .section {
+ border: 1px solid #000;
+ padding: 20px;
+ }
+
+ /* ========================================
+ FORMS (input, textarea, button)
+ ======================================== */
+ label {
+ display: block;
+ margin-top: 15px;
+ font-weight: bold;
+ }
+ input, textarea {
+ display: block;
+ width: 100%;
+ margin-top: 5px;
+ padding: 5px;
+ border: 1px solid #000;
+ font-family: monospace;
+ }
+ textarea {
+ resize: vertical;
+ min-height: 80px;
+ }
+ button {
+ margin-top: 15px;
+ padding: 5px 15px;
+ border: 1px solid #000;
+ background: #fff;
+ cursor: pointer;
+ font-family: monospace;
+ }
+ button:hover {
+ background: #000;
+ color: #fff;
+ }
+ button:active {
+ opacity: 0.8;
+ }
+
+ /* ========================================
+ RESULT BOXES
+ ======================================== */
+ .result {
+ margin-top: 20px;
+ padding: 15px;
+ border: 1px solid #000;
+ background: #f5f5f5;
+ }
+ .score {
+ font-size: 3em;
+ font-weight: bold;
+ text-align: center;
+ margin: 20px 0;
+ }
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+ SUMMARY & HELPER TEXT
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+
+ small {
+ display: block;
+ margin-top: 5px;
+ color: #666;
+ }
+
+ /* ========================================
+ MOBILE
+ ======================================== */
+ @media (max-width: 960px) {
+ .container {
+ grid-template-columns: 1fr;
+ gap: 20px;
+ }
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.PageTitle}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed">Live Feed</a>
+ <a href="/tools" class="active">Score & Scan</a>
+ </div>
+
+ <div class="container">
+ {{if .IsScoreResult}}
+ <div class="section">
+ <h2>Score Article</h2>
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ <form method="POST" action="/score" style="margin-top: 20px;">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" value="{{.Title}}" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+ <small>If URL is provided, title will be automatically extracted</small>
+ <button type="submit">Score</button>
+ </form>
+ {{else}}
+ <div class="result">
+ <div class="score">{{.Rating}}/10</div>
+ <p style="text-align: center; color: #666;">Score: {{printf "%.3f" .Score}}</p>
+ <p style="text-align: center; margin-top: 10px; font-size: 0.9em;">{{.Title}}</p>
+ </div>
+ <form method="POST" action="/score" style="margin-top: 20px;">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+                    <small>If a URL is provided, the title will be extracted automatically</small>
+ <button type="submit">Score Another</button>
+ </form>
+ {{end}}
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ <form method="POST" action="/scan">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan</button>
+ </form>
+ </div>
+
+ {{else if .IsScanResult}}
+ <div class="section">
+ <h2>Score Article</h2>
+ <form method="POST" action="/score">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+                <small>If a URL is provided, the title will be extracted automatically</small>
+ <button type="submit">Score</button>
+ </form>
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ {{if .Error}}
+ <div class="error">{{.Error}}</div>
+ <form method="POST" action="/scan" style="margin-top: 20px;">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" value="{{.FeedURL}}" required />
+ <button type="submit">Try Again</button>
+ </form>
+ {{else}}
+ <div class="summary">
+ <strong>{{len .Articles}}</strong> articles from {{.FeedURL}} (threshold: {{printf "%.2f" .Threshold}})
+ </div>
+ <div style="max-height: 500px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+ {{$threshold := .Threshold}}
+ {{range .Articles}}
+ {{$isGood := ge .Score $threshold}}
+ {{$bgColor := "white"}}
+ {{if $isGood}}
+ {{$bgColor = "#e8f5e9"}}
+ {{else}}
+ {{$bgColor = "#ffebee"}}
+ {{end}}
+ {{$indicator := "✗"}}
+ {{if $isGood}}
+ {{$indicator = "✓"}}
+ {{end}}
+ <div class="article" style="background-color: {{$bgColor}};">
+ <div style="font-weight: bold;">
+ <a href="{{.URL}}" target="_blank">{{.Title}}</a>
+ </div>
+ <div class="article-meta">
+ Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}}
+ </div>
+ </div>
+ {{end}}
+ </div>
+ <form method="POST" action="/scan" style="margin-top: 20px;">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan Another</button>
+ </form>
+ {{end}}
+ </div>
+ {{end}}
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/templates/tools.html b/cmds/templates/tools.html
new file mode 100644
index 0000000..def04fe
--- /dev/null
+++ b/cmds/templates/tools.html
@@ -0,0 +1,202 @@
+{{define "tools"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>{{.Title}} - Score & Scan</title>
+ <style>
+ /* ========================================
+ BASE STYLE
+ ======================================== */
+ * { margin: 0; padding: 0; box-sizing: border-box; }
+ body {
+ font-family: monospace;
+ background: #fff;
+ color: #000;
+ padding: 20px;
+ line-height: 1.6;
+ }
+ h1 {
+ font-size: 1.2em;
+ font-weight: bold;
+ margin-bottom: 20px;
+ }
+ h2 {
+ font-size: 1em;
+ font-weight: bold;
+ margin-bottom: 15px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+
+ /* ========================================
+ NAV (live-feed | score-scan)
+ ======================================== */
+ .nav {
+ margin-bottom: 30px;
+ display: flex;
+ gap: 30px;
+ border-bottom: 1px solid #000;
+ padding-bottom: 10px;
+ }
+ .nav a {
+ text-decoration: none;
+ color: #000;
+ font-family: monospace;
+ }
+ .nav a.active {
+ border-bottom: 2px solid #000;
+ padding-bottom: 5px;
+ }
+
+ /* ========================================
+ LAYOUT (2-column grid for score-scan)
+ ======================================== */
+ .container {
+ max-width: 1200px;
+ margin: 0 auto;
+ display: grid;
+ grid-template-columns: 1fr 1fr;
+ gap: 30px;
+ }
+ .section {
+ border: 1px solid #000;
+ padding: 20px;
+ }
+
+ /* ========================================
+ FORMS (input, textarea, button)
+ ======================================== */
+ label {
+ display: block;
+ margin-top: 15px;
+ font-weight: bold;
+ }
+ input, textarea {
+ display: block;
+ width: 100%;
+ margin-top: 5px;
+ padding: 5px;
+ border: 1px solid #000;
+ font-family: monospace;
+ }
+ textarea {
+ resize: vertical;
+ min-height: 80px;
+ }
+ button {
+ margin-top: 15px;
+ padding: 5px 15px;
+ border: 1px solid #000;
+ background: #fff;
+ cursor: pointer;
+ font-family: monospace;
+ }
+ button:hover {
+ background: #000;
+ color: #fff;
+ }
+ button:active {
+ opacity: 0.8;
+ }
+
+ /* ========================================
+ RESULT BOXES
+ ======================================== */
+ .result {
+ margin-top: 20px;
+ padding: 15px;
+ border: 1px solid #000;
+ background: #f5f5f5;
+ }
+ .score {
+ font-size: 3em;
+ font-weight: bold;
+ text-align: center;
+ margin: 20px 0;
+ }
+ .error {
+ color: #f00;
+ margin-top: 10px;
+ padding: 10px;
+ border: 1px solid #f00;
+ }
+
+ /* ========================================
+ ARTICLE LIST
+ ======================================== */
+ .article {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #ccc;
+ }
+ .article a {
+ color: #00f;
+ text-decoration: underline;
+ }
+ .article-meta {
+ margin-top: 8px;
+ color: #666;
+ font-size: 0.9em;
+ }
+
+ /* ========================================
+           SUMMARY & HELPER TEXT
+ ======================================== */
+ .summary {
+ margin-bottom: 15px;
+ padding: 10px;
+ border: 1px solid #000;
+ background: #f9f9f9;
+ }
+
+ small {
+ display: block;
+ margin-top: 5px;
+ color: #666;
+ }
+
+ /* ========================================
+ MOBILE
+ ======================================== */
+ @media (max-width: 960px) {
+ .container {
+ grid-template-columns: 1fr;
+ gap: 20px;
+ }
+ }
+ </style>
+</head>
+<body>
+ <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1>
+ <div class="nav">
+ <a href="/live-feed">Live Feed</a>
+ <a href="/tools" class="active">Score & Scan</a>
+ </div>
+
+ <div class="container">
+ <div class="section">
+ <h2>Score Article</h2>
+ <form method="POST" action="/score">
+ <label for="scoreTitle">Title:</label>
+ <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" />
+ <label for="scoreURL" style="margin-top: 10px;">URL or DOI:</label>
+ <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" />
+                <small>If a URL is provided, the title will be extracted automatically</small>
+ <button type="submit">Score</button>
+ </form>
+ </div>
+
+ <div class="section">
+ <h2>Scan Feed</h2>
+ <form method="POST" action="/scan">
+ <label for="feedURL">RSS Feed URL:</label>
+ <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required />
+ <button type="submit">Scan</button>
+ </form>
+ </div>
+ </div>
+</body>
+</html>
+{{end}}
diff --git a/cmds/train.go b/cmds/train.go
new file mode 100644
index 0000000..e7e8915
--- /dev/null
+++ b/cmds/train.go
@@ -0,0 +1,841 @@
+// Train command learns a model from positive examples and RSS feeds.
+// Loads positives, fetches RSS feeds as negatives, excludes overlap,
+// trains TF-IDF + logistic regression with 1:1 class balancing.
+// Outputs model with validation threshold to stdout.
+package cmds
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "math"
+ "math/rand"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strings"
+ "time"
+
+ "github.com/mmcdole/gofeed"
+ "scholscan/core"
+)
+
+// ============================================================================
+// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓
+// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃
+// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛
+// ============================================================================
+
+// TrainCommand learns a model from positive examples and RSS feeds
+// and writes the trained model JSON to stdout.
+type TrainCommand struct {
+ positivesFile string
+ rssFeedsFile string
+ verboseOutput bool
+ lambda float64
+ minDF int
+ maxDF float64
+ ngramMax int
+}
+
+func (c *TrainCommand) Name() string { return "train" }
+
+func (c *TrainCommand) Init(args []string) error {
+ fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+ fs.Usage = func() {
+ fmt.Fprint(fs.Output(), `Usage: scholscan train POSITIVES_FILE --rss-feeds RSS_FEEDS_FILE > model.json
+
+Train a TF-IDF + logistic regression model from positive examples and RSS feeds.
+
+The training workflow:
+ 1. Load positive examples from POSITIVES_FILE
+ 2. Fetch articles from RSS feeds list
+ 3. Exclude any positive examples from RSS feed articles
+ 4. Train model with balanced classes
+ 5. Output trained model to stdout as JSON
+
+Flags:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Arguments:
+ POSITIVES_FILE Path to JSONL file with positive examples (required)
+
+Example:
+ scholscan train positives.jsonl --rss-feeds rss_world.txt > model.json
+`)
+ }
+
+ fs.StringVar(&c.rssFeedsFile, "rss-feeds", "", "Path to text file with RSS feed URLs (required)")
+ fs.BoolVar(&c.verboseOutput, "verbose", false, "Show progress information")
+ fs.Float64Var(&c.lambda, "lambda", 0.001, "L2 regularization parameter for logistic regression")
+ fs.IntVar(&c.minDF, "min-df", 2, "Minimum document frequency (absolute count)")
+ fs.Float64Var(&c.maxDF, "max-df", 0.8, "Maximum document frequency (ratio, 0-1)")
+ fs.IntVar(&c.ngramMax, "ngram-max", 2, "Maximum n-gram size (e.g., 1=unigrams, 2=unigrams+bigrams)")
+
+ // Check for help flag first
+ for _, arg := range args {
+ if arg == "--help" || arg == "-h" {
+ fs.Usage()
+ return flag.ErrHelp
+ }
+ }
+
+ // Extract positional argument (POSITIVES_FILE) before parsing flags
+ if len(args) == 0 {
+ return fmt.Errorf("POSITIVES_FILE argument is required")
+ }
+ // The first argument should be the positives file, the rest are flags
+ c.positivesFile = args[0]
+ flagArgs := args[1:]
+
+ if err := fs.Parse(flagArgs); err != nil {
+ return err
+ }
+
+ if c.rssFeedsFile == "" {
+ return fmt.Errorf("--rss-feeds flag is required")
+ }
+
+ // Validate paths are safe (prevent directory traversal)
+ if strings.Contains(filepath.Clean(c.positivesFile), "..") {
+ return fmt.Errorf("invalid positives file path: directory traversal not allowed")
+ }
+ if strings.Contains(filepath.Clean(c.rssFeedsFile), "..") {
+ return fmt.Errorf("invalid RSS feeds file path: directory traversal not allowed")
+ }
+
+ return nil
+}
+
+func (c *TrainCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ if c.verboseOutput {
+ log.SetOutput(os.Stderr)
+ log.Println("Starting training workflow...")
+ log.Printf("Positives: %s", c.positivesFile)
+ log.Printf("RSS feeds: %s", c.rssFeedsFile)
+ }
+
+ if c.verboseOutput {
+ log.Printf("Loading positives from %s...", c.positivesFile)
+ }
+ positives, err := c.loadArticles(c.positivesFile)
+ if err != nil {
+ return fmt.Errorf("failed to load positives: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Loaded %d positive examples", len(positives))
+ }
+
+ if c.verboseOutput {
+ log.Printf("Loading RSS feeds from %s...", c.rssFeedsFile)
+ }
+ rssURLs, err := c.loadRSSURLs(c.rssFeedsFile)
+ if err != nil {
+ return fmt.Errorf("failed to load RSS feeds: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Found %d RSS feeds to fetch", len(rssURLs))
+ }
+
+ negatives, err := c.fetchFromRSSFeeds(rssURLs)
+ if err != nil {
+ return fmt.Errorf("failed to fetch from RSS feeds: %w", err)
+ }
+ if c.verboseOutput {
+ log.Printf("Fetched %d articles from RSS feeds", len(negatives))
+ }
+
+ negatives = c.excludePositives(negatives, positives)
+ if c.verboseOutput {
+ log.Printf("After exclusion: %d negative examples", len(negatives))
+ }
+
+ if len(positives) == 0 || len(negatives) == 0 {
+ return fmt.Errorf("need both positive (%d) and negative (%d) examples for training", len(positives), len(negatives))
+ }
+
+ if c.verboseOutput {
+ log.Println("Training model...")
+ }
+ model, err := c.trainModel(positives, negatives)
+ if err != nil {
+ return fmt.Errorf("failed to train model: %w", err)
+ }
+
+ // Output model
+ encoder := json.NewEncoder(stdout)
+ encoder.SetIndent("", " ")
+ if err := encoder.Encode(model); err != nil {
+ return fmt.Errorf("failed to write model: %w", err)
+ }
+
+ return nil
+}
+
+// ============================================================================
+// ╺┳┓┏━┓╺┳╸┏━┓ ╻ ┏━┓┏━┓╺┳┓╻┏┓╻┏━╸
+// ┃┃┣━┫ ┃ ┣━┫ ┃ ┃ ┃┣━┫ ┃┃┃┃┗┫┃╺┓
+// ╺┻┛╹ ╹ ╹ ╹ ╹ ┗━╸┗━┛╹ ╹╺┻┛╹╹ ╹┗━┛
+// ============================================================================
+
+func (c *TrainCommand) loadArticles(filename string) ([]*core.Article, error) {
+ file, err := os.Open(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer file.Close()
+
+	var articles []*core.Article
+	scanner := bufio.NewScanner(file)
+	lineCount := 0
+	for scanner.Scan() {
+		lineCount++
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" {
+			continue
+		}
+		var article core.Article
+		if err := json.Unmarshal([]byte(line), &article); err != nil {
+			// Skip malformed JSON lines rather than failing on bad input.
+			// (A json.Decoder cannot advance past a syntax error, so a
+			// line-oriented scanner is needed to skip bad records.)
+			continue
+		}
+		articles = append(articles, &article)
+		if lineCount%500 == 0 && c.verboseOutput {
+			log.Printf("  Loaded %d articles so far", len(articles))
+		}
+	}
+	return articles, scanner.Err()
+}
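+
+// Example positives.jsonl record (one JSON object per line; field names
+// follow core.Article's JSON tags, and the title and URL are illustrative):
+//
+//	{"title": "Attention Is All You Need", "url": "https://arxiv.org/abs/1706.03762"}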
+
+// loadRSSURLs loads RSS feed URLs from a text file
+func (c *TrainCommand) loadRSSURLs(filename string) ([]string, error) {
+ file, err := os.Open(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer file.Close()
+
+ var urls []string
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ urls = append(urls, line)
+ }
+ }
+ return urls, scanner.Err()
+}
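+
+// Example feed list file (illustrative URLs; file:// entries are read from
+// the local filesystem by fetchRSSFeed):
+//
+//	# general science
+//	https://example.com/rss.xml
+//	file:///data/local_feed.xml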
+
+// fetchFromRSSFeeds fetches articles from multiple RSS feeds in parallel
+func (c *TrainCommand) fetchFromRSSFeeds(rssURLs []string) ([]*core.Article, error) {
+ client := core.DefaultHTTPClient
+ type result struct {
+ url string
+ articles []*core.Article
+ err error
+ }
+ resultChan := make(chan result, len(rssURLs))
+
+ for _, rssURL := range rssURLs {
+ go func(url string) {
+ articles, err := c.fetchRSSFeed(client, url)
+ resultChan <- result{url: url, articles: articles, err: err}
+ }(rssURL)
+ }
+
+ var allArticles []*core.Article
+ for i := 0; i < len(rssURLs); i++ {
+ res := <-resultChan
+ if res.err != nil {
+ if c.verboseOutput {
+ log.Printf("%s: failed to fetch", shortURL(res.url))
+ }
+ } else {
+ if c.verboseOutput {
+ log.Printf("%s: %d articles", shortURL(res.url), len(res.articles))
+ }
+ allArticles = append(allArticles, res.articles...)
+ }
+ }
+
+ return allArticles, nil
+}
+
+// ParseRSSFeed parses an RSS/Atom feed from the provided body into a slice of Articles.
+func ParseRSSFeed(body []byte, baseURL string) ([]*core.Article, error) {
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(bytes.NewReader(body))
+ if err != nil {
+ return nil, err
+ }
+
+ var articles []*core.Article
+ for _, item := range feed.Items {
+ // Prefer explicit content; fall back to description.
+ content := strings.TrimSpace(item.Content)
+ if content == "" {
+ content = item.Description
+ }
+ // Also check custom content field (for <content> tags in RSS)
+ if content == "" && item.Custom != nil {
+ if c, ok := item.Custom["content"]; ok && c != "" {
+ content = c
+ }
+ }
+
+ // Clean and limit content length
+ content = core.CleanFeedContent(content)
+
+ articles = append(articles, &core.Article{
+ URL: item.Link,
+ Title: item.Title,
+ Content: content,
+ })
+ }
+ return articles, nil
+}
+
+// fetchRSSFeed fetches and parses a single RSS feed
+func (c *TrainCommand) fetchRSSFeed(client *http.Client, rssURL string) ([]*core.Article, error) {
+ var body []byte
+ var err error
+
+ // Handle file:// URLs locally
+ if strings.HasPrefix(rssURL, "file://") {
+ // Remove file:// prefix
+ filePath := strings.TrimPrefix(rssURL, "file://")
+ body, err = os.ReadFile(filePath)
+ if err != nil {
+ return nil, fmt.Errorf("error reading file %s: %w", filePath, err)
+ }
+ } else {
+ // Handle HTTP/HTTPS URLs normally
+ req, err := http.NewRequest("GET", rssURL, nil)
+ if err != nil {
+ return nil, fmt.Errorf("error building request: %w", err)
+ }
+ req.Header.Set("User-Agent", core.PoliteUserAgent)
+
+ // Make request with retry logic
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+
+ resp, err := core.DoRequestWithRetry(ctx, client, req)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching %s: %w", rssURL, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, rssURL)
+ }
+
+ // Read response body
+ body, err = io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response from %s: %w", rssURL, err)
+ }
+ }
+
+ // Parse RSS/Atom feed
+ return ParseRSSFeed(body, rssURL)
+}
+
+// ============================================================================
+// ╺┳┓┏━┓╺┳╸┏━┓ ┏━┓┏━┓┏━╸┏━┓
+// ┃┃┣━┫ ┃ ┣━┫ ┣━┛┣┳┛┣╸ ┣━┛
+// ╺┻┛╹ ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸╹
+// ============================================================================
+
+func (c *TrainCommand) excludePositives(negatives, positives []*core.Article) []*core.Article {
+ // Build set of positive URLs for O(1) lookup
+ positiveURLs := make(map[string]bool)
+ for _, pos := range positives {
+ positiveURLs[pos.URL] = true
+ }
+
+ // Filter out positives
+ var filtered []*core.Article
+ for _, neg := range negatives {
+ if !positiveURLs[neg.URL] {
+ filtered = append(filtered, neg)
+ }
+ }
+
+ return filtered
+}
+
+// splitTrainingData performs a deterministic 80/20 split (seed=42).
+// Determinism ensures model training is reproducible across runs.
+func (c *TrainCommand) splitTrainingData(documents []string, labels []float64) (
+ trainDocs, valDocs []string,
+ trainLabels, valLabels []float64,
+) {
+ const validationSplitRatio = 0.2
+ const splitSeed = 42
+
+ if len(documents) < 3 {
+ // Not enough data to split, use all for training.
+ // A split requires at least 2 training documents to avoid MaxDF issues
+ // and at least 1 validation document.
+ return documents, nil, labels, nil
+ }
+
+ // Create a reproducible random source and shuffle indices.
+ rng := rand.New(rand.NewSource(splitSeed))
+ indices := make([]int, len(documents))
+ for i := range indices {
+ indices[i] = i
+ }
+ rng.Shuffle(len(indices), func(i, j int) {
+ indices[i], indices[j] = indices[j], indices[i]
+ })
+
+ splitIndex := int(float64(len(documents)) * (1.0 - validationSplitRatio))
+ trainIndices := indices[:splitIndex]
+ valIndices := indices[splitIndex:]
+
+ trainDocs = make([]string, len(trainIndices))
+ trainLabels = make([]float64, len(trainIndices))
+ for i, idx := range trainIndices {
+ trainDocs[i] = documents[idx]
+ trainLabels[i] = labels[idx]
+ }
+
+ valDocs = make([]string, len(valIndices))
+ valLabels = make([]float64, len(valIndices))
+ for i, idx := range valIndices {
+ valDocs[i] = documents[idx]
+ valLabels[i] = labels[idx]
+ }
+
+ return trainDocs, valDocs, trainLabels, valLabels
+}
+
+// downsampleToBalance downsamples the majority class to a 1:1 ratio.
+// It runs AFTER vectorizer.Fit() so that IDF values are preserved.
+func (c *TrainCommand) downsampleToBalance(docs []string, labels []float64) ([]string, []float64) {
+ // Count positives and negatives
+ var posDocs, negDocs []string
+ var posLabels, negLabels []float64
+
+ for i, label := range labels {
+ if label == 1.0 {
+ posDocs = append(posDocs, docs[i])
+ posLabels = append(posLabels, label)
+ } else {
+ negDocs = append(negDocs, docs[i])
+ negLabels = append(negLabels, label)
+ }
+ }
+
+ // If already balanced, return as-is
+ if len(posDocs) == len(negDocs) {
+ return docs, labels
+ }
+
+ // Determine which class is majority
+ var majorityDocs, minorityDocs []string
+ var majorityLabels, minorityLabels []float64
+
+ if len(negDocs) > len(posDocs) {
+ // Negatives are majority
+ majorityDocs, minorityDocs = negDocs, posDocs
+ majorityLabels, minorityLabels = negLabels, posLabels
+ } else {
+ // Positives are majority (unlikely but handle)
+ majorityDocs, minorityDocs = posDocs, negDocs
+ majorityLabels, minorityLabels = posLabels, negLabels
+ }
+
+ // Downsample majority to match minority size
+ minoritySize := len(minorityDocs)
+ rng := rand.New(rand.NewSource(42)) // Use fixed seed for reproducibility
+
+ // Create random indices for downsampling
+ indices := make([]int, len(majorityDocs))
+ for i := range indices {
+ indices[i] = i
+ }
+ rng.Shuffle(len(indices), func(i, j int) {
+ indices[i], indices[j] = indices[j], indices[i]
+ })
+
+ // Select downsampled majority
+ downsampledDocs := make([]string, 0, minoritySize*2)
+ downsampledLabels := make([]float64, 0, minoritySize*2)
+
+ // Add all minority samples
+ downsampledDocs = append(downsampledDocs, minorityDocs...)
+ downsampledLabels = append(downsampledLabels, minorityLabels...)
+
+ // Add downsampled majority
+ for i := 0; i < minoritySize; i++ {
+ idx := indices[i]
+ downsampledDocs = append(downsampledDocs, majorityDocs[idx])
+ downsampledLabels = append(downsampledLabels, majorityLabels[idx])
+ }
+
+ return downsampledDocs, downsampledLabels
+}
+
+// ============================================================================
+// ╺┳╸┏━┓┏━┓╻┏┓╻ ┏┳┓┏━┓╺┳┓┏━╸╻
+// ┃ ┣┳┛┣━┫┃┃┗┫ ┃┃┃┃ ┃ ┃┃┣╸ ┃
+// ╹ ╹┗╸╹ ╹╹╹ ╹ ╹ ╹┗━┛╺┻┛┗━╸┗━╸
+// ============================================================================
+
+// trainModel trains a TF-IDF + logistic regression model
+func (c *TrainCommand) trainModel(positives, negatives []*core.Article) (*core.ModelEnvelope, error) {
+ // Combine datasets and create labels
+ var documents []string
+ var labels []float64
+
+ // Process positives
+ for _, article := range positives {
+ // Skip articles with titles that are too short
+		if len(article.Title) < core.MinTitleLength {
+ continue
+ }
+ documents = append(documents, article.Title)
+ labels = append(labels, 1.0)
+ }
+
+ // Process negatives
+ for _, article := range negatives {
+ // Skip articles with titles that are too short
+		if len(article.Title) < core.MinTitleLength {
+ continue
+ }
+ documents = append(documents, article.Title)
+ labels = append(labels, 0.0)
+ }
+
+ // Use parameters from CLI flags (with defaults matching Julia implementation)
+ const vocabCap = 50000
+
+ // Deterministic 80/20 split for train/validation
+ trainDocs, valDocs, trainLabels, valLabels := c.splitTrainingData(documents, labels)
+
+ // Create TF-IDF vectorizer with the specified parameters
+ vectorizer := &core.TFIDFVectorizer{
+ NgramMin: 1,
+ NgramMax: c.ngramMax,
+ MinDF: c.minDF,
+ MaxDF: c.maxDF,
+ VocabCap: vocabCap,
+ Vocabulary: make(map[string]float64),
+ }
+ // Fit vectorizer on UNBALANCED training data to match Julia implementation
+ // This preserves document frequencies properly
+ vectorizer.Fit(trainDocs)
+
+ // Downsample negatives to 1:1 ratio AFTER fitting (match Julia approach)
+ balancedTrainDocs, balancedTrainLabels := c.downsampleToBalance(trainDocs, trainLabels)
+
+ // Transform both training and validation sets
+ trainVectors := vectorizer.Transform(balancedTrainDocs)
+ valVectors := vectorizer.Transform(valDocs)
+
+ // Use uniform class weights since we've balanced the dataset
+ classWeights := map[float64]float64{
+ 1.0: 1.0,
+ 0.0: 1.0,
+ }
+
+ // Train logistic regression with the specified lambda parameter
+ lr := &core.LogisticRegression{
+ LearningRate: 0.5,
+ Lambda: c.lambda,
+ Iterations: 500,
+ Tolerance: 0.000001,
+ }
+ lr.Validate()
+ weights, err := lr.Fit(trainVectors, balancedTrainLabels, classWeights)
+ if err != nil {
+ return nil, fmt.Errorf("failed to train logistic regression model: %w", err)
+ }
+
+ // Find the best threshold on the validation set
+ recommendedThreshold, scoreDistributions := c.findBestThreshold(valVectors, valLabels, weights)
+
+ // Count classes for metadata
+ var posCount, negCount float64
+ for _, label := range labels {
+ if label == 1.0 {
+ posCount++
+ } else {
+ negCount++
+ }
+ }
+
+ // Create model envelope
+ model := &core.ModelEnvelope{
+ Algorithm: "tfidf-go",
+ Impl: "go",
+ Version: "1",
+ CreatedAt: time.Now().UTC(),
+ Meta: map[string]any{
+ "positives": len(positives),
+ "negatives": len(negatives),
+ "class_counts": map[string]int{
+ "pos": int(posCount),
+ "neg": int(negCount),
+ },
+ "vectorizer_params": map[string]any{
+ "ngram_min": vectorizer.NgramMin,
+ "ngram_max": vectorizer.NgramMax,
+ "min_df": vectorizer.MinDF,
+ "max_df": vectorizer.MaxDF,
+ "vocab_cap": vectorizer.VocabCap,
+ },
+ "model_params": map[string]any{
+ "learning_rate": lr.LearningRate,
+ "lambda": lr.Lambda,
+ "iterations": lr.Iterations,
+ "tolerance": lr.Tolerance,
+ },
+ "recommended_threshold": recommendedThreshold,
+ "score_distributions": scoreDistributions,
+ },
+ Vectorizer: vectorizer.Vocabulary,
+ OrderedVocab: vectorizer.OrderedVocab,
+ Weights: weights,
+ }
+
+ return model, nil
+}
+
+// ============================================================================
+// ┏┳┓┏━╸╺┳╸┏━┓╻┏━╸┏━┓
+// ┃┃┃┣╸ ┃ ┣┳┛┃┃ ┗━┓
+// ╹ ╹┗━╸ ╹ ╹┗╸╹┗━╸┗━┛
+// ============================================================================
+
+// ClassificationMetrics holds the evaluation metrics
+type ClassificationMetrics struct {
+ TruePositives int
+ TrueNegatives int
+ FalsePositives int
+ FalseNegatives int
+ Accuracy float64
+ Precision float64
+ Recall float64
+ F1Score float64
+}
+
+// Calculate computes the metrics from raw counts
+func (m *ClassificationMetrics) Calculate() {
+ total := m.TruePositives + m.TrueNegatives + m.FalsePositives + m.FalseNegatives
+
+ if total > 0 {
+ m.Accuracy = float64(m.TruePositives+m.TrueNegatives) / float64(total)
+ }
+
+ if m.TruePositives+m.FalsePositives > 0 {
+ m.Precision = float64(m.TruePositives) / float64(m.TruePositives+m.FalsePositives)
+ }
+
+ if m.TruePositives+m.FalseNegatives > 0 {
+ m.Recall = float64(m.TruePositives) / float64(m.TruePositives+m.FalseNegatives)
+ }
+
+ if m.Precision+m.Recall > 0 {
+ m.F1Score = 2 * (m.Precision * m.Recall) / (m.Precision + m.Recall)
+ }
+}
+
+// findBestThreshold sweeps a range of thresholds on a validation set to find
+// the one that maximizes combined F1 + separation score.
+func (c *TrainCommand) findBestThreshold(
+ validationVectors [][]float64,
+ validationLabels []float64,
+ weights []float64,
+) (float64, map[string]any) {
+ if len(validationVectors) == 0 {
+ return 0.5, nil // Default if no validation data
+ }
+
+ scores := make([]float64, len(validationVectors))
+ for i, vector := range validationVectors {
+ score, err := core.PredictScore(vector, weights)
+ if err != nil {
+ // This should not happen with valid data, but as a fallback:
+ return 0.5, nil
+ }
+ scores[i] = score
+ }
+
+ // Collect score distributions by label
+ var posScores, negScores []float64
+ for i, score := range scores {
+ if validationLabels[i] == 1.0 {
+ posScores = append(posScores, score)
+ } else {
+ negScores = append(negScores, score)
+ }
+ }
+
+ // Compute stats for each class
+ posStats := computeScoreStats(posScores)
+ negStats := computeScoreStats(negScores)
+
+ // Calculate Cohen's d (effect size) to measure class separation in the learned space
+ posMean := posStats["mean"]
+ negMean := negStats["mean"]
+ posStd := posStats["std"]
+ negStd := negStats["std"]
+
+ var cohensD float64
+ if posStd > 0 && negStd > 0 {
+ pooledStd := math.Sqrt((posStd*posStd + negStd*negStd) / 2)
+ cohensD = math.Abs(posMean-negMean) / pooledStd
+ }
+
+ // Calculate separation ratio to understand how much the classes overlap on the score scale
+ totalRange := math.Max(posStats["max"], negStats["max"]) - math.Min(posStats["min"], negStats["min"])
+ overlapStart := math.Max(posStats["min"], negStats["min"])
+ overlapEnd := math.Min(posStats["max"], negStats["max"])
+ overlapRange := math.Max(0, overlapEnd-overlapStart)
+ separationRatio := 0.0
+ if totalRange > 0 {
+ separationRatio = (totalRange - overlapRange) / totalRange
+ }
+
+ // Find threshold that balances false positives and false negatives using Youden's J.
+ // This metric (Sensitivity + Specificity - 1) equally weights both false positive
+ // and false negative rates. Why not F1? F1 biases toward precision when classes
+ // are imbalanced; a validation set of 10 positives and 1000 negatives would push
+ // the threshold too high. Youden's J treats both types of error equally, which
+ // better reflects real use: missing a relevant article (false negative) is as bad
+ // as showing an irrelevant one (false positive).
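+	// Example: a threshold giving sensitivity 0.90 and specificity 0.80
+	// scores J = 0.90 + 0.80 - 1 = 0.70.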
+ bestCombinedScore := -1.0
+ bestThreshold := 0.5
+ var bestMetrics ClassificationMetrics
+
+ boolLabels := make([]bool, len(validationLabels))
+ for i, l := range validationLabels {
+ boolLabels[i] = l == 1.0
+ }
+
+ for i := 5; i <= 95; i++ {
+ threshold := float64(i) / 100.0
+ metrics := computeMetrics(scores, boolLabels, threshold)
+
+ sensitivity := metrics.Recall // TPR: TP / (TP + FN)
+ specificity := 0.0
+ if metrics.TrueNegatives+metrics.FalsePositives > 0 {
+ specificity = float64(metrics.TrueNegatives) / float64(metrics.TrueNegatives+metrics.FalsePositives)
+ }
+ youdenJ := sensitivity + specificity - 1.0
+
+ if youdenJ > bestCombinedScore {
+ bestCombinedScore = youdenJ
+ bestThreshold = threshold
+ bestMetrics = metrics
+ }
+ }
+
+ distributions := map[string]any{
+ "positive": posStats,
+ "negative": negStats,
+ "cohens_d": cohensD,
+ "separation_ratio": separationRatio,
+ "best_f1": bestMetrics.F1Score,
+ "best_precision": bestMetrics.Precision,
+ "best_recall": bestMetrics.Recall,
+ }
+
+ return bestThreshold, distributions
+}
+
+// computeScoreStats computes min, max, mean, and std for a slice of scores
+func computeScoreStats(scores []float64) map[string]float64 {
+ if len(scores) == 0 {
+ return map[string]float64{
+ "min": 0.0,
+ "max": 0.0,
+ "mean": 0.0,
+ "std": 0.0,
+ }
+ }
+
+ min, max := scores[0], scores[0]
+ sum := 0.0
+
+ for _, score := range scores {
+ if score < min {
+ min = score
+ }
+ if score > max {
+ max = score
+ }
+ sum += score
+ }
+
+ mean := sum / float64(len(scores))
+
+ // Calculate standard deviation
+ variance := 0.0
+ for _, score := range scores {
+ diff := score - mean
+ variance += diff * diff
+ }
+ variance /= float64(len(scores))
+ std := math.Sqrt(variance)
+
+ return map[string]float64{
+ "min": min,
+ "max": max,
+ "mean": mean,
+ "std": std,
+ }
+}
+
+// computeMetrics calculates classification metrics
+func computeMetrics(scores []float64, labels []bool, threshold float64) ClassificationMetrics {
+ var metrics ClassificationMetrics
+ for i, score := range scores {
+ predicted := score > threshold
+ actual := labels[i]
+
+ if predicted && actual {
+ metrics.TruePositives++
+ } else if predicted && !actual {
+ metrics.FalsePositives++
+ } else if !predicted && actual {
+ metrics.FalseNegatives++
+ } else {
+ metrics.TrueNegatives++
+ }
+ }
+ metrics.Calculate()
+ return metrics
+}
+
+// ============================================================================
+// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓
+// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓
+// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛
+// ============================================================================
+
+// shortURL formats a URL to be human-readable and not too long
+func shortURL(urlStr string) string {
+ u, err := url.Parse(urlStr)
+ if err != nil {
+ return urlStr
+ }
+
+ path := u.Path
+ if len(path) > 30 {
+ path = path[:30] + "..."
+ }
+
+ return u.Host + path
+}
diff --git a/cmds/train_test.go b/cmds/train_test.go
new file mode 100644
index 0000000..8298494
--- /dev/null
+++ b/cmds/train_test.go
@@ -0,0 +1,66 @@
+package cmds
+
+import (
+ "scholscan/core"
+ "strings"
+ "testing"
+)
+
+// test RSS parsing
+func TestParseRSSFeed(t *testing.T) {
+ rssXML := `<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+<channel>
+<title>Test Feed</title>
+<item>
+<title>Test Article 1</title>
+<link>https://example.com/article1</link>
+<description>This is a test article with some content.</description>
+</item>
+<item>
+<title>Test Article 2</title>
+<link>https://example.com/article2</link>
+<content><![CDATA[<p>This is content with <b>HTML</b> tags.</p>]]></content>
+</item>
+</channel>
+</rss>`
+
+ articles, err := ParseRSSFeed([]byte(rssXML), "https://example.com/feed")
+ if err != nil {
+ t.Fatalf("Failed to parse RSS feed: %v", err)
+ }
+
+ if len(articles) != 2 {
+ t.Fatalf("Expected 2 articles, got %d", len(articles))
+ }
+
+ if articles[0].Title != "Test Article 1" {
+ t.Errorf("Expected title 'Test Article 1', got '%s'", articles[0].Title)
+ }
+ if articles[0].URL != "https://example.com/article1" {
+ t.Errorf("Expected URL 'https://example.com/article1', got '%s'", articles[0].URL)
+ }
+ if articles[0].Content != "This is a test article with some content." {
+ t.Errorf("Expected content 'This is a test article with some content.', got '%s'", articles[0].Content)
+ }
+
+ if articles[1].Title != "Test Article 2" {
+ t.Errorf("Expected title 'Test Article 2', got '%s'", articles[1].Title)
+ }
+ if articles[1].Content != "This is content with HTML tags." {
+ t.Errorf("Expected 'This is content with HTML tags.', got '%s'", articles[1].Content)
+ }
+}
+
+func TestCleanFeedContent(t *testing.T) {
+ longInput := strings.Repeat("test content ", 500) // 6000+ bytes
+ result := core.CleanFeedContent(longInput)
+
+	if len(result) != 5003 { // 5000 chars plus the "..." suffix
+		t.Errorf("Expected truncated content of 5003 chars (5000 + ellipsis), got %d", len(result))
+	}
+
+ if !strings.HasSuffix(result, "...") {
+ t.Errorf("Expected truncated content to end with '...', got '%s'", result[len(result)-3:])
+ }
+}
diff --git a/core/constants.go b/core/constants.go
new file mode 100644
index 0000000..2dadac4
--- /dev/null
+++ b/core/constants.go
@@ -0,0 +1,21 @@
+// Default configuration constants.
+//
+// Timeouts are defensive: 30s for HTTP requests, 5s for graceful shutdown.
+// Score threshold 0.5 is neutral; models should learn their own.
+// MinTitleLength filters junk/broken titles (<15 chars rarely meaningful).
+// ChunkSize 50 balances memory usage vs batch efficiency.
+package core
+
+import "time"
+
+const (
+ DefaultHTTPTimeout = 30 * time.Second
+ DefaultContextTimeout = 10 * time.Second
+ DefaultReadTimeout = 30 * time.Second
+ DefaultWriteTimeout = 30 * time.Second
+ DefaultIdleTimeout = 120 * time.Second
+ DefaultShutdownTimeout = 5 * time.Second
+ DefaultScoreThreshold = 0.5
+ MinTitleLength = 15
+ DefaultChunkSize = 50
+)
diff --git a/core/http.go b/core/http.go
new file mode 100644
index 0000000..8629676
--- /dev/null
+++ b/core/http.go
@@ -0,0 +1,196 @@
+// HTTP client with exponential backoff retry.
+//
+// Handles transient network failures, timeouts, and rate limiting.
+// - Backoff: 500ms → 1s → 2s, doubling and capped at 5s
+// - Jitter prevents thundering herd
+// - Respects 429 Retry-After header
+package core
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "math/rand"
+ "net"
+ "net/http"
+ "os"
+ "strconv"
+ "strings"
+ "time"
+)
+
+
+// ============================================================================
+// ╻ ╻╺┳╸╺┳╸┏━┓ ┏━┓┏━╸╺┳╸┏━┓╻ ╻
+// ┣━┫ ┃ ┃ ┣━┛ ┣┳┛┣╸ ┃ ┣┳┛┗┳┛
+// ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸ ╹ ╹┗╸ ╹
+// ============================================================================
+
+
+const PoliteUserAgent = "scholscan/1.0 (https://github.com/mrichman/scholscan; mailto:matt@mrichman.net)"
+
+var DefaultHTTPClient = &http.Client{
+ Timeout: 30 * time.Second,
+}
+
+var (
+ retryMaxAttempts = 4
+ retryInitialBackoff = 500 * time.Millisecond
+ retryMaxBackoff = 5 * time.Second
+)
+
+// DoRequestWithRetry makes an HTTP request, retrying with exponential backoff.
+func DoRequestWithRetry(
+ ctx context.Context,
+ client *http.Client,
+ req *http.Request,
+) (*http.Response, error) {
+ if client == nil {
+ client = DefaultHTTPClient
+ }
+ var lastErr error
+ backoff := retryInitialBackoff
+
+ for attempt := 1; attempt <= retryMaxAttempts; attempt++ {
+ // Make the request cancellable
+ reqWithCtx := req.WithContext(ctx)
+ resp, err := client.Do(reqWithCtx)
+ if err == nil {
+ if isRetriableStatus(resp.StatusCode) {
+ retryAfter := parseRetryAfter(resp.Header.Get("Retry-After"))
+ _ = resp.Body.Close()
+ sleep := backoff
+ if retryAfter > sleep {
+ sleep = retryAfter
+ }
+
+ // Add jitter to avoid thundering herd.
+ jitter := time.Duration(rand.Intn(int(backoff / 2)))
+ sleep += jitter
+
+ // Make sleep cancellable
+ timer := time.NewTimer(sleep)
+ select {
+ case <-ctx.Done():
+ timer.Stop()
+ return nil, ctx.Err()
+ case <-timer.C:
+ }
+
+ backoff = minDuration(backoff*2, retryMaxBackoff)
+ continue
+ }
+ return resp, nil
+ }
+ // Check for context cancellation
+ if ctx.Err() != nil {
+ return nil, ctx.Err()
+ }
+ // Network error: retry on timeouts, context deadline, transient net errors, and HTTP/2 stream errors
+ if os.IsTimeout(err) || errors.Is(err, context.DeadlineExceeded) || isTransientNetError(err) || isHTTP2StreamErr(err) {
+ lastErr = err
+
+ // Add jitter to avoid thundering herd.
+ jitter := time.Duration(rand.Intn(int(backoff / 2)))
+ sleep := backoff + jitter
+
+ // Make sleep cancellable
+ timer := time.NewTimer(sleep)
+ select {
+ case <-ctx.Done():
+ timer.Stop()
+ return nil, ctx.Err()
+ case <-timer.C:
+ }
+
+ backoff = minDuration(backoff*2, retryMaxBackoff)
+ continue
+ }
+ // Non-retriable error
+ return nil, err
+ }
+ if lastErr == nil {
+ lastErr = fmt.Errorf("request retries exhausted")
+ }
+ return nil, lastErr
+}
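+
+// Usage sketch (illustrative; a nil client falls back to DefaultHTTPClient):
+//
+//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+//	defer cancel()
+//	req, _ := http.NewRequest("GET", feedURL, nil)
+//	req.Header.Set("User-Agent", PoliteUserAgent)
+//	resp, err := DoRequestWithRetry(ctx, nil, req)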
+
+
+// ============================================================================
+// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓
+// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓
+// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛
+// ============================================================================
+
+
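+// isRetriableStatus reports whether a response status warrants a retry:
+// 429 Too Many Requests and any 5xx except 501 Not Implemented.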
+func isRetriableStatus(code int) bool {
+ if code == http.StatusTooManyRequests {
+ return true
+ }
+ return code >= 500 && code != http.StatusNotImplemented
+}
+
+func parseRetryAfter(v string) time.Duration {
+ if v == "" {
+ return 0
+ }
+ if secs, err := strconv.Atoi(strings.TrimSpace(v)); err == nil && secs > 0 {
+ return time.Duration(secs) * time.Second
+ }
+ if t, err := http.ParseTime(v); err == nil {
+ if d := time.Until(t); d > 0 {
+ return d
+ }
+ }
+ return 0
+}
+
+func minDuration(a, b time.Duration) time.Duration {
+ if a < b {
+ return a
+ }
+ return b
+}
+
+// isTransientNetError returns true for network errors which are commonly transient,
+// such as timeouts and common connection reset/closed cases.
+func isTransientNetError(err error) bool {
+ if err == nil {
+ return false
+ }
+ var ne net.Error
+ if errors.As(err, &ne) {
+ if ne.Timeout() {
+ return true
+ }
+ }
+ msg := strings.ToLower(err.Error())
+ switch {
+ case strings.Contains(msg, "use of closed network connection"):
+ return true
+ case strings.Contains(msg, "connection reset by peer"):
+ return true
+ case strings.Contains(msg, "connection aborted"):
+ return true
+ case strings.Contains(msg, "broken pipe"):
+ return true
+ case strings.Contains(msg, "eof"):
+ // Treat unexpected EOFs as transient when occurring at transport level.
+ return true
+ default:
+ return false
+ }
+}
+
+// isHTTP2StreamErr detects HTTP/2 stream-level errors which are often transient.
+func isHTTP2StreamErr(err error) bool {
+ if err == nil {
+ return false
+ }
+ msg := strings.ToLower(err.Error())
+ return strings.Contains(msg, "stream error") ||
+ strings.Contains(msg, "internal_error") ||
+ strings.Contains(msg, "rst_stream") ||
+ strings.Contains(msg, "goaway") ||
+ strings.Contains(msg, "http2:")
+}
diff --git a/core/ml.go b/core/ml.go
new file mode 100644
index 0000000..afdd2f3
--- /dev/null
+++ b/core/ml.go
@@ -0,0 +1,427 @@
+// ML implementation: TF-IDF + Logistic Regression for article filtering.
+//
+// Why title-only: Avoids content scraping overhead, titles are already informative.
+// MinDF=2: Removes typos and rare terms that don't generalize.
+// MaxDF=0.8: Removes common words that appear in >80% of documents.
+// λ=0.001: Light L2 regularization to prevent overfitting on small datasets.
+//
+// Public API:
+// - TFIDFVectorizer.Fit(): Learn vocabulary from documents
+// - TFIDFVectorizer.Transform(): Convert documents to TF-IDF vectors
+// - LogisticRegression.Fit(): Train classifier on vectors
+// - CreateVectorizerFromModel(): Reconstruct vectorizer from saved model
+// - PredictScore(): Score article using trained weights
+package core
+
+import (
+ "fmt"
+ "math"
+ "regexp"
+ "sort"
+ "strings"
+)
+
+
+// ============================================================================
+// ╻ ╻┏━╸┏━╸╺┳╸┏━┓┏━┓╻┏━┓┏━╸┏━┓
+// ┃┏┛┣╸ ┃ ┃ ┃ ┃┣┳┛┃┗━┓┣╸ ┣┳┛
+// ┗┛ ┗━╸┗━╸ ╹ ┗━┛╹┗╸╹┗━┛┗━╸╹┗╸
+// ============================================================================
+
+
+var wordHyphenRegex = regexp.MustCompile("[^a-zA-Z0-9-]+")
+
+// StopWords: Common words that don't help distinguish articles.
+// Why: Reduces noise and improves model generalization.
+// stopWords: common words that don't help distinguish articles.
+ // Single letters and symbols
+ "s": {}, "-": {}, "0": {}, "1": {}, "2": {}, "3": {}, "4": {}, "5": {}, "6": {}, "7": {}, "8": {}, "9": {},
+
+ // Common English stop words
+ "a": {}, "about": {}, "above": {}, "after": {}, "again": {}, "against": {}, "al": {}, "all": {}, "am": {}, "an": {}, "and": {}, "any": {}, "are": {}, "aren't": {}, "as": {}, "at": {}, "be": {}, "because": {}, "been": {}, "before": {}, "being": {}, "below": {}, "between": {}, "both": {}, "but": {}, "by": {}, "can't": {}, "cannot": {}, "could": {}, "couldn't": {}, "did": {}, "didn't": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "don't": {}, "down": {}, "during": {}, "each": {}, "et": {}, "few": {}, "for": {}, "from": {}, "further": {}, "had": {}, "hadn't": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "he's": {}, "her": {}, "here": {}, "here's": {}, "hers": {}, "herself": {}, "him": {}, "himself": {}, "his": {}, "how": {}, "how's": {}, "i": {}, "i'd": {}, "i'll": {}, "i'm": {}, "i've": {}, "if": {}, "in": {}, "into": {}, "is": {}, "isn't": {}, "it": {}, "it's": {}, "its": {}, "itself": {}, "let's": {}, "me": {}, "more": {}, "most": {}, "mustn't": {}, "my": {}, "myself": {}, "no": {}, "nor": {}, "not": {}, "of": {}, "off": {}, "on": {}, "once": {}, "only": {}, "or": {}, "other": {}, "ought": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "over": {}, "own": {}, "same": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "so": {}, "some": {}, "such": {}, "than": {}, "that": {}, "that's": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "there": {}, "there's": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "this": {}, "those": {}, "through": {}, "to": {}, "too": {}, "under": {}, "until": {}, "up": {}, "very": {}, "was": {}, "wasn't": {}, "we": {}, "we'd": {}, "we'll": {}, "we're": {}, "we've": {}, "were": {}, "weren't": {}, "what": {}, "what's": {}, "when": {}, "when's": {}, "where": {}, "where's": {}, "which": {}, "while": {}, "who": {}, "who's": {}, "whom": {}, "why": {}, "why's": {}, "with": {}, "won't": {}, "would": {}, "wouldn't": {}, "you": {}, "you'd": {}, "you'll": {}, "you're": {}, "you've": {}, "your": {}, "yours": {}, "yourself": {}, "yourselves": {},
+}
+
+type TFIDFVectorizer struct {
+ Vocabulary map[string]float64
+ OrderedVocab []string
+ NgramMin int
+ NgramMax int
+ MinDF int // Minimum document frequency (absolute)
+ MaxDF float64 // Maximum document frequency (ratio)
+ VocabCap int
+}
+
+func CreateVectorizerFromModel(model *ModelEnvelope) *TFIDFVectorizer {
+ return &TFIDFVectorizer{
+ Vocabulary: model.Vectorizer,
+ OrderedVocab: model.OrderedVocab,
+ }
+}
+
+
+// Fit learns the vocabulary and IDF values from the documents.
+func (v *TFIDFVectorizer) Fit(documents []string) {
+ numDocs := len(documents)
+ docFreqs := make(map[string]int)
+
+ for _, doc := range documents {
+ unigrams := Tokenize(doc)
+ ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax)
+ seenInDoc := make(map[string]struct{})
+ for _, ngram := range ngrams {
+ if _, seen := seenInDoc[ngram]; !seen {
+ docFreqs[ngram]++
+ seenInDoc[ngram] = struct{}{}
+ }
+ }
+ }
+
+ maxDocs := int(v.MaxDF * float64(numDocs))
+ filteredVocab := make(map[string]int)
+ for term, freq := range docFreqs {
+ if freq >= v.MinDF && freq <= maxDocs {
+ filteredVocab[term] = freq
+ }
+ }
+
+ if v.VocabCap > 0 && len(filteredVocab) > v.VocabCap {
+ type termFreq struct {
+ term string
+ freq int
+ }
+ terms := make([]termFreq, 0, len(filteredVocab))
+ for term, freq := range filteredVocab {
+ terms = append(terms, termFreq{term, freq})
+ }
+ sort.Slice(terms, func(i, j int) bool {
+ return terms[i].freq > terms[j].freq
+ })
+
+ cappedTerms := terms[:v.VocabCap]
+ filteredVocab = make(map[string]int, v.VocabCap)
+ for _, tf := range cappedTerms {
+ filteredVocab[tf.term] = tf.freq
+ }
+ }
+
+ v.OrderedVocab = make([]string, 0, len(filteredVocab))
+ for term := range filteredVocab {
+ v.OrderedVocab = append(v.OrderedVocab, term)
+ }
+ sort.Strings(v.OrderedVocab) // deterministic order
+
+ v.Vocabulary = make(map[string]float64, len(v.OrderedVocab))
+ for _, term := range v.OrderedVocab {
+ // IDF = log(total num of docs / num of docs with term)
+ idf := math.Log(float64(numDocs) / float64(filteredVocab[term]))
+ v.Vocabulary[term] = idf
+ }
+}
+
+// Transform converts documents to TF-IDF vectors using the learned vocabulary.
+func (v *TFIDFVectorizer) Transform(documents []string) [][]float64 {
+ vectors := make([][]float64, len(documents))
+
+ for i, doc := range documents {
+ unigrams := Tokenize(doc)
+ ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax)
+ vector := make([]float64, len(v.OrderedVocab))
+
+ if len(ngrams) > 0 {
+ // tf: term frequency (normalized count of each n-gram in document)
+ tf := make(map[string]float64)
+ for _, ngram := range ngrams {
+ tf[ngram]++
+ }
+ numNgrams := float64(len(ngrams))
+ for ngram, count := range tf {
+ tf[ngram] = count / numNgrams
+ }
+
+ for j, term := range v.OrderedVocab {
+ if tfValue, ok := tf[term]; ok {
+ // only score terms that were in our training vocabulary
+ if idfValue, inVocab := v.Vocabulary[term]; inVocab {
+ vector[j] = tfValue * idfValue
+ }
+ }
+ }
+ }
+ vectors[i] = vector
+ }
+
+ return vectors
+}
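+
+// Training-time usage sketch (parameter values mirror the train command defaults):
+//
+//	v := &TFIDFVectorizer{NgramMin: 1, NgramMax: 2, MinDF: 2, MaxDF: 0.8, VocabCap: 50000}
+//	v.Fit(titles)                // learn vocabulary and IDF
+//	vecs := v.Transform(titles)  // one TF-IDF vector per title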
+
+func Tokenize(text string) []string {
+ text = strings.ToLower(text)
+ words := wordHyphenRegex.Split(text, -1)
+ tokens := make([]string, 0, len(words))
+ for _, word := range words {
+ if word == "" {
+ continue
+ }
+ if _, isStopWord := stopWords[word]; isStopWord {
+ continue
+ }
+ tokens = append(tokens, word)
+ }
+ return tokens
+}
+
+func generateNgrams(tokens []string, minN, maxN int) []string {
+ if minN <= 0 {
+ minN = 1
+ }
+ if maxN < minN {
+ maxN = minN
+ }
+
+ numTokens := len(tokens)
+
+ estimatedCap := 0
+ for n := minN; n <= maxN; n++ {
+ if numTokens >= n {
+ estimatedCap += numTokens - n + 1
+ }
+ }
+ ngrams := make([]string, 0, estimatedCap)
+
+ for n := minN; n <= maxN; n++ {
+ if numTokens < n {
+ continue
+ }
+ for i := 0; i <= numTokens-n; i++ {
+ ngrams = append(ngrams, strings.Join(tokens[i:i+n], " "))
+ }
+ }
+ return ngrams
+}
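+
+// Example: Tokenize("Deep Learning for Protein Folding") yields
+// ["deep", "learning", "protein", "folding"] ("for" is a stop word), and
+// generateNgrams(tokens, 1, 2) adds the bigrams "deep learning",
+// "learning protein", and "protein folding" to those unigrams.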
+
+
+// ============================================================================
+// ┏━╸╻ ┏━┓┏━┓┏━┓╻┏━╸╻┏━╸┏━┓
+// ┃ ┃ ┣━┫┗━┓┗━┓┃┣╸ ┃┣╸ ┣┳┛
+// ┗━╸┗━╸╹ ╹┗━┛┗━┛╹╹ ╹┗━╸╹┗╸
+// ============================================================================
+
+
+// LogisticRegression is a binary classifier trained with L2 regularization.
+// The bias term is stored separately and is not regularized.
+type LogisticRegression struct {
+ LearningRate float64
+ Lambda float64 // L2 regularization parameter
+ Iterations int
+ Tolerance float64 // Convergence tolerance on loss improvement
+}
+
+// Validate checks and clamps hyperparameters to reasonable bounds.
+func (lr *LogisticRegression) Validate() *LogisticRegression {
+ const (
+ defaultLearningRate = 0.5
+ defaultIterations = 500
+ defaultTolerance = 0.000001
+ )
+
+ if lr.LearningRate <= 0 {
+ lr.LearningRate = defaultLearningRate
+ }
+ if lr.LearningRate > 10 {
+ lr.LearningRate = 10.0
+ }
+ if lr.Lambda < 0 {
+ lr.Lambda = 0.0
+ }
+ if lr.Iterations <= 0 {
+ lr.Iterations = defaultIterations
+ }
+ if lr.Tolerance <= 0 {
+ lr.Tolerance = defaultTolerance
+ }
+ return lr
+}
+
+// Fit trains via SGD with L2 regularization on feature weights (not bias).
+// Class weights reweight samples; unused in our pipeline (we downsample instead).
+// Returns weights with bias as last element.
+func (lr *LogisticRegression) Fit(vectors [][]float64, labels []float64, classWeights map[float64]float64) ([]float64, error) {
+ if len(vectors) == 0 {
+ return nil, fmt.Errorf("cannot train on empty dataset")
+ }
+ if len(vectors) != len(labels) {
+ return nil, fmt.Errorf(
+ "mismatch between number of vectors (%d) and labels (%d)",
+ len(vectors),
+ len(labels),
+ )
+ }
+
+ for i, y := range labels {
+ if y != 0 && y != 1 {
+ return nil, fmt.Errorf("invalid label at %d: %v (expected 0 or 1)", i, y)
+ }
+ }
+
+ numFeatures := len(vectors[0])
+ if numFeatures == 0 {
+ return nil, fmt.Errorf("cannot train with zero-length feature vectors")
+ }
+ for i := 1; i < len(vectors); i++ {
+ if len(vectors[i]) != numFeatures {
+ return nil, fmt.Errorf(
+ "inconsistent feature vector length at index %d: got %d, expected %d",
+ i,
+ len(vectors[i]),
+ numFeatures,
+ )
+ }
+ }
+ useUniformWeights := classWeights == nil
+ if useUniformWeights {
+ classWeights = map[float64]float64{0.0: 1.0, 1.0: 1.0}
+ }
+
+ numSamples := float64(len(vectors))
+ var totalWeight float64
+ if useUniformWeights {
+ totalWeight = numSamples
+ } else {
+ for _, y := range labels {
+ totalWeight += classWeights[y]
+ }
+ }
+ if totalWeight == 0 {
+ totalWeight = numSamples // Fallback
+ }
+
+ weights := make([]float64, numFeatures)
+ var bias float64
+
+ prevLoss := math.MaxFloat64
+
+ for i := 0; i < lr.Iterations; i++ {
+ gradWeights := make([]float64, numFeatures)
+ var gradBias float64
+ var currentLoss float64
+
+ for j, x := range vectors {
+ y := labels[j]
+ sampleWeight := classWeights[y]
+
+ z, err := dot(weights, x)
+ if err != nil {
+ return nil, fmt.Errorf("error calculating dot product for vector %d: %w", j, err)
+ }
+ p := Sigmoid(z + bias)
+
+ // Compute prediction error. This term gets multiplied by each feature value
+ // to accumulate gradients (higher error pushes weights harder).
+ errTerm := p - y
+ for k := 0; k < numFeatures; k++ {
+ gradWeights[k] += sampleWeight * errTerm * x[k]
+ }
+ gradBias += sampleWeight * errTerm
+
+ cp := clamp(p)
+ currentLoss += sampleWeight * (-(y*math.Log(cp) + (1-y)*math.Log(1-cp)))
+ }
+
+ // Update weights with L2 regularization (only on feature weights, not bias).
+ // This pulls weights toward zero, preventing overfitting on small datasets.
+ for k := 0; k < numFeatures; k++ {
+ regularizedGrad := (gradWeights[k] / totalWeight) + (lr.Lambda * weights[k])
+ weights[k] -= lr.LearningRate * regularizedGrad
+ }
+ gradBias /= totalWeight
+ bias -= lr.LearningRate * gradBias
+
+ // Check convergence: if loss change is below tolerance, we're done.
+ // We include the L2 penalty in total loss to assess true convergence.
+ avgLoss := currentLoss / totalWeight
+ var l2Penalty float64
+ for _, w := range weights {
+ l2Penalty += w * w
+ }
+ totalLoss := avgLoss + 0.5*lr.Lambda*l2Penalty
+ if math.Abs(prevLoss-totalLoss) < lr.Tolerance {
+ break
+ }
+ prevLoss = totalLoss
+ }
+
+ // bias is stored as the last element
+ return append(weights, bias), nil
+}
+
+// PredictScore computes the probability for a single vector given weights.
+// The last element of weights is the bias.
+func PredictScore(vector []float64, weights []float64) (float64, error) {
+ if len(weights) == 0 {
+ return 0, fmt.Errorf("weights cannot be empty")
+ }
+ if len(vector) != len(weights)-1 {
+ return 0, fmt.Errorf(
+ "vector length mismatch: expected %d features, got %d",
+ len(weights)-1,
+ len(vector),
+ )
+ }
+
+ for i, v := range vector {
+ if math.IsNaN(v) || math.IsInf(v, 0) {
+ return 0, fmt.Errorf("invalid value at vector[%d]: %v", i, v)
+ }
+ }
+ for i, w := range weights {
+ if math.IsNaN(w) || math.IsInf(w, 0) {
+ return 0, fmt.Errorf("invalid value at weights[%d]: %v", i, w)
+ }
+ }
+
+ featureWeights := weights[:len(weights)-1]
+ bias := weights[len(weights)-1]
+
+ z, err := dot(featureWeights, vector)
+ if err != nil {
+ return 0, fmt.Errorf("failed to compute dot product: %w", err)
+ }
+ return Sigmoid(z + bias), nil
+}
+
+
+// ============================================================================
+// ┏┳┓┏━┓╺┳╸╻ ╻┏━┓
+// ┃┃┃┣━┫ ┃ ┣━┫┗━┓
+// ╹ ╹╹ ╹ ╹ ╹ ╹┗━┛
+// ============================================================================
+
+
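+// Sigmoid computes 1/(1+e^-z) in a numerically stable form: branching on
+// the sign of z avoids overflow in math.Exp for large |z|.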
+func Sigmoid(z float64) float64 {
+ if z >= 0 {
+ return 1.0 / (1.0 + math.Exp(-z))
+ }
+ ez := math.Exp(z)
+ return ez / (1.0 + ez)
+}
+
+func dot(a, b []float64) (float64, error) {
+ if len(a) != len(b) {
+ return 0, fmt.Errorf("vector length mismatch: %d != %d", len(a), len(b))
+ }
+ var sum float64
+ for i := range a {
+ sum += a[i] * b[i]
+ }
+ return sum, nil
+}
+
+func clamp(p float64) float64 {
+ const probabilityClamp = 1e-15
+ if p < probabilityClamp {
+ return probabilityClamp
+ }
+ if p > 1.0-probabilityClamp {
+ return 1.0 - probabilityClamp
+ }
+ return p
+}
diff --git a/core/model.go b/core/model.go
new file mode 100644
index 0000000..28f4045
--- /dev/null
+++ b/core/model.go
@@ -0,0 +1,20 @@
+// Model envelope persists trained model to JSON. Contains Vectorizer for IDF values,
+// OrderedVocab for feature ordering, and Weights for logistic regression.
+// To score: recreate TFIDFVectorizer, transform, then PredictScore.
+package core
+
+import (
+ "time"
+)
+
+// ModelEnvelope is the complete trained model for scoring articles.
+type ModelEnvelope struct {
+ Algorithm string `json:"algorithm"`
+ Impl string `json:"impl"`
+ Version string `json:"version"`
+ CreatedAt time.Time `json:"created_at"`
+ Meta map[string]any `json:"meta"`
+ Vectorizer map[string]float64 `json:"vectorizer"`
+ OrderedVocab []string `json:"ordered_vocab"`
+ Weights []float64 `json:"weights"`
+}
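+
+// Scoring sketch (illustrative; assumes a decoded *ModelEnvelope named model
+// and an article title string). Note that CreateVectorizerFromModel restores
+// only the vocabulary; the n-gram settings, if needed, live in model.Meta.
+//
+//	v := CreateVectorizerFromModel(model)
+//	vec := v.Transform([]string{title})[0]
+//	score, err := PredictScore(vec, model.Weights)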
diff --git a/core/scoring.go b/core/scoring.go
new file mode 100644
index 0000000..9896c80
--- /dev/null
+++ b/core/scoring.go
@@ -0,0 +1,14 @@
+// Score conversion utilities.
+//
+// ScoreToScale: Maps probability (0-1) to user-friendly 1-10 scale.
+// Why: Users understand "8/10" better than "0.82 probability".
+package core
+
+import "math"
+
+// ScoreToScale maps a raw probability onto the 1-10 display scale.
+func ScoreToScale(rawScore, threshold float64) int {
+ k := 10.0
+ adjustedScore := 1.0 / (1.0 + math.Exp(-k*(rawScore-threshold)))
+ return int(math.Round(1.0 + (adjustedScore * 9.0)))
+}
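+
+// Example mappings with threshold 0.35: raw 0.05 -> 1/10, raw 0.35 -> 6/10,
+// raw 0.65 -> 10/10; the steep logistic (k=10) spreads scores near the
+// threshold across the whole scale.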
diff --git a/core/text.go b/core/text.go
new file mode 100644
index 0000000..ef4f861
--- /dev/null
+++ b/core/text.go
@@ -0,0 +1,36 @@
+// Text processing for RSS feed content.
+// Used for web UI previews and search indexing, not ML (scoring is title-only).
+package core
+
+import (
+ "regexp"
+ "strings"
+)
+
+// CleanFeedContent strips HTML, normalizes whitespace, truncates to 5KB
+func CleanFeedContent(content string) string {
+ if content == "" {
+ return ""
+ }
+
+ content = StripHTMLTags(content)
+ content = NormalizeSpace(content)
+
+ maxLength := 5000
+ if len(content) > maxLength {
+ content = content[:maxLength] + "..."
+ }
+
+ return content
+}
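+
+// Example: CleanFeedContent("<p>Hi   <b>there</b></p>") returns "Hi there".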
+
+// StripHTMLTags removes HTML tags
+func StripHTMLTags(content string) string {
+ re := regexp.MustCompile(`<[^>]*>`)
+ return re.ReplaceAllString(content, "")
+}
+
+// NormalizeSpace collapses whitespace and trims
+func NormalizeSpace(s string) string {
+ return strings.Join(strings.Fields(strings.TrimSpace(s)), " ")
+}
diff --git a/core/types.go b/core/types.go
new file mode 100644
index 0000000..3bfa311
--- /dev/null
+++ b/core/types.go
@@ -0,0 +1,84 @@
+// Core type definitions for article filtering.
+//
+// Article: Represents paper with metadata, URL, title, optional content.
+//
+// Score, LabelPositive, Classification for ML pipeline state.
+//
+// Config: Application settings (timeouts, user agent, enrich).
+// Command: Interface for CLI subcommands (train, scan, serve).
+package core
+
+import (
+ "io"
+ "time"
+)
+
+// Article represents a single article with enriched metadata and scoring.
+type Article struct {
+ // Basic article information
+ Title string `json:"title"`
+ Content string `json:"content,omitempty"`
+ URL string `json:"url"`
+
+ // Enrichment metadata
+ FetchedAt *time.Time `json:"fetched_at,omitempty"`
+ PublishedAt *time.Time `json:"published_at,omitempty"`
+ Source string `json:"source,omitempty"`
+
+ // Machine learning fields
+ Score *float64 `json:"score,omitempty"`
+ LabelPositive *bool `json:"label_positive,omitempty"`
+ Classification string `json:"classification,omitempty"`
+
+ // Additional metadata
+ Authors []string `json:"authors,omitempty"`
+ Journal string `json:"journal,omitempty"`
+ Year *int `json:"year,omitempty"`
+ DOI string `json:"doi,omitempty"`
+
+ // Raw text extracted from APIs or HTML; may populate Title/Content
+ RawTitle string `json:"raw_title,omitempty"`
+ RawContent string `json:"raw_content,omitempty"`
+}
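+
+// An illustrative positives JSONL line (values are hypothetical; only
+// Title and URL lack omitempty, so they are the minimum useful fields):
+//
+//	{"title":"Example Paper Title","url":"https://example.org/paper"}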
+
+// Config represents the application configuration.
+type Config struct {
+ // Default model and threshold
+ Defaults struct {
+ Model string `json:"model"`
+ Threshold *float64 `json:"threshold"`
+ EventsOut string `json:"events_out"`
+ } `json:"defaults"`
+
+ // HTTP behavior
+ UserAgent string `json:"user_agent"`
+ ContactEmail string `json:"contact_email"`
+
+ // Enrichment settings
+ Enrich struct {
+ MinTitleLength int `json:"min_title_length"`
+ ChunkSize int `json:"chunk_size"`
+ } `json:"enrich"`
+
+ // API provider settings
+ Providers struct {
+ SemanticScholar struct {
+ APIKey string `json:"api_key"`
+ } `json:"semantic_scholar"`
+ } `json:"providers"`
+}
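+
+// An illustrative config document matching the JSON tags above (all values
+// are hypothetical):
+//
+//	{
+//	  "defaults": {"model": "model.json", "threshold": 0.6},
+//	  "user_agent": "scholscan/1.0",
+//	  "enrich": {"min_title_length": 10, "chunk_size": 50},
+//	  "providers": {"semantic_scholar": {"api_key": ""}}
+//	}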
+
+// Command defines the interface that all CLI subcommands must implement.
+type Command interface {
+ // Name returns the command name (e.g., "train", "scan", "serve").
+ Name() string
+
+ // Init parses command-line arguments and initializes the command.
+ // It should return flag.ErrHelp if --help was requested.
+ Init(args []string) error
+
+ // Run executes the command, reading from stdin and writing to stdout.
+ // The command should handle its own error reporting to stderr.
+ Run(stdin io.Reader, stdout io.Writer) error
+}
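+
+// A minimal Command sketch (illustrative only; the real commands live in
+// cmds/). Parse with flag.ContinueOnError so Init surfaces flag.ErrHelp:
+//
+//	type EchoCommand struct{ fs *flag.FlagSet }
+//
+//	func (c *EchoCommand) Name() string { return "echo" }
+//
+//	func (c *EchoCommand) Init(args []string) error {
+//		c.fs = flag.NewFlagSet(c.Name(), flag.ContinueOnError)
+//		return c.fs.Parse(args)
+//	}
+//
+//	func (c *EchoCommand) Run(stdin io.Reader, stdout io.Writer) error {
+//		_, err := io.Copy(stdout, stdin)
+//		return err
+//	}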
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..967c54a
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,19 @@
+module scholscan
+
+go 1.25.1
+
+require (
+ github.com/PuerkitoBio/goquery v1.10.3
+ github.com/mmcdole/gofeed v1.3.0
+)
+
+require (
+ github.com/andybalholm/cascadia v1.3.3 // indirect
+ github.com/json-iterator/go v1.1.12 // indirect
+ github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
+ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+ github.com/modern-go/reflect2 v1.0.2 // indirect
+ github.com/stretchr/testify v1.10.0 // indirect
+ golang.org/x/net v0.43.0 // indirect
+ golang.org/x/text v0.28.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..f82512c
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,96 @@
+github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
+github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
+github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
+github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
+github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
+golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
+golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
+golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
+golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
+golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
+golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
+golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
+golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
+golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
+golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
+golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng=
+golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..eabf06c
--- /dev/null
+++ b/justfile
@@ -0,0 +1,39 @@
+# ScholScan Go Implementation
+
+# Default recipe
+default:
+ @just --list
+
+# Build the binary
+build:
+ go build -o scholscan .
+
+# Install to system (optional)
+install:
+ go install .
+
+# Run tests
+test:
+ go test ./...
+
+# Clean cache (assumes a `clean` subcommand; main.go currently wires up
+# only train, scan, and serve)
+clean-cache:
+ ./scholscan clean
+
+# Format Go code
+fmt:
+ go fmt ./...
+
+# Run linter (requires golangci-lint)
+lint:
+ golangci-lint run
+
+# Example: Train model from articles and RSS feeds (provide your own paths)
+example-train articles feeds:
+ @mkdir -p /tmp/scholscan
+ ./scholscan train {{articles}} --rss-feeds {{feeds}} > /tmp/scholscan/model.json
+ @echo "Model saved to /tmp/scholscan/model.json"
+
+# Example: Scan with trained model (provide your own paths)
+example-scan model url:
+ ./scholscan scan --model {{model}} --url {{url}}
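+
+# Example end-to-end pipeline (paths and the feed URL are placeholders):
+#   just example-train positives.jsonl rss_world.txt
+#   just example-scan /tmp/scholscan/model.json "https://example.org/feed.xml"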
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..d523332
--- /dev/null
+++ b/main.go
@@ -0,0 +1,83 @@
+// scholscan command-line tool.
+// Main entry point: subcommands are implemented in cmds/,
+// shared logic in core/.
+package main
+
+import (
+ "errors"
+ "flag"
+ "fmt"
+ "os"
+ "scholscan/cmds"
+ "scholscan/core"
+)
+
+func main() {
+ if len(os.Args) < 2 {
+ printHelp()
+ os.Exit(1)
+ }
+
+ cmdName := os.Args[1]
+ args := os.Args[2:]
+
+ // handle explicit help requests
+ if cmdName == "help" || cmdName == "--help" || cmdName == "-h" {
+ printHelp()
+ return
+ }
+
+ // map the subcommand name to its implementation
+ var cmd core.Command
+ switch cmdName {
+ case "train":
+ cmd = &cmds.TrainCommand{}
+ case "scan":
+ cmd = &cmds.ScanCommand{}
+ case "serve":
+ cmd = &cmds.ServeCommand{}
+ default:
+ fmt.Fprintf(os.Stderr, "Unknown command: %s\n\n", cmdName)
+ printHelp()
+ os.Exit(1)
+ }
+
+ // init the command, then run it
+ if err := cmd.Init(args); err != nil {
+ if errors.Is(err, flag.ErrHelp) {
+ os.Exit(0)
+ }
+ fmt.Fprintf(os.Stderr, "Error initializing %s command: %v\n", cmdName, err)
+ os.Exit(1)
+ }
+
+ if err := cmd.Run(os.Stdin, os.Stdout); err != nil {
+ fmt.Fprintf(os.Stderr, "Error running %s command: %v\n", cmdName, err)
+ os.Exit(1)
+ }
+}
+
+func printHelp() {
+ fmt.Print(`scholscan <command> [arguments]
+
+A command-line tool for filtering articles based on learned user preferences.
+
+Commands:
+ train Train a model from positives and RSS feeds
+ scan Filter articles using a trained model
+ serve Start HTTP server with filtered RSS and scoring API
+
+Usage:
+ scholscan train POSITIVES_FILE --rss-feeds RSS_FEEDS_FILE > model.json
+ scholscan scan --url RSS_URL --model MODEL > results.jsonl
+ scholscan serve --model MODEL --rss-world RSS_FEEDS_FILE # Start server
+ scholscan serve --title "My Custom ScholScan" # Custom title for web interface
+ scholscan help # Show this help message
+
+Examples:
+ scholscan train positives.jsonl --rss-feeds rss_world.txt > model.json
+ scholscan scan --url "https://feeds.reuters.com/reuters/topNews" --model model.json
+ scholscan serve --port 8080 --model model.json --rss-world rss_world.txt
+
+`)
+}