diff options
| author | Sam Scholten | 2025-12-15 19:34:17 +1000 |
|---|---|---|
| committer | Sam Scholten | 2025-12-15 19:34:59 +1000 |
| commit | 9f5978186ac3de07f4325975fecf4f538fe713b6 (patch) | |
| tree | 41440b703054fe59eb561ba81d80fd60380c1f7a | |
| download | scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.tar.gz scholscan-9f5978186ac3de07f4325975fecf4f538fe713b6.zip | |
Init v0.1.0
| -rw-r--r-- | .gitignore | 33 | ||||
| -rw-r--r-- | Containerfile | 11 | ||||
| -rw-r--r-- | DESIGN.md | 81 | ||||
| -rw-r--r-- | README.md | 37 | ||||
| -rw-r--r-- | cmds/scan.go | 416 | ||||
| -rw-r--r-- | cmds/serve.go | 1010 | ||||
| -rw-r--r-- | cmds/templates/live-feed.html | 158 | ||||
| -rw-r--r-- | cmds/templates/results.html | 279 | ||||
| -rw-r--r-- | cmds/templates/tools.html | 202 | ||||
| -rw-r--r-- | cmds/train.go | 841 | ||||
| -rw-r--r-- | cmds/train_test.go | 66 | ||||
| -rw-r--r-- | core/constants.go | 21 | ||||
| -rw-r--r-- | core/http.go | 196 | ||||
| -rw-r--r-- | core/ml.go | 427 | ||||
| -rw-r--r-- | core/model.go | 20 | ||||
| -rw-r--r-- | core/scoring.go | 14 | ||||
| -rw-r--r-- | core/text.go | 36 | ||||
| -rw-r--r-- | core/types.go | 84 | ||||
| -rw-r--r-- | go.mod | 19 | ||||
| -rw-r--r-- | go.sum | 96 | ||||
| -rw-r--r-- | justfile | 39 | ||||
| -rw-r--r-- | main.go | 83 |
22 files changed, 4169 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b57a04a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,33 @@
+scholscan
+
+*.exe
+*.exe~
+*.dll
+*.so
+*.dylib
+
+*.test
+
+*.out
+
+go.work
+
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+.DS_Store
+Thumbs.db
+
+data/
+!README.md
+
+*.log
+
+.env*
+config.*
+secrets.*
+# RSS world data file
+rss_world.txt
+*.kate-swp
diff --git a/Containerfile b/Containerfile
new file mode 100644
index 0000000..58f011f
--- /dev/null
+++ b/Containerfile
@@ -0,0 +1,11 @@
+# Copy & customize: mount model.json and rss_world.txt, set --title flag as needed
+FROM golang:1.25-alpine AS builder
+RUN apk add --no-cache git
+WORKDIR /build
+RUN git clone https://your-git-repo-url/scholscan.git .
+RUN go build -o scholscan .
+
+FROM alpine:latest
+COPY --from=builder /build/scholscan /app/scholscan
+WORKDIR /app
+ENTRYPOINT ["/app/scholscan"]
diff --git a/DESIGN.md b/DESIGN.md
new file mode 100644
index 0000000..dba3394
--- /dev/null
+++ b/DESIGN.md
@@ -0,0 +1,81 @@
+Scholscan Design
+=================
+
+Article filter that learns from positive examples, then filters RSS feeds automatically. Classifier uses TF-IDF on article titles plus logistic regression: fast, with no content scraping needed.
+
+Code Structure
+---------------
+
+main.go - Entry point, validates commands, dispatches
+
+cmds/
+  train.go - Load positive articles, fetch RSS as negatives, train model, output JSON
+  scan.go - Fetch articles from RSS, score with model, output filtered results
+  serve.go - HTTP server with background feed refresh, embedded web UI, RSS output
+
+core/
+  types.go - Article struct holds article data, Config struct for app settings, Command interface
+  ml.go - TF-IDF implementation with n-gram support, logistic regression classifier
+  model.go - ModelEnvelope for serialized models, model save/load functions
+  scoring.go - Score conversion from raw 0-1 to display 1-10 scale
+  text.go - HTML content extraction, word tokenization, text cleaning
+  http.go - HTTP client with retries, timeouts, user agents
+  constants.go - Default timeouts, thresholds, chunk sizes
+
+Training Flow
+-------------
+
+Command loads positive examples from JSONL file. Reads RSS URLs from text file (one per line, # comments allowed). Fetches RSS feeds in parallel, removes any articles matching positive URLs. Trains TF-IDF vectorizer then logistic regression on balanced dataset. Finds optimal threshold on validation split using Youden's J metric (sketched below, after Server Flow). Outputs complete model JSON to stdout.
+
+Scanning Flow
+-------------
+
+Command fetches specified RSS feed, scores each article using trained model. Articles scoring above threshold are output as JSON Lines (same format as input). Includes enrichment metadata if available. Verbose mode shows fetch and scoring progress to stderr.
+
+Server Flow
+-----------
+
+Server loads model and RSS world feed list on startup. Background goroutine refreshes all feeds at a configurable interval (default 24h). Results cached in memory with RWMutex. HTTP handlers serve both HTML UI and JSON/RSS API endpoints.
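+
+The threshold search lives in cmds/train.go (diff further down this page); as a rough sketch of the Youden's J selection, with illustrative names and grid step rather than the exact implementation:
+
+```
+import "math"
+
+// bestThreshold sweeps candidate thresholds over the validation split and
+// keeps the one maximizing Youden's J = sensitivity + specificity - 1.
+func bestThreshold(scores []float64, labels []bool) float64 {
+	best, bestJ := 0.5, -1.0
+	for t := 0.01; t < 1.0; t += 0.01 {
+		var tp, fp, tn, fn float64
+		for i, s := range scores {
+			switch {
+			case s >= t && labels[i]:
+				tp++
+			case s >= t && !labels[i]:
+				fp++
+			case s < t && labels[i]:
+				fn++
+			default:
+				tn++
+			}
+		}
+		sens := tp / math.Max(tp+fn, 1) // true positive rate, guarded against /0
+		spec := tn / math.Max(tn+fp, 1) // true negative rate, guarded against /0
+		if j := sens + spec - 1; j > bestJ {
+			bestJ, best = j, t
+		}
+	}
+	return best
+}
+```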
+
+API Endpoints
+-------------
+
+### HTML Pages
+- GET `/` - Redirect to /live-feed
+- GET `/live-feed` - Filtered articles web interface (server-rendered)
+- GET `/tools` - Manual article scoring interface (server-rendered)
+
+### JSON and Form APIs
+- GET `/api/filtered/feed` - Articles as JSON array (for external consumption)
+- GET `/api/health` - Health check returns {"status":"ok"}
+- POST `/score` - Score single article via form post
+- POST `/scan` - Scan RSS feed via form post
+
+### RSS Output
+- GET `/api/filtered/rss` - Scored articles as RSS feed
+
+Model Details
+-------------
+
+Vectorizer uses unigrams plus bigrams. Minimum document frequency 2 (removes typos), maximum 80% (removes stopwords). Vocabulary capped at 50000 terms. Logistic regression with L2 regularization lambda=0.001, learning rate 0.5, 500 iterations. Validation split 80/20 with seed 42 for reproducible results. Threshold selected using Youden's J to balance false positives against false negatives.
+
+Server Implementation
+---------------------
+
+HTML templates embedded in binary using embed.FS. All rendering is server-side with no JavaScript. Tools page uses standard HTML forms with POST submissions. Live feed displays cached background results with server-side rendering. Background refresh runs in a single goroutine that fetches feeds sequentially, skipping any feed that fails. Results cached in memory with a single last-update timestamp. RSS output repackages filtered articles into RSS format for consumption.
+
+Key Implementation Notes
+------------------------
+
+- Articles processed in 50-item chunks for memory efficiency
+- File paths validated against directory traversal attacks
+- HTTP requests use custom polite user agent with email contact
+- RSS parsing handles both RSS and Atom via gofeed library
+- TF-IDF vectorizer stores vocabulary as sorted string array for deterministic ordering
+- Model version field allows future format changes
+- Background refresh errors logged but don't crash server
+
+External Dependencies
+---------------------
+
+gofeed (github.com/mmcdole/gofeed) for RSS/Atom parsing, goquery (github.com/PuerkitoBio/goquery) for HTML title extraction. All other functionality uses the Go standard library.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..870bf34
--- /dev/null
+++ b/README.md
@@ -0,0 +1,37 @@
+# Scholscan
+
+Filters academic articles using TF-IDF on titles plus logistic regression.
+
+## Build
+```
+go build -o scholscan .
+```
+
+## Usage
+```
+# Train model from articles you like
+./scholscan train positives.jsonl --rss-feeds feeds.txt > model.json
+
+# Score new RSS feed
+./scholscan scan --url RSS_URL --model model.json > results.jsonl
+
+# Run web server
+./scholscan serve --port 8080 --model model.json --rss-world rss_world.txt
+```
+
+## Endpoints
+
+- GET `/` - redirect to live feed
+- GET `/live-feed` - filtered articles web UI
+- GET `/tools` - score individual articles
+- POST `/score` - API for scoring titles
+- POST `/scan` - API for scanning RSS
+- GET `/api/filtered/feed` - JSON feed
+- GET `/api/filtered/rss` - RSS feed
+- GET `/api/health` - health check
+
+## Model settings
+
+- TF-IDF: unigrams + bigrams, MinDF=2, MaxDF=0.8
+- Logistic regression: λ=0.001, L2 regularization
+- Class balancing: downsample majority to 1:1 ratio
\ No newline at end of file diff --git a/cmds/scan.go b/cmds/scan.go new file mode 100644 index 0000000..789157c --- /dev/null +++ b/cmds/scan.go @@ -0,0 +1,416 @@ +// Scan command: filters articles using trained model. +// +// takes articles from RSS feed, text, or JSONL. Scores & outputs those passing. +// Batches processing (default 50) to allow continuous streaming. +package cmds + +import ( + "bufio" + "context" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "net/http" + "net/url" + "os" + "path/filepath" + "strings" + + "github.com/mmcdole/gofeed" + "scholscan/core" +) + + +// ============================================================================ +// ┏━╸┏━┓┏┳┓┏┳┓┏━┓┏┓╻╺┳┓ +// ┃ ┃ ┃┃┃┃┃┃┃┣━┫┃┗┫ ┃┃ +// ┗━╸┗━┛╹ ╹╹ ╹╹ ╹╹ ╹╺┻┛ +// ============================================================================ + + +// scores articles with trained model and outputs filtered results above thresh +type ScanCommand struct { + URL string + FromText bool + FromArticles bool + + ModelPath string + Threshold string + + MinTitleLength int + ChunkSize int + + EventsOut string + MetricsOut string + Verbose bool +} + +func (c *ScanCommand) Name() string { return "scan" } + +func (c *ScanCommand) Init(args []string) error { + fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError) + fs.Usage = func() { + fmt.Fprint(fs.Output(), `Usage: scholscan scan [options] + +Fetches articles, scores with model, outputs matched (>thresh) ones. + +Source options (exactly one required): + --url <feed_url> Fetch articles from RSS/Atom feed + --from-text Extract URLs from text on stdin + --from-articles Use Article JSONL from stdin directly + +Model and filtering: + --model <path> Path to trained model JSON file (required) + --threshold <float> Score threshold (if not provided, uses model's recommended threshold) + +Enrichment options: +`) + fs.PrintDefaults() + fmt.Fprint(fs.Output(), ` +Examples: + scholscan scan --url "http://some.blog/rss.xml" --model model.json > interesting.jsonl + echo "see https://example.com" | scholscan scan --from-text --model model.json + cat articles.jsonl | scholscan scan --from-articles --model model.json +`) + } + + fs.StringVar(&c.URL, "url", "", "RSS/Atom feed URL to fetch") + fs.BoolVar(&c.FromText, "from-text", false, "Extract URLs from text on stdin") + fs.BoolVar(&c.FromArticles, "from-articles", false, "Use Article JSONL from stdin") + fs.StringVar(&c.ModelPath, "model", "", "Path to trained model JSON file (required)") + fs.StringVar(&c.Threshold, "threshold", "", "Score threshold for filtering (if not provided, uses model's recommended threshold)") + fs.IntVar(&c.MinTitleLength, "min-title-length", core.MinTitleLength, "Minimum title length to consider valid") + fs.IntVar(&c.ChunkSize, "chunk-size", core.DefaultChunkSize, "Number of articles to process in each batch") + fs.StringVar(&c.EventsOut, "events-out", "events.jsonl", "Write per-article events to a JSONL file") + fs.StringVar(&c.MetricsOut, "metrics-out", "metrics.json", "Write summary metrics to a JSON file") + fs.BoolVar(&c.Verbose, "verbose", false, "Show progress information") + + if err := fs.Parse(args); err != nil { + return err + } + + if fs.NArg() != 0 { + return fmt.Errorf("unexpected arguments provided: %v", fs.Args()) + } + + // one src opt required + sourceCount := 0 + if c.URL != "" { + sourceCount++ + } + if c.FromText { + sourceCount++ + } + if c.FromArticles { + sourceCount++ + } + + if sourceCount == 0 { + return fmt.Errorf("exactly one source option must be specified: --url, --from-text, or 
--from-articles") + } + if sourceCount > 1 { + return fmt.Errorf("only one source option may be specified: --url, --from-text, or --from-articles") + } + + if c.ModelPath == "" { + return fmt.Errorf("--model flag is required") + } + + // prevent dir traversal + if strings.Contains(filepath.Clean(c.ModelPath), "..") { + return fmt.Errorf("invalid model path: directory traversal not allowed") + } + + if c.URL != "" { + if _, err := url.Parse(c.URL); err != nil { + return fmt.Errorf("invalid URL format: %w", err) + } + } + + return nil +} + +// Run runs the scan: load the model, decide on a threshold, get articles, then score them in chunks. +// We bail out early on config problems but try to keep going even if some articles fail to fetch. +func (c *ScanCommand) Run(stdin io.Reader, stdout io.Writer) error { + if c.Verbose { + log.SetOutput(os.Stderr) + log.Println("Starting scan workflow...") + log.Printf("Source: %v", c.getSourceDescription()) + log.Printf("Model: %s", c.ModelPath) + } + + model, err := c.loadModel() + if err != nil { + return fmt.Errorf("failed to load model: %w", err) + } + + threshold, err := c.getThreshold(model) + if err != nil { + return fmt.Errorf("failed to determine threshold: %w", err) + } + + if c.Verbose { + log.Printf("Using threshold: %.3f", threshold) + } + + var articles []*core.Article + if c.FromArticles { + articles, err = c.readArticlesFromStdin(stdin) + } else { + articles, err = c.fetchArticles() + } + if err != nil { + return fmt.Errorf("failed to get articles: %w", err) + } + + if c.Verbose { + log.Printf("Processing %d articles", len(articles)) + } + + // process articles in chunks + return c.processArticles(articles, model, threshold, stdout, stdin) +} + + +// ============================================================================ +// ┏┳┓┏━┓╺┳┓┏━╸╻ ┏┓ ┏━╸┏━┓┏┓╻┏━╸╻┏━╸ +// ┃┃┃┃ ┃ ┃┃┣╸ ┃ ┃╺╋╸ ┃ ┃ ┃┃┗┫┣╸ ┃┃╺┓ +// ╹ ╹┗━┛╺┻┛┗━╸┗━╸ ┗━┛ ┗━╸┗━┛╹ ╹╹ ╹┗━┛ +// ============================================================================ + + + +func (c *ScanCommand) getSourceDescription() string { + if c.URL != "" { + return fmt.Sprintf("RSS feed: %s", c.URL) + } + if c.FromText { + return "text from stdin" + } + if c.FromArticles { + return "articles from stdin" + } + return "unknown" +} + +// loadModel reads and parses the model JSON file. +// The envelope contains weights, vocabulary, and optionally a recommended threshold. 
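+// Note: encoding/json ignores unknown fields, so model files carrying extra metadata still decode cleanly here.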
+func (c *ScanCommand) loadModel() (*core.ModelEnvelope, error) { + f, err := os.Open(c.ModelPath) + if err != nil { + return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err) + } + defer f.Close() + + var model core.ModelEnvelope + if err := json.NewDecoder(f).Decode(&model); err != nil { + return nil, fmt.Errorf("failed to decode model: %w", err) + } + + return &model, nil +} + +func (c *ScanCommand) getThreshold(model *core.ModelEnvelope) (float64, error) { + if c.Threshold != "" { + var threshold float64 + _, err := fmt.Sscanf(c.Threshold, "%f", &threshold) + if err == nil { + return threshold, nil + } + } + + if model.Meta != nil { + if meta, ok := model.Meta["recommended_threshold"].(float64); ok { + return meta, nil + } + } + + return core.DefaultScoreThreshold, nil +} + +// ============================================================================ +// ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸ ┏━┓┏━┓┏━╸┏━┓ +// ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓┣┳┛┃ ┗━┓ +// ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸ ┗━┛╹┗╸┗━╸┗━┛ +// ============================================================================ + + +func (c *ScanCommand) fetchArticles() ([]*core.Article, error) { + if c.FromText { + return c.extractURLsFromText(os.Stdin) + } + if c.URL != "" { + return c.fetchRSSFeed(c.URL) + } + return nil, fmt.Errorf("no valid source specified") +} + +// extractURLsFromText pulls URLs from plain text on stdin. +// We create minimal Article objects since only the URL is needed for scoring. +func (c *ScanCommand) extractURLsFromText(stdin io.Reader) ([]*core.Article, error) { + var urls []string + s := bufio.NewScanner(stdin) + for s.Scan() { + line := s.Text() + // url extraction + fields := strings.Fields(line) + for _, field := range fields { + if strings.HasPrefix(field, "http://") || strings.HasPrefix(field, "https://") { + urls = append(urls, field) + } + } + } + + // create Article objs for URLs + articles := make([]*core.Article, len(urls)) + for i, url := range urls { + articles[i] = &core.Article{ + URL: url, + Title: fmt.Sprintf("Article from %s", url), + Content: "", + } + } + + return articles, s.Err() +} + +// fetchRSSFeed fetches and parses a single RSS feed with a 30s timeout. +// We skip articles with short titles since they're usually noise or truncated. 
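+// Note: both the client timeout and the request context are bounded by core.DefaultHTTPTimeout; whichever expires first cancels the fetch.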
+func (c *ScanCommand) fetchRSSFeed(url string) ([]*core.Article, error) { + client := &http.Client{Timeout: core.DefaultHTTPTimeout} + + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, fmt.Errorf("error building request: %w", err) + } + req.Header.Set("User-Agent", core.PoliteUserAgent) + + ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout) + defer cancel() + + resp, err := client.Do(req.WithContext(ctx)) + if err != nil { + return nil, fmt.Errorf("error fetching %s: %w", url, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("error reading response from %s: %w", url, err) + } + + // parse feed + fp := gofeed.NewParser() + feed, err := fp.Parse(strings.NewReader(string(body))) + if err != nil { + return nil, fmt.Errorf("error parsing feed from %s: %w", url, err) + } + + var articles []*core.Article + for _, item := range feed.Items { + article := &core.Article{ + URL: item.Link, + Title: strings.TrimSpace(item.Title), + } + + if len(article.Title) >= c.MinTitleLength { + articles = append(articles, article) + } + } + + return articles, nil +} + +// readArticlesFromStdin reads Article objects from JSONL on stdin. +// Malformed lines are skipped to allow partial processing of corrupted input. +func (c *ScanCommand) readArticlesFromStdin(stdin io.Reader) ([]*core.Article, error) { + var articles []*core.Article + decoder := json.NewDecoder(stdin) + for { + var article core.Article + if err := decoder.Decode(&article); err != nil { + if err == io.EOF { + break + } + continue + } + + if len(article.Title) >= c.MinTitleLength { + articles = append(articles, &article) + } + } + return articles, nil +} + + + +// ============================================================================ +// ┏━┓┏━┓┏━┓┏━╸┏━╸┏━┓┏━┓ ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸┏━┓ +// ┣━┛┣┳┛┃ ┃┃ ┣╸ ┗━┓┗━┓ ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓ +// ╹ ╹┗╸┗━┛┗━╸┗━╸┗━┛┗━┛ ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸┗━┛ +// ============================================================================ + + +// processArticles handles scoring and filtering in batches to keep memory usage predictable. +// Scoring errors don't crash the process - we log them and continue with the next article. 
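+// Matches are encoded to stdout as each chunk finishes, so downstream consumers start seeing results before the full scan completes.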
+func (c *ScanCommand) processArticles(articles []*core.Article, model *core.ModelEnvelope, threshold float64, stdout io.Writer, stdin io.Reader) error { + vectorizer := core.CreateVectorizerFromModel(model) + + encoder := json.NewEncoder(stdout) + + // process each batch + for i := 0; i < len(articles); i += c.ChunkSize { + end := i + c.ChunkSize + if end > len(articles) { + end = len(articles) + } + + chunk := articles[i:end] + if c.Verbose { + log.Printf("Processing chunk %d-%d of %d articles", i+1, end, len(articles)) + } + + // calc score for batch + docs := make([]string, len(chunk)) + for j, article := range chunk { + docs[j] = strings.TrimSpace(article.Title) + } + + vectors := vectorizer.Transform(docs) + scores := make([]float64, len(chunk)) + + for j, vector := range vectors { + score, err := core.PredictScore(vector, model.Weights) + if err != nil { + log.Printf("Error computing score for article %d: %v", i+j, err) + scores[j] = 0.0 + } else { + scores[j] = score + } + } + + for j, article := range chunk { + score := scores[j] + article.Score = &score + + if score >= threshold { + if err := encoder.Encode(article); err != nil { + log.Printf("Error encoding article: %v", err) + } + } + } + } + + if c.Verbose { + log.Println("Scan complete") + } + + return nil +} diff --git a/cmds/serve.go b/cmds/serve.go new file mode 100644 index 0000000..92aa64c --- /dev/null +++ b/cmds/serve.go @@ -0,0 +1,1010 @@ +// Serve command: HTTP server for web UI and APIs. +// +// Two main flows: live-feed (cached + background refresh) and tools (on-demand scoring). +// Live-feed rescans all configured RSS feeds on a timer (default 24h), caches results, +// serves filtered articles via web UI and JSON/RSS APIs. +// Tools provides real-time /score (single title) and /scan (ad-hoc feed) endpoints. +// Background refresh continues despite individual feed failures; RWMutex allows +// many concurrent readers with exclusive writer updates. +// Templates are embedded for single-binary deployment. +package cmds + +import ( + "bufio" + "context" + "embed" + "encoding/json" + "flag" + "fmt" + "html/template" + "io" + "log" + "net/http" + "net/url" + "os" + "os/signal" + "path/filepath" + "regexp" + "sort" + "strings" + "sync" + "syscall" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/mmcdole/gofeed" + "scholscan/core" +) + +//go:embed templates/*.html +var templateFS embed.FS + +// ============================================================================ +// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓ +// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃ +// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛ +// ============================================================================ + +type ServeCommand struct { + Port int + RSSWorldPath string + RefreshInterval string + ModelPath string + Title string + + // Parsed interval + refreshInterval time.Duration + // Loaded model (cached) + model *core.ModelEnvelope + modelMu sync.RWMutex + // Cached filtered RSS results and timestamp. + // RWMutex allows many concurrent readers (HTTP handlers) with exclusive writer (background refresh). + filteredResults []*core.Article + filteredResultsTime time.Time + resultsMu sync.RWMutex + // Loaded templates + tmpl *template.Template +} + +func (c *ServeCommand) Name() string { return "serve" } + +// Init configures the serve command with robust input validation. +// Prevents directory traversal, validates paths, and sets sensible defaults. +// Ensures only one configuration is possible to reduce runtime complexity. 
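+// All flags have working defaults, so "scholscan serve" runs as-is when model.json and rss_world.txt are in the working directory.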
+func (c *ServeCommand) Init(args []string) error { + fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError) + fs.Usage = func() { + fmt.Fprint(fs.Output(), `Usage: scholscan serve [options] + + Start HTTP server for filtered RSS and scoring web UI. + + Flags: + `) + fs.PrintDefaults() + fmt.Fprint(fs.Output(), ` + Examples: + scholscan serve --port 8080 --rss-world rss_world.txt --model model.json + scholscan serve --refresh-interval 24h --model ./model.json --rss-world feeds.txt + `) + } + + fs.IntVar(&c.Port, "port", 8080, "Port to listen on") + fs.StringVar(&c.RSSWorldPath, "rss-world", "rss_world.txt", "Path to RSS world file (one feed URL per line)") + fs.StringVar(&c.RefreshInterval, "refresh-interval", "24h", "Interval for background rescans (e.g., 24h, 1h)") + fs.StringVar(&c.ModelPath, "model", "model.json", "Path to trained model JSON file") + fs.StringVar(&c.Title, "title", "", "Custom title for the web interface") + + if err := fs.Parse(args); err != nil { + return err + } + + if fs.NArg() != 0 { + return fmt.Errorf("unexpected arguments provided: %v", fs.Args()) + } + + // Parse refresh interval + interval, err := time.ParseDuration(c.RefreshInterval) + if err != nil { + return fmt.Errorf("invalid refresh-interval %q: %w", c.RefreshInterval, err) + } + c.refreshInterval = interval + + if strings.Contains(filepath.Clean(c.RSSWorldPath), "..") { + return fmt.Errorf("invalid rss-world path: directory traversal not allowed") + } + if strings.Contains(filepath.Clean(c.ModelPath), "..") { + return fmt.Errorf("invalid model path: directory traversal not allowed") + } + + return nil +} + +func (c *ServeCommand) Run(stdin io.Reader, stdout io.Writer) error { + log.Printf("Starting scholscan server on port %d", c.Port) + + // Initialize filteredResultsTime to server start time + c.resultsMu.Lock() + c.filteredResultsTime = time.Now() + c.resultsMu.Unlock() + + // Load templates at startup + tmpl, err := template.ParseFS(templateFS, "templates/*.html") + if err != nil { + return fmt.Errorf("failed to parse templates: %w", err) + } + c.tmpl = tmpl + log.Printf("Templates loaded successfully") + + // Load model at startup + model, err := c.loadModel() + if err != nil { + return fmt.Errorf("failed to load model at startup: %w", err) + } + c.modelMu.Lock() + c.model = model + c.modelMu.Unlock() + + log.Printf("Model loaded successfully") + + // Start background ticker for periodic refresh + ticker := time.NewTicker(c.refreshInterval) + go c.backgroundRefresh(ticker) + + // Perform initial scan asynchronously + go func() { + log.Println("Starting initial feed scan...") + if err := c.refreshFilteredResults(); err != nil { + log.Printf("Warning: initial scan failed: %v", err) + } else { + c.resultsMu.RLock() + count := len(c.filteredResults) + c.resultsMu.RUnlock() + log.Printf("Initial scan complete, %d articles filtered", count) + } + }() + + // Setup HTTP handlers + http.HandleFunc("/", c.handleRoot) + http.HandleFunc("/live-feed", c.handleLiveFeed) + http.HandleFunc("/tools", c.handleTools) + http.HandleFunc("/score", c.handleScore) + http.HandleFunc("/scan", c.handleScan) + http.HandleFunc("/api/filtered/feed", c.handleFilteredFeed) + http.HandleFunc("/api/filtered/rss", c.handleFilteredRSS) + http.HandleFunc("/api/health", c.handleHealth) + + // Setup server with graceful shutdown + server := &http.Server{ + Addr: fmt.Sprintf(":%d", c.Port), + Handler: http.DefaultServeMux, + ReadTimeout: core.DefaultReadTimeout, + WriteTimeout: core.DefaultWriteTimeout, + IdleTimeout: 
core.DefaultIdleTimeout, + } + + // Handle shutdown signals + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + go func() { + <-sigChan + log.Println("Shutdown signal received") + ticker.Stop() + ctx, cancel := context.WithTimeout(context.Background(), core.DefaultShutdownTimeout) + defer cancel() + if err := server.Shutdown(ctx); err != nil { + log.Printf("Server shutdown error: %v", err) + } + }() + + log.Printf("Server listening on http://localhost:%d", c.Port) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + return fmt.Errorf("server error: %w", err) + } + + return nil +} + +// ============================================================================ +// ┏━╸┏━┓┏━┓┏━╸ ╻ ┏━┓┏━╸╻┏━╸ +// ┃ ┃ ┃┣┳┛┣╸ ┃ ┃ ┃┃╺┓┃┃ +// ┗━╸┗━┛╹┗╸┗━╸ ┗━╸┗━┛┗━┛╹┗━╸ +// ============================================================================ + +func (c *ServeCommand) loadModel() (*core.ModelEnvelope, error) { + f, err := os.Open(c.ModelPath) + if err != nil { + return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err) + } + defer f.Close() + + var model core.ModelEnvelope + if err := json.NewDecoder(f).Decode(&model); err != nil { + return nil, fmt.Errorf("failed to decode model: %w", err) + } + + return &model, nil +} + +func (c *ServeCommand) scoreArticle(article *core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope) float64 { + docs := []string{strings.TrimSpace(article.Title)} + vectors := vectorizer.Transform(docs) + + if len(vectors) == 0 || len(vectors[0]) == 0 { + return 0.0 + } + + score, err := core.PredictScore(vectors[0], model.Weights) + if err != nil { + // Return 0.0 on error (below threshold). Malformed articles don't break the display, + // they just get filtered out. Log the error for diagnostics. + log.Printf("Error scoring article: %v", err) + return 0.0 + } + + return score +} + +func (c *ServeCommand) getThreshold(model *core.ModelEnvelope) (float64, error) { + if model.Meta != nil { + if threshold, ok := model.Meta["recommended_threshold"].(float64); ok { + return threshold, nil + } + } + return core.DefaultScoreThreshold, nil +} + +// scoreAndFormatArticles scores a list of articles and returns them formatted for templates. +// Articles are scored using the model and vectorizer, then returned with human-readable ratings. 
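+// The maps use template-friendly keys ("Title", "URL", "Source", "Rating", "Score") shared by the HTML results page and the scan handler's sorting.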
+func (c *ServeCommand) scoreAndFormatArticles(articles []*core.Article, vectorizer *core.TFIDFVectorizer, model *core.ModelEnvelope, threshold float64) []map[string]interface{} { + type ArticleResponse struct { + Title string `json:"title"` + URL string `json:"url"` + Source string `json:"source,omitempty"` + Rating int `json:"rating"` + Score float64 `json:"score"` + } + + scored := make([]ArticleResponse, 0, len(articles)) + for _, article := range articles { + score := c.scoreArticle(article, vectorizer, model) + rating := core.ScoreToScale(score, threshold) + + scored = append(scored, ArticleResponse{ + Title: article.Title, + URL: article.URL, + Source: article.Source, + Rating: rating, + Score: score, + }) + } + + result := make([]map[string]interface{}, len(scored)) + for i, a := range scored { + result[i] = map[string]interface{}{ + "Title": a.Title, + "URL": a.URL, + "Source": a.Source, + "Rating": a.Rating, + "Score": a.Score, + } + } + return result +} + +// ============================================================================ +// ┏━┓┏━┓┏━┓ ┏━┓╺┳╸╻ ╻┏━╸┏━╸ +// ┣┳┛┗━┓┗━┓ ┗━┓ ┃ ┃ ┃┣╸ ┣╸ +// ╹┗╸┗━┛┗━┛ ┗━┛ ╹ ┗━┛╹ ╹ +// ============================================================================ + +func (c *ServeCommand) readRSSWorldFeeds() ([]string, error) { + f, err := os.Open(c.RSSWorldPath) + if err != nil { + return nil, fmt.Errorf("failed to open rss_world file %s: %w", c.RSSWorldPath, err) + } + defer f.Close() + + var feeds []string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" && !strings.HasPrefix(line, "#") { + feeds = append(feeds, line) + } + } + + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading rss_world file: %w", err) + } + + return feeds, nil +} + +func (c *ServeCommand) refreshFilteredResults() error { + feeds, err := c.readRSSWorldFeeds() + if err != nil { + return err + } + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + return fmt.Errorf("model not loaded") + } + + // Scan all feeds. Continue on individual feed failures to maximize results. + // RSS feeds are often flaky; one down shouldn't prevent others from being processed. + var allArticles []*core.Article + for _, feed := range feeds { + articles, err := c.fetchRSSFeed(feed) + if err != nil { + log.Printf("Warning: failed to fetch feed %s: %v", feed, err) + continue + } + allArticles = append(allArticles, articles...) + } + + // Score and filter articles + threshold, err := c.getThreshold(model) + if err != nil { + return err + } + + vectorizer := core.CreateVectorizerFromModel(model) + + filtered := make([]*core.Article, 0, len(allArticles)) + for _, article := range allArticles { + score := c.scoreArticle(article, vectorizer, model) + if score >= threshold { + // Create a copy with score to avoid reference issues + articleCopy := *article + articleCopy.Score = &score + filtered = append(filtered, &articleCopy) + } + } + + c.resultsMu.Lock() + c.filteredResults = filtered + c.filteredResultsTime = time.Now() + c.resultsMu.Unlock() + + return nil +} + +// backgroundRefresh runs in a goroutine, rescanning all RSS feeds on interval. +// Failures in individual feeds don't affect others - we log and continue. 
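+// Note: ticker.Stop (called at shutdown) does not close ticker.C, so this loop simply stops receiving ticks; the process exits shortly after.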
+func (c *ServeCommand) backgroundRefresh(ticker *time.Ticker) { + for range ticker.C { + log.Println("Background refresh started") + if err := c.refreshFilteredResults(); err != nil { + log.Printf("Background refresh error (continuing): %v", err) + } else { + c.resultsMu.RLock() + count := len(c.filteredResults) + c.resultsMu.RUnlock() + log.Printf("Background refresh complete, %d articles filtered", count) + } + } +} + +func (c *ServeCommand) fetchRSSFeed(url string) ([]*core.Article, error) { + client := &http.Client{Timeout: core.DefaultHTTPTimeout} + + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, fmt.Errorf("error building request: %w", err) + } + req.Header.Set("User-Agent", core.PoliteUserAgent) + + ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout) + defer cancel() + + resp, err := client.Do(req.WithContext(ctx)) + if err != nil { + return nil, fmt.Errorf("error fetching %s: %w", url, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("error reading response from %s: %w", url, err) + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(strings.NewReader(string(body))) + if err != nil { + return nil, fmt.Errorf("error parsing feed from %s: %w", url, err) + } + + var articles []*core.Article + for _, item := range feed.Items { + article := &core.Article{ + URL: item.Link, + Title: strings.TrimSpace(item.Title), + Source: feed.Title, + } + + if item.PublishedParsed != nil { + article.PublishedAt = item.PublishedParsed + } + + if len(article.Title) >= core.MinTitleLength { + articles = append(articles, article) + } + } + + return articles, nil +} + +// ============================================================================ +// ╻ ╻┏━╸┏┓ ╻ ╻╻ +// ┃╻┃┣╸ ┣┻┓ ┃ ┃┃ +// ┗┻┛┗━╸┗━┛ ┗━┛╹ +// ============================================================================ + +func (c *ServeCommand) handleRoot(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + + // Redirect to live feed + http.Redirect(w, r, "/live-feed", http.StatusMovedPermanently) +} + +func (c *ServeCommand) handleLiveFeed(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.resultsMu.RLock() + articles := c.filteredResults + resultsTime := c.filteredResultsTime + c.resultsMu.RUnlock() + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + http.Error(w, "Model not loaded", http.StatusInternalServerError) + return + } + + threshold, _ := c.getThreshold(model) + + // Parse filter parameter (day, week, or all) + filter := r.URL.Query().Get("filter") + if filter == "" { + filter = "all" + } + + // Filter articles by date if needed + now := time.Now() + filtered := articles + if filter == "day" || filter == "week" { + var cutoff time.Time + if filter == "day" { + cutoff = now.Add(-24 * time.Hour) + } else if filter == "week" { + cutoff = now.Add(-7 * 24 * time.Hour) + } + + filtered = make([]*core.Article, 0, len(articles)) + for _, article := range articles { + // Always include articles without PublishedAt + if article.PublishedAt == nil || article.PublishedAt.After(cutoff) { + filtered = append(filtered, article) + } + } + } + + // Convert articles to template format + type TemplateArticle struct { + 
Title string + URL string + Source string + Rating int + Score float64 + PublishedAt string + } + + templateArticles := make([]TemplateArticle, 0, len(filtered)) + for _, article := range filtered { + score := 0.0 + if article.Score != nil { + score = *article.Score + } + rating := core.ScoreToScale(score, threshold) + + publishedAt := "" + if article.PublishedAt != nil { + publishedAt = article.PublishedAt.Format("2006-01-02") + } + + templateArticles = append(templateArticles, TemplateArticle{ + Title: article.Title, + URL: article.URL, + Source: article.Source, + Rating: rating, + Score: score, + PublishedAt: publishedAt, + }) + } + + // Sort articles by score (highest first) + sort.Slice(templateArticles, func(i, j int) bool { + return templateArticles[i].Score > templateArticles[j].Score + }) + + data := map[string]interface{}{ + "Page": "live-feed", + "Articles": templateArticles, + "Threshold": threshold, + "UpdatedAt": resultsTime.Format("2006-01-02 15:04:05"), + "Filter": filter, + "Title": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "live-feed", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleTools(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + data := map[string]interface{}{ + "Page": "tools", + "Title": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "tools", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleScore(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet { + c.handleTools(w, r) + return + } + + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + http.Error(w, "Model not loaded", http.StatusInternalServerError) + return + } + + if err := r.ParseForm(); err != nil { + http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest) + return + } + + title := strings.TrimSpace(r.FormValue("title")) + url := strings.TrimSpace(r.FormValue("url")) + + // If URL provided, fetch and extract title from it; otherwise use provided title. 
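+	// Extraction failures are rendered back to the form as errors rather than silently scoring an empty title.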
+ if url != "" { + extractedTitle, err := extractTitleFromURL(url) + if err != nil { + c.renderResultsError(w, fmt.Sprintf("Failed to extract title from URL: %v", err), title) + return + } + title = extractedTitle + } + + // Validate input before scoring + if valErr := c.validateTitle(title); valErr != "" { + c.renderResultsError(w, valErr, title) + return + } + + vectorizer := core.CreateVectorizerFromModel(model) + article := &core.Article{Title: title} + score := c.scoreArticle(article, vectorizer, model) + + threshold, _ := c.getThreshold(model) + rating := core.ScoreToScale(score, threshold) + + data := map[string]interface{}{ + "Page": "tools", + "IsScoreResult": true, + "Title": title, + "Rating": rating, + "Score": score, + "Threshold": threshold, + "PageTitle": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleScan(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet { + c.handleTools(w, r) + return + } + + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + if model == nil { + http.Error(w, "Model not loaded", http.StatusInternalServerError) + return + } + + if err := r.ParseForm(); err != nil { + http.Error(w, fmt.Sprintf("Failed to parse form: %v", err), http.StatusBadRequest) + return + } + + feedURL := strings.TrimSpace(r.FormValue("feed_url")) + + // Validate and fetch the feed + if valErr := c.validateFeedURL(feedURL); valErr != "" { + c.renderScanResultsError(w, valErr, feedURL) + return + } + + articles, err := c.fetchRSSFeed(feedURL) + if err != nil { + c.renderScanResultsError(w, fmt.Sprintf("Failed to fetch feed: %v", err), feedURL) + return + } + + // Score articles + threshold, _ := c.getThreshold(model) + vectorizer := core.CreateVectorizerFromModel(model) + scored := c.scoreAndFormatArticles(articles, vectorizer, model, threshold) + + sort.Slice(scored, func(i, j int) bool { + iScore := scored[i]["Score"].(float64) + jScore := scored[j]["Score"].(float64) + return iScore > jScore + }) + + data := map[string]interface{}{ + "Page": "tools", + "IsScanResult": true, + "FeedURL": feedURL, + "Articles": scored, + "Threshold": threshold, + "PageTitle": displayTitle(c.Title), + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil { + http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError) + } +} + +// ============================================================================ +// ┏━┓┏━┓╻ ┏━╸┏┓╻╺┳┓┏━┓┏━┓╻┏┓╻╺┳╸┏━┓ +// ┣━┫┣━┛┃ ┣╸ ┃┗┫ ┃┃┣━┛┃ ┃┃┃┗┫ ┃ ┗━┓ +// ╹ ╹╹ ╹ ┗━╸╹ ╹╺┻┛╹ ┗━┛╹╹ ╹ ╹ ┗━┛ +// ============================================================================ + +func (c *ServeCommand) handleFilteredFeed(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.resultsMu.RLock() + articles := c.filteredResults + resultsTime := c.filteredResultsTime + c.resultsMu.RUnlock() + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + threshold, _ := c.getThreshold(model) + + type ArticleResponse struct { + Title string `json:"title"` + URL string `json:"url"` + Source string 
`json:"source,omitempty"` + Rating int `json:"rating"` + Score float64 `json:"score"` + } + + scored := make([]ArticleResponse, 0, len(articles)) + for _, article := range articles { + score := 0.0 + if article.Score != nil { + score = *article.Score + } + rating := core.ScoreToScale(score, threshold) + + scored = append(scored, ArticleResponse{ + Title: article.Title, + URL: article.URL, + Source: article.Source, + Rating: rating, + Score: score, + }) + } + + w.Header().Set("Content-Type", "application/json") + w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate") + + if err := json.NewEncoder(w).Encode(map[string]interface{}{ + "total": len(articles), + "threshold": threshold, + "updated_at": resultsTime, + "articles": scored, + }); err != nil { + http.Error(w, "Failed to encode response", http.StatusInternalServerError) + } +} + +func (c *ServeCommand) handleFilteredRSS(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.resultsMu.RLock() + articles := c.filteredResults + c.resultsMu.RUnlock() + + c.modelMu.RLock() + model := c.model + c.modelMu.RUnlock() + + w.Header().Set("Content-Type", "application/rss+xml") + w.Header().Set("Cache-Control", "public, max-age=3600") + + // Generate RSS feed + fmt.Fprintf(w, `<?xml version="1.0" encoding="UTF-8"?> + <rss version="2.0"> + <channel> + <title>%s - Filtered Articles</title> + <link>http://scholscan.local</link> + <description>Articles filtered by your learned preferences (scored 1-10)</description> + `, displayTitle(c.Title)) + + for _, article := range articles { + rawScore := 0.0 + if article.Score != nil { + rawScore = *article.Score + } + + threshold, _ := c.getThreshold(model) + scaledScore := core.ScoreToScale(rawScore, threshold) + + title := escapeXML(article.Title) + url := escapeXML(article.URL) + description := fmt.Sprintf("SCHOLSCAN SCORE = %d/10 (raw: %.3f)", scaledScore, rawScore) + + fmt.Fprintf(w, ` <item> + <title>%s</title> + <link>%s</link> + <description>%s</description> + </item> + `, title, url, description) + } + + fmt.Fprint(w, ` </channel> + </rss>`) +} + +func (c *ServeCommand) handleHealth(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + c.modelMu.RLock() + modelLoaded := c.model != nil + c.modelMu.RUnlock() + + status := "ok" + if !modelLoaded { + status = "model_not_loaded" + w.WriteHeader(http.StatusInternalServerError) + } + + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(map[string]interface{}{ + "status": status, + "model_loaded": modelLoaded, + "timestamp": time.Now().Unix(), + }); err != nil { + http.Error(w, "Failed to encode response", http.StatusInternalServerError) + } +} + +// ============================================================================ +// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓ +// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓ +// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛ +// ============================================================================ + +func displayTitle(custom string) string { + if custom != "" { + return custom + } + return "ScholScan" +} + +// extractTitleFromURL fetches the content from a URL and extracts the title from the HTML. +// Designed to be resilient: tries multiple title sources, handles various URL formats, +// and provides meaningful error feedback if extraction fails. 
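+// Bare DOIs ("10.xxxx/...") are resolved via https://doi.org, and scheme-less URLs default to https.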
+func extractTitleFromURL(rawURL string) (string, error) {
+	if rawURL == "" {
+		return "", fmt.Errorf("empty URL")
+	}
+
+	// Check if it's a DOI
+	if strings.HasPrefix(rawURL, "10.") {
+		// Convert DOI to URL
+		rawURL = fmt.Sprintf("https://doi.org/%s", rawURL)
+	} else if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
+		rawURL = "https://" + rawURL
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultContextTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
+	if err != nil {
+		return "", fmt.Errorf("invalid URL: %w", err)
+	}
+	req.Header.Set("User-Agent", core.PoliteUserAgent)
+	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+
+	resp, err := core.DoRequestWithRetry(ctx, core.DefaultHTTPClient, req)
+	if err != nil {
+		return "", fmt.Errorf("failed to fetch URL: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
+	}
+
+	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf("failed to parse HTML: %w", err)
+	}
+
+	// Fallback chain: <title> → og:title → twitter:title → <h1>
+	// Different sites populate these differently; trying multiple increases success rate.
+	title := ""
+
+	if t := doc.Find("title").Text(); t != "" {
+		title = strings.TrimSpace(t)
+	}
+
+	if title == "" {
+		if t, exists := doc.Find(`meta[property="og:title"]`).Attr("content"); exists && t != "" {
+			title = strings.TrimSpace(t)
+		}
+	}
+
+	if title == "" {
+		if t, exists := doc.Find(`meta[name="twitter:title"]`).Attr("content"); exists && t != "" {
+			title = strings.TrimSpace(t)
+		}
+	}
+
+	if title == "" {
+		if t := doc.Find("h1").First().Text(); t != "" {
+			title = strings.TrimSpace(t)
+		}
+	}
+
+	if title == "" {
+		return "", fmt.Errorf("could not extract title from page")
+	}
+
+	// Clean up common "Publisher | Title" patterns: collapse pipe separators to a
+	// single space rather than deleting them, which would fuse adjacent words.
+	reClean := regexp.MustCompile(`\s*\|\s*`)
+	title = reClean.ReplaceAllString(title, " ")
+	title = strings.TrimSpace(title)
+
+	if len(title) < 10 {
+		return "", fmt.Errorf("extracted title too short: %q", title)
+	}
+
+	return title, nil
+}
+
+// validateTitle checks that a title is suitable for scoring.
+// Returns an error message string if invalid, empty string if valid.
+func (c *ServeCommand) validateTitle(title string) string {
+	if strings.TrimSpace(title) == "" {
+		return "Title cannot be empty"
+	}
+	if len(title) > 1000 {
+		return "Title too long (max 1000 characters)"
+	}
+	return ""
+}
+
+// renderResultsError renders the results template with an error message.
+func (c *ServeCommand) renderResultsError(w http.ResponseWriter, errMsg, title string) {
+	data := map[string]interface{}{
+		"Page":          "tools",
+		"IsScoreResult": true,
+		"Error":         errMsg,
+		"Title":         title,
+		"PageTitle":     displayTitle(c.Title),
+	}
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+		http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+	}
+}
+
+// validateFeedURL checks that a feed URL is non-empty and has valid format.
+// Returns an error message string if invalid, empty string if valid.
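+// Note: url.Parse accepts almost any string, so this check mainly rejects empty input; unreachable feeds surface later as fetch errors.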
+func (c *ServeCommand) validateFeedURL(feedURL string) string {
+	if feedURL == "" {
+		return "Feed URL cannot be empty"
+	}
+	if _, err := url.Parse(feedURL); err != nil {
+		return "Invalid URL format"
+	}
+	return ""
+}
+
+// renderScanResultsError renders the results template with an error for scan operation.
+func (c *ServeCommand) renderScanResultsError(w http.ResponseWriter, errMsg, feedURL string) {
+	data := map[string]interface{}{
+		"Page":         "tools",
+		"IsScanResult": true,
+		"Error":        errMsg,
+		"FeedURL":      feedURL,
+		"PageTitle":    displayTitle(c.Title),
+	}
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	if err := c.tmpl.ExecuteTemplate(w, "results", data); err != nil {
+		http.Error(w, fmt.Sprintf("Template error: %v", err), http.StatusInternalServerError)
+	}
+}
+
+// escapeXML escapes the five XML special characters. The "&" replacement must
+// run first so the entities introduced by the later replacements are not mangled.
+func escapeXML(s string) string {
+	s = strings.ReplaceAll(s, "&", "&amp;")
+	s = strings.ReplaceAll(s, "<", "&lt;")
+	s = strings.ReplaceAll(s, ">", "&gt;")
+	s = strings.ReplaceAll(s, "\"", "&quot;")
+	s = strings.ReplaceAll(s, "'", "&apos;")
+	return s
+}
diff --git a/cmds/templates/live-feed.html b/cmds/templates/live-feed.html
new file mode 100644
index 0000000..1529ee1
--- /dev/null
+++ b/cmds/templates/live-feed.html
@@ -0,0 +1,158 @@
+{{define "live-feed"}}
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>{{.Title}} - Live Feed</title>
+    <style>
+        /* ========================================
+           BASE STYLE
+           ======================================== */
+        * { margin: 0; padding: 0; box-sizing: border-box; }
+        body {
+            font-family: monospace;
+            background: #fff;
+            color: #000;
+            padding: 20px;
+            line-height: 1.6;
+        }
+        h1 {
+            font-size: 1.2em;
+            font-weight: bold;
+            margin-bottom: 20px;
+        }
+
+        /* ========================================
+           NAV (live-feed | score-scan)
+           ======================================== */
+        .nav {
+            margin-bottom: 30px;
+            display: flex;
+            gap: 30px;
+            border-bottom: 1px solid #000;
+            padding-bottom: 10px;
+        }
+        .nav a {
+            text-decoration: none;
+            color: #000;
+            font-family: monospace;
+        }
+        .nav a.active {
+            border-bottom: 2px solid #000;
+            padding-bottom: 5px;
+        }
+
+        /* ========================================
+           ARTICLE LIST
+           ======================================== */
+        .article {
+            margin-bottom: 15px;
+            padding: 10px;
+            border: 1px solid #ccc;
+        }
+        .article a {
+            color: #00f;
+            text-decoration: underline;
+        }
+        .article-meta {
+            margin-top: 8px;
+            color: #666;
+            font-size: 0.9em;
+        }
+
+        /* ========================================
+           ARTICLE LIST STUFF
+           ======================================== */
+        .summary {
+            margin-bottom: 15px;
+            padding: 10px;
+            border: 1px solid #000;
+            background: #f9f9f9;
+        }
+        .rss-link {
+            background: #f9f9f9;
+            padding: 15px;
+            border: 1px solid #000;
+            margin-bottom: 20px;
+        }
+        .rss-link a {
+            color: #00f;
+            text-decoration: underline;
+        }
+        .feed-list {
+            max-height: 600px;
+            overflow-y: auto;
+            border: 1px solid #000;
+            padding: 10px;
+        }
+
+        .error {
+            color: #f00;
+            margin-top: 10px;
+            padding: 10px;
+            border: 1px solid #f00;
+        }
+    </style>
+</head>
+<body>
+    <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1>
+    <div class="nav">
+        <a href="/live-feed" class="active">Live Feed</a>
+        <a href="/tools">Score &amp; Scan</a>
+    </div>
+
+    <div class="rss-link">
+        <strong>Filtered RSS Feed:</strong>
+        <a href="/api/filtered/rss" target="_blank">Subscribe to filtered articles</a>
+        <span style="margin-left: 10px; color: #666;
font-size: 0.9em;">(rss link for feed readers)</span> + <div style="margin-top: 10px; padding-top: 10px; border-top: 1px solid #ccc; color: #666; font-size: 0.9em;"> + Last updated: <span id="feedTimestamp">{{if .UpdatedAt}}{{.UpdatedAt}}{{else}}—{{end}}</span> + </div> + </div> + + <div style="margin-bottom: 20px;"> + <strong>Filter by date:</strong> + <div style="margin-top: 8px; display: flex; gap: 10px;"> + <a href="/live-feed?filter=day" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "day"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 24h</a> + <a href="/live-feed?filter=week" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "week"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">Last 7 days</a> + <a href="/live-feed?filter=all" style="padding: 6px 12px; text-decoration: none; {{if eq .Filter "all"}}background: #000; color: #fff;{{else}}border: 1px solid #000; color: #000;{{end}}">All</a> + </div> + </div> + + <div class="feed-list"> + {{if .Error}} + <div class="error">{{.Error}}</div> + {{else if .Articles}} + <div class="summary"> + <strong>{{len .Articles}}</strong> articles (threshold: {{printf "%.2f" .Threshold}}) + </div> + {{$threshold := .Threshold}} + {{range .Articles}} + {{$isGood := ge .Score $threshold}} + {{$bgColor := "white"}} + {{if $isGood}} + {{$bgColor = "#e8f5e9"}} + {{else}} + {{$bgColor = "#ffebee"}} + {{end}} + {{$indicator := "✗"}} + {{if $isGood}} + {{$indicator = "✓"}} + {{end}} + <div class="article" style="background-color: {{$bgColor}};"> + <div style="font-weight: bold;"> + <a href="{{.URL}}" target="_blank">{{.Title}}</a> + </div> + <div class="article-meta"> + Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}}{{if .PublishedAt}} · {{.PublishedAt}}{{end}} + </div> + </div> + {{end}} + {{else}} + <p>No articles to display</p> + {{end}} + </div> +</body> +</html> +{{end}} diff --git a/cmds/templates/results.html b/cmds/templates/results.html new file mode 100644 index 0000000..13f68e0 --- /dev/null +++ b/cmds/templates/results.html @@ -0,0 +1,279 @@ +{{define "results"}} +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <title>{{.PageTitle}} - Results</title> + <style> + /* ======================================== + BASE STYLE + ======================================== */ + * { margin: 0; padding: 0; box-sizing: border-box; } + body { + font-family: monospace; + background: #fff; + color: #000; + padding: 20px; + line-height: 1.6; + } + h1 { + font-size: 1.2em; + font-weight: bold; + margin-bottom: 20px; + } + h2 { + font-size: 1em; + font-weight: bold; + margin-bottom: 15px; + border-bottom: 1px solid #000; + padding-bottom: 10px; + } + + /* ======================================== + NAV (live-feed | score-scan) + ======================================== */ + .nav { + margin-bottom: 30px; + display: flex; + gap: 30px; + border-bottom: 1px solid #000; + padding-bottom: 10px; + } + .nav a { + text-decoration: none; + color: #000; + font-family: monospace; + } + .nav a.active { + border-bottom: 2px solid #000; + padding-bottom: 5px; + } + + /* ======================================== + LAYOUT (2-column grid for score-scan) + ======================================== */ + .container { + max-width: 1200px; + margin: 0 auto; + display: grid; + grid-template-columns: 1fr 1fr; + gap: 30px; + } + .section { + border: 1px solid 
#000; + padding: 20px; + } + + /* ======================================== + FORMS (input, textarea, button) + ======================================== */ + label { + display: block; + margin-top: 15px; + font-weight: bold; + } + input, textarea { + display: block; + width: 100%; + margin-top: 5px; + padding: 5px; + border: 1px solid #000; + font-family: monospace; + } + textarea { + resize: vertical; + min-height: 80px; + } + button { + margin-top: 15px; + padding: 5px 15px; + border: 1px solid #000; + background: #fff; + cursor: pointer; + font-family: monospace; + } + button:hover { + background: #000; + color: #fff; + } + button:active { + opacity: 0.8; + } + + /* ======================================== + RESULT BOXES + ======================================== */ + .result { + margin-top: 20px; + padding: 15px; + border: 1px solid #000; + background: #f5f5f5; + } + .score { + font-size: 3em; + font-weight: bold; + text-align: center; + margin: 20px 0; + } + .error { + color: #f00; + margin-top: 10px; + padding: 10px; + border: 1px solid #f00; + } + + /* ======================================== + ARTICLE LIST + ======================================== */ + .article { + margin-bottom: 15px; + padding: 10px; + border: 1px solid #ccc; + } + .article a { + color: #00f; + text-decoration: underline; + } + .article-meta { + margin-top: 8px; + color: #666; + font-size: 0.9em; + } + + /* ======================================== + ARTICLE LIST STUFF + ======================================== */ + .summary { + margin-bottom: 15px; + padding: 10px; + border: 1px solid #000; + background: #f9f9f9; + } + + small { + display: block; + margin-top: 5px; + color: #666; + } + + /* ======================================== + MOBILE + ======================================== */ + @media (max-width: 960px) { + .container { + grid-template-columns: 1fr; + gap: 20px; + } + } + </style> +</head> +<body> + <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.PageTitle}}</a></h1> + <div class="nav"> + <a href="/live-feed">Live Feed</a> + <a href="/tools" class="active">Score & Scan</a> + </div> + + <div class="container"> + {{if .IsScoreResult}} + <div class="section"> + <h2>Score Article</h2> + {{if .Error}} + <div class="error">{{.Error}}</div> + <form method="POST" action="/score" style="margin-top: 20px;"> + <label for="scoreTitle">Title:</label> + <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" value="{{.Title}}" /> + <label for="scoreURL">URL or DOI:</label> + <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" /> + <small>If URL is provided, title will be automatically extracted</small> + <button type="submit">Score</button> + </form> + {{else}} + <div class="result"> + <div class="score">{{.Rating}}/10</div> + <p style="text-align: center; color: #666;">Score: {{printf "%.3f" .Score}}</p> + <p style="text-align: center; margin-top: 10px; font-size: 0.9em;">{{.Title}}</p> + </div> + <form method="POST" action="/score" style="margin-top: 20px;"> + <label for="scoreTitle">Title:</label> + <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" /> + <label for="scoreURL">URL or DOI:</label> + <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" /> + <small>If URL is provided, title will be automatically extracted</small> + <button type="submit">Score Another</button> + </form> + {{end}} + </div> + + <div class="section"> + <h2>Scan 
Feed</h2> + <form method="POST" action="/scan"> + <label for="feedURL">RSS Feed URL:</label> + <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required /> + <button type="submit">Scan</button> + </form> + </div> + + {{else if .IsScanResult}} + <div class="section"> + <h2>Score Article</h2> + <form method="POST" action="/score"> + <label for="scoreTitle">Title:</label> + <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" /> + <label for="scoreURL">URL or DOI:</label> + <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" /> + <small>If URL is provided, title will be automatically extracted</small> + <button type="submit">Score</button> + </form> + </div> + + <div class="section"> + <h2>Scan Feed</h2> + {{if .Error}} + <div class="error">{{.Error}}</div> + <form method="POST" action="/scan" style="margin-top: 20px;"> + <label for="feedURL">RSS Feed URL:</label> + <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" value="{{.FeedURL}}" required /> + <button type="submit">Try Again</button> + </form> + {{else}} + <div class="summary"> + <strong>{{len .Articles}}</strong> articles from {{.FeedURL}} (threshold: {{printf "%.2f" .Threshold}}) + </div> + <div style="max-height: 500px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;"> + {{$threshold := .Threshold}} + {{range .Articles}} + {{$isGood := ge .Score $threshold}} + {{$bgColor := "white"}} + {{if $isGood}} + {{$bgColor = "#e8f5e9"}} + {{else}} + {{$bgColor = "#ffebee"}} + {{end}} + {{$indicator := "✗"}} + {{if $isGood}} + {{$indicator = "✓"}} + {{end}} + <div class="article" style="background-color: {{$bgColor}};"> + <div style="font-weight: bold;"> + <a href="{{.URL}}" target="_blank">{{.Title}}</a> + </div> + <div class="article-meta"> + Rating: {{$indicator}} {{.Rating}}/10 (raw: {{printf "%.3f" .Score}}) · {{.Source}} + </div> + </div> + {{end}} + </div> + <form method="POST" action="/scan" style="margin-top: 20px;"> + <label for="feedURL">RSS Feed URL:</label> + <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required /> + <button type="submit">Scan Another</button> + </form> + {{end}} + </div> + {{end}} + </div> +</body> +</html> +{{end}} diff --git a/cmds/templates/tools.html b/cmds/templates/tools.html new file mode 100644 index 0000000..def04fe --- /dev/null +++ b/cmds/templates/tools.html @@ -0,0 +1,202 @@ +{{define "tools"}} +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <title>{{.Title}} - Score & Scan</title> + <style> + /* ======================================== + BASE STYLE + ======================================== */ + * { margin: 0; padding: 0; box-sizing: border-box; } + body { + font-family: monospace; + background: #fff; + color: #000; + padding: 20px; + line-height: 1.6; + } + h1 { + font-size: 1.2em; + font-weight: bold; + margin-bottom: 20px; + } + h2 { + font-size: 1em; + font-weight: bold; + margin-bottom: 15px; + border-bottom: 1px solid #000; + padding-bottom: 10px; + } + + /* ======================================== + NAV (live-feed | score-scan) + ======================================== */ + .nav { + margin-bottom: 30px; + display: flex; + gap: 30px; + border-bottom: 1px solid #000; + padding-bottom: 10px; + } + .nav a { + text-decoration: none; + color: #000; + font-family: monospace; + } + .nav a.active { 
+ border-bottom: 2px solid #000; + padding-bottom: 5px; + } + + /* ======================================== + LAYOUT (2-column grid for score-scan) + ======================================== */ + .container { + max-width: 1200px; + margin: 0 auto; + display: grid; + grid-template-columns: 1fr 1fr; + gap: 30px; + } + .section { + border: 1px solid #000; + padding: 20px; + } + + /* ======================================== + FORMS (input, textarea, button) + ======================================== */ + label { + display: block; + margin-top: 15px; + font-weight: bold; + } + input, textarea { + display: block; + width: 100%; + margin-top: 5px; + padding: 5px; + border: 1px solid #000; + font-family: monospace; + } + textarea { + resize: vertical; + min-height: 80px; + } + button { + margin-top: 15px; + padding: 5px 15px; + border: 1px solid #000; + background: #fff; + cursor: pointer; + font-family: monospace; + } + button:hover { + background: #000; + color: #fff; + } + button:active { + opacity: 0.8; + } + + /* ======================================== + RESULT BOXES + ======================================== */ + .result { + margin-top: 20px; + padding: 15px; + border: 1px solid #000; + background: #f5f5f5; + } + .score { + font-size: 3em; + font-weight: bold; + text-align: center; + margin: 20px 0; + } + .error { + color: #f00; + margin-top: 10px; + padding: 10px; + border: 1px solid #f00; + } + + /* ======================================== + ARTICLE LIST + ======================================== */ + .article { + margin-bottom: 15px; + padding: 10px; + border: 1px solid #ccc; + } + .article a { + color: #00f; + text-decoration: underline; + } + .article-meta { + margin-top: 8px; + color: #666; + font-size: 0.9em; + } + + /* ======================================== + ARTICLE LIST STUFF + ======================================== */ + .summary { + margin-bottom: 15px; + padding: 10px; + border: 1px solid #000; + background: #f9f9f9; + } + + small { + display: block; + margin-top: 5px; + color: #666; + } + + /* ======================================== + MOBILE + ======================================== */ + @media (max-width: 960px) { + .container { + grid-template-columns: 1fr; + gap: 20px; + } + } + </style> +</head> +<body> + <h1><a href="/live-feed" style="color: inherit; text-decoration: none;">{{.Title}}</a></h1> + <div class="nav"> + <a href="/live-feed">Live Feed</a> + <a href="/tools" class="active">Score & Scan</a> + </div> + + <div class="container"> + <div class="section"> + <h2>Score Article</h2> + <form method="POST" action="/score"> + <label for="scoreTitle">Title:</label> + <input type="text" id="scoreTitle" name="title" placeholder="Enter article title" /> + <label for="scoreURL" style="margin-top: 10px;">URL or DOI:</label> + <input type="text" id="scoreURL" name="url" placeholder="https://example.com/article or 10.xxxx/doi" /> + <small>If URL is provided, title will be automatically extracted</small> + <button type="submit">Score</button> + </form> + </div> + + <div class="section"> + <h2>Scan Feed</h2> + <form method="POST" action="/scan"> + <label for="feedURL">RSS Feed URL:</label> + <input type="text" id="feedURL" name="feed_url" placeholder="https://example.com/rss.xml" required /> + <button type="submit">Scan</button> + </form> + </div> + </div> +</body> +</html> +{{end}} diff --git a/cmds/train.go b/cmds/train.go new file mode 100644 index 0000000..e7e8915 --- /dev/null +++ b/cmds/train.go @@ -0,0 +1,841 @@ +// Train command learns model from positive examples and 
RSS feeds. +// Loads positives, fetches RSS feeds as negatives, excludes overlap, +// trains TF-IDF + logistic regression with 1:1 class balancing. +// Outputs model with validation threshold to stdout. +package cmds + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "math" + "math/rand" + "net/http" + "net/url" + "os" + "path/filepath" + "strings" + "time" + + "github.com/mmcdole/gofeed" + "scholscan/core" +) + +// ============================================================================ +// ┏━╸┏┳┓╺┳┓ ┏━┓┏┓ ┏┓ +// ┃ ┃┃┃ ┃┃ ┃ ┃┣┻┓ ┃ +// ┗━╸╹ ╹╺┻┛ ┗━┛┗━┛┗━┛ +// ============================================================================ + +// Learns model from positive examples and RSS feeds +// Outputs trained model JSON to stdout +type TrainCommand struct { + positivesFile string + rssFeedsFile string + verboseOutput bool + lambda float64 + minDF int + maxDF float64 + ngramMax int +} + +func (c *TrainCommand) Name() string { return "train" } + +func (c *TrainCommand) Init(args []string) error { + fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError) + fs.Usage = func() { + fmt.Fprint(fs.Output(), `Usage: scholscan train POSITIVES_FILE --rss-feeds RSS_FEEDS_FILE > model.json + +Train a TF-IDF + logistic regression model from positive examples and RSS feeds. + +The training workflow: + 1. Load positive examples from POSITIVES_FILE + 2. Fetch articles from RSS feeds list + 3. Exclude any positive examples from RSS feed articles + 4. Train model with balanced classes + 5. Output trained model to stdout as JSON + +Flags: +`) + fs.PrintDefaults() + fmt.Fprint(fs.Output(), ` +Arguments: + POSITIVES_FILE Path to JSONL file with positive examples (required) + +Example: + scholscan train positives.jsonl --rss-feeds rss_world.txt > model.json +`) + } + + fs.StringVar(&c.rssFeedsFile, "rss-feeds", "", "Path to text file with RSS feed URLs (required)") + fs.BoolVar(&c.verboseOutput, "verbose", false, "Show progress information") + fs.Float64Var(&c.lambda, "lambda", 0.001, "L2 regularization parameter for logistic regression") + fs.IntVar(&c.minDF, "min-df", 2, "Minimum document frequency (absolute count)") + fs.Float64Var(&c.maxDF, "max-df", 0.8, "Maximum document frequency (ratio, 0-1)") + fs.IntVar(&c.ngramMax, "ngram-max", 2, "Maximum n-gram size (e.g., 1=unigrams, 2=unigrams+bigrams)") + + // Check for help flag first + for _, arg := range args { + if arg == "--help" || arg == "-h" { + fs.Usage() + return flag.ErrHelp + } + } + + // Extract positional argument (POSITIVES_FILE) before parsing flags + if len(args) == 0 { + return fmt.Errorf("POSITIVES_FILE argument is required") + } + // The first argument should be the positives file, the rest are flags + c.positivesFile = args[0] + flagArgs := args[1:] + + if err := fs.Parse(flagArgs); err != nil { + return err + } + + if c.rssFeedsFile == "" { + return fmt.Errorf("--rss-feeds flag is required") + } + + // Validate paths are safe (prevent directory traversal) + if strings.Contains(filepath.Clean(c.positivesFile), "..") { + return fmt.Errorf("invalid positives file path: directory traversal not allowed") + } + if strings.Contains(filepath.Clean(c.rssFeedsFile), "..") { + return fmt.Errorf("invalid RSS feeds file path: directory traversal not allowed") + } + + return nil +} + +func (c *TrainCommand) Run(stdin io.Reader, stdout io.Writer) error { + if c.verboseOutput { + log.SetOutput(os.Stderr) + log.Println("Starting training workflow...") + log.Printf("Positives: %s", c.positivesFile) + 
log.Printf("RSS feeds: %s", c.rssFeedsFile) + } + + if c.verboseOutput { + log.Printf("Loading positives from %s...", c.positivesFile) + } + positives, err := c.loadArticles(c.positivesFile) + if err != nil { + return fmt.Errorf("failed to load positives: %w", err) + } + if c.verboseOutput { + log.Printf("Loaded %d positive examples", len(positives)) + } + + if c.verboseOutput { + log.Printf("Loading RSS feeds from %s...", c.rssFeedsFile) + } + rssURLs, err := c.loadRSSURLs(c.rssFeedsFile) + if err != nil { + return fmt.Errorf("failed to load RSS feeds: %w", err) + } + if c.verboseOutput { + log.Printf("Found %d RSS feeds to fetch", len(rssURLs)) + } + + negatives, err := c.fetchFromRSSFeeds(rssURLs) + if err != nil { + return fmt.Errorf("failed to fetch from RSS feeds: %w", err) + } + if c.verboseOutput { + log.Printf("Fetched %d articles from RSS feeds", len(negatives)) + } + + negatives = c.excludePositives(negatives, positives) + if c.verboseOutput { + log.Printf("After exclusion: %d negative examples", len(negatives)) + } + + if len(positives) == 0 || len(negatives) == 0 { + return fmt.Errorf("need both positive (%d) and negative (%d) examples for training", len(positives), len(negatives)) + } + + if c.verboseOutput { + log.Println("Training model...") + } + model, err := c.trainModel(positives, negatives) + if err != nil { + return fmt.Errorf("failed to train model: %w", err) + } + + // Output model + encoder := json.NewEncoder(stdout) + encoder.SetIndent("", " ") + if err := encoder.Encode(model); err != nil { + return fmt.Errorf("failed to write model: %w", err) + } + + return nil +} + +// ============================================================================ +// ╺┳┓┏━┓╺┳╸┏━┓ ╻ ┏━┓┏━┓╺┳┓╻┏┓╻┏━╸ +// ┃┃┣━┫ ┃ ┣━┫ ┃ ┃ ┃┣━┫ ┃┃┃┃┗┫┃╺┓ +// ╺┻┛╹ ╹ ╹ ╹ ╹ ┗━╸┗━┛╹ ╹╺┻┛╹╹ ╹┗━┛ +// ============================================================================ + +func (c *TrainCommand) loadArticles(filename string) ([]*core.Article, error) { + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + + var articles []*core.Article + decoder := json.NewDecoder(file) + lineCount := 0 + for { + var article core.Article + if err := decoder.Decode(&article); err != nil { + if err == io.EOF { + break + } + // Skip malformed json lines, don't fail on bad input. 
+ lineCount++ + continue + } + articles = append(articles, &article) + lineCount++ + if lineCount%500 == 0 && c.verboseOutput { + log.Printf(" Loaded %d articles so far", len(articles)) + } + } + return articles, nil +} + +// loadRSSURLs loads RSS feed URLs from a text file +func (c *TrainCommand) loadRSSURLs(filename string) ([]string, error) { + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + + var urls []string + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" && !strings.HasPrefix(line, "#") { + urls = append(urls, line) + } + } + return urls, scanner.Err() +} + +// fetchFromRSSFeeds fetches articles from multiple RSS feeds in parallel +func (c *TrainCommand) fetchFromRSSFeeds(rssURLs []string) ([]*core.Article, error) { + client := core.DefaultHTTPClient + type result struct { + url string + articles []*core.Article + err error + } + resultChan := make(chan result, len(rssURLs)) + + for _, rssURL := range rssURLs { + go func(url string) { + articles, err := c.fetchRSSFeed(client, url) + resultChan <- result{url: url, articles: articles, err: err} + }(rssURL) + } + + var allArticles []*core.Article + for i := 0; i < len(rssURLs); i++ { + res := <-resultChan + if res.err != nil { + if c.verboseOutput { + log.Printf("%s: failed to fetch", shortURL(res.url)) + } + } else { + if c.verboseOutput { + log.Printf("%s: %d articles", shortURL(res.url), len(res.articles)) + } + allArticles = append(allArticles, res.articles...) + } + } + + return allArticles, nil +} + +// ParseRSSFeed parses an RSS/Atom feed from the provided body into a slice of Articles. +func ParseRSSFeed(body []byte, baseURL string) ([]*core.Article, error) { + fp := gofeed.NewParser() + feed, err := fp.Parse(bytes.NewReader(body)) + if err != nil { + return nil, err + } + + var articles []*core.Article + for _, item := range feed.Items { + // Prefer explicit content; fall back to description. 
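+		// (gofeed maps RSS <content:encoded> and Atom <content> into
+		// item.Content; plain RSS 2.0 items often carry only <description>.)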
+ content := strings.TrimSpace(item.Content) + if content == "" { + content = item.Description + } + // Also check custom content field (for <content> tags in RSS) + if content == "" && item.Custom != nil { + if c, ok := item.Custom["content"]; ok && c != "" { + content = c + } + } + + // Clean and limit content length + content = core.CleanFeedContent(content) + + articles = append(articles, &core.Article{ + URL: item.Link, + Title: item.Title, + Content: content, + }) + } + return articles, nil +} + +// fetchRSSFeed fetches and parses a single RSS feed +func (c *TrainCommand) fetchRSSFeed(client *http.Client, rssURL string) ([]*core.Article, error) { + var body []byte + var err error + + // Handle file:// URLs locally + if strings.HasPrefix(rssURL, "file://") { + // Remove file:// prefix + filePath := strings.TrimPrefix(rssURL, "file://") + body, err = os.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("error reading file %s: %w", filePath, err) + } + } else { + // Handle HTTP/HTTPS URLs normally + req, err := http.NewRequest("GET", rssURL, nil) + if err != nil { + return nil, fmt.Errorf("error building request: %w", err) + } + req.Header.Set("User-Agent", core.PoliteUserAgent) + + // Make request with retry logic + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + resp, err := core.DoRequestWithRetry(ctx, client, req) + if err != nil { + return nil, fmt.Errorf("error fetching %s: %w", rssURL, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, rssURL) + } + + // Read response body + body, err = io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("error reading response from %s: %w", rssURL, err) + } + } + + // Parse RSS/Atom feed + return ParseRSSFeed(body, rssURL) +} + +// ============================================================================ +// ╺┳┓┏━┓╺┳╸┏━┓ ┏━┓┏━┓┏━╸┏━┓ +// ┃┃┣━┫ ┃ ┣━┫ ┣━┛┣┳┛┣╸ ┣━┛ +// ╺┻┛╹ ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸╹ +// ============================================================================ + +func (c *TrainCommand) excludePositives(negatives, positives []*core.Article) []*core.Article { + // Build set of positive URLs for O(1) lookup + positiveURLs := make(map[string]bool) + for _, pos := range positives { + positiveURLs[pos.URL] = true + } + + // Filter out positives + var filtered []*core.Article + for _, neg := range negatives { + if !positiveURLs[neg.URL] { + filtered = append(filtered, neg) + } + } + + return filtered +} + +// splitTrainingData performs a deterministic 80/20 split (seed=42). +// Deterministic ensures reproducible model training across runs. +func (c *TrainCommand) splitTrainingData(documents []string, labels []float64) ( + trainDocs, valDocs []string, + trainLabels, valLabels []float64, +) { + const validationSplitRatio = 0.2 + const splitSeed = 42 + + if len(documents) < 3 { + // Not enough data to split, use all for training. + // A split requires at least 2 training documents to avoid MaxDF issues + // and at least 1 validation document. + return documents, nil, labels, nil + } + + // Create a reproducible random source and shuffle indices. 
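+	// The fixed seed yields the same 80/20 partition on every run; e.g. with
+	// 1,000 documents, splitIndex below is 800, so indices[:800] become the
+	// training set and indices[800:] the validation set.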
+ rng := rand.New(rand.NewSource(splitSeed)) + indices := make([]int, len(documents)) + for i := range indices { + indices[i] = i + } + rng.Shuffle(len(indices), func(i, j int) { + indices[i], indices[j] = indices[j], indices[i] + }) + + splitIndex := int(float64(len(documents)) * (1.0 - validationSplitRatio)) + trainIndices := indices[:splitIndex] + valIndices := indices[splitIndex:] + + trainDocs = make([]string, len(trainIndices)) + trainLabels = make([]float64, len(trainIndices)) + for i, idx := range trainIndices { + trainDocs[i] = documents[idx] + trainLabels[i] = labels[idx] + } + + valDocs = make([]string, len(valIndices)) + valLabels = make([]float64, len(valIndices)) + for i, idx := range valIndices { + valDocs[i] = documents[idx] + valLabels[i] = labels[idx] + } + + return trainDocs, valDocs, trainLabels, valLabels +} + +// Downsample majority class to 1:1 ratio AFTER vectorizer.Fit() to preserve IDF values. +func (c *TrainCommand) downsampleToBalance(docs []string, labels []float64) ([]string, []float64) { + // Count positives and negatives + var posDocs, negDocs []string + var posLabels, negLabels []float64 + + for i, label := range labels { + if label == 1.0 { + posDocs = append(posDocs, docs[i]) + posLabels = append(posLabels, label) + } else { + negDocs = append(negDocs, docs[i]) + negLabels = append(negLabels, label) + } + } + + // If already balanced, return as-is + if len(posDocs) == len(negDocs) { + return docs, labels + } + + // Determine which class is majority + var majorityDocs, minorityDocs []string + var majorityLabels, minorityLabels []float64 + + if len(negDocs) > len(posDocs) { + // Negatives are majority + majorityDocs, minorityDocs = negDocs, posDocs + majorityLabels, minorityLabels = negLabels, posLabels + } else { + // Positives are majority (unlikely but handle) + majorityDocs, minorityDocs = posDocs, negDocs + majorityLabels, minorityLabels = posLabels, negLabels + } + + // Downsample majority to match minority size + minoritySize := len(minorityDocs) + rng := rand.New(rand.NewSource(42)) // Use fixed seed for reproducibility + + // Create random indices for downsampling + indices := make([]int, len(majorityDocs)) + for i := range indices { + indices[i] = i + } + rng.Shuffle(len(indices), func(i, j int) { + indices[i], indices[j] = indices[j], indices[i] + }) + + // Select downsampled majority + downsampledDocs := make([]string, 0, minoritySize*2) + downsampledLabels := make([]float64, 0, minoritySize*2) + + // Add all minority samples + downsampledDocs = append(downsampledDocs, minorityDocs...) + downsampledLabels = append(downsampledLabels, minorityLabels...) 
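+	// At this point every minority sample is kept; the loop below appends an
+	// equal number of randomly chosen majority samples, so e.g. 200 positives
+	// and 3,000 negatives train as 200 vs 200.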
+ + // Add downsampled majority + for i := 0; i < minoritySize; i++ { + idx := indices[i] + downsampledDocs = append(downsampledDocs, majorityDocs[idx]) + downsampledLabels = append(downsampledLabels, majorityLabels[idx]) + } + + return downsampledDocs, downsampledLabels +} + +// ============================================================================ +// ╺┳╸┏━┓┏━┓╻┏┓╻ ┏┳┓┏━┓╺┳┓┏━╸╻ +// ┃ ┣┳┛┣━┫┃┃┗┫ ┃┃┃┃ ┃ ┃┃┣╸ ┃ +// ╹ ╹┗╸╹ ╹╹╹ ╹ ╹ ╹┗━┛╺┻┛┗━╸┗━╸ +// ============================================================================ + +// trainModel trains a TF-IDF + logistic regression model +func (c *TrainCommand) trainModel(positives, negatives []*core.Article) (*core.ModelEnvelope, error) { + // Combine datasets and create labels + var documents []string + var labels []float64 + + // Process positives + for _, article := range positives { + // Skip articles with titles that are too short + if len(article.Title) < 15 { + continue + } + documents = append(documents, article.Title) + labels = append(labels, 1.0) + } + + // Process negatives + for _, article := range negatives { + // Skip articles with titles that are too short + if len(article.Title) < 15 { + continue + } + documents = append(documents, article.Title) + labels = append(labels, 0.0) + } + + // Use parameters from CLI flags (with defaults matching Julia implementation) + const vocabCap = 50000 + + // Deterministic 80/20 split for train/validation + trainDocs, valDocs, trainLabels, valLabels := c.splitTrainingData(documents, labels) + + // Create TF-IDF vectorizer with the specified parameters + vectorizer := &core.TFIDFVectorizer{ + NgramMin: 1, + NgramMax: c.ngramMax, + MinDF: c.minDF, + MaxDF: c.maxDF, + VocabCap: vocabCap, + Vocabulary: make(map[string]float64), + } + // Fit vectorizer on UNBALANCED training data to match Julia implementation + // This preserves document frequencies properly + vectorizer.Fit(trainDocs) + + // Downsample negatives to 1:1 ratio AFTER fitting (match Julia approach) + balancedTrainDocs, balancedTrainLabels := c.downsampleToBalance(trainDocs, trainLabels) + + // Transform both training and validation sets + trainVectors := vectorizer.Transform(balancedTrainDocs) + valVectors := vectorizer.Transform(valDocs) + + // Use uniform class weights since we've balanced the dataset + classWeights := map[float64]float64{ + 1.0: 1.0, + 0.0: 1.0, + } + + // Train logistic regression with the specified lambda parameter + lr := &core.LogisticRegression{ + LearningRate: 0.5, + Lambda: c.lambda, + Iterations: 500, + Tolerance: 0.000001, + } + lr.Validate() + weights, err := lr.Fit(trainVectors, balancedTrainLabels, classWeights) + if err != nil { + return nil, fmt.Errorf("failed to train logistic regression model: %w", err) + } + + // Find the best threshold on the validation set + recommendedThreshold, scoreDistributions := c.findBestThreshold(valVectors, valLabels, weights) + + // Count classes for metadata + var posCount, negCount float64 + for _, label := range labels { + if label == 1.0 { + posCount++ + } else { + negCount++ + } + } + + // Create model envelope + model := &core.ModelEnvelope{ + Algorithm: "tfidf-go", + Impl: "go", + Version: "1", + CreatedAt: time.Now().UTC(), + Meta: map[string]any{ + "positives": len(positives), + "negatives": len(negatives), + "class_counts": map[string]int{ + "pos": int(posCount), + "neg": int(negCount), + }, + "vectorizer_params": map[string]any{ + "ngram_min": vectorizer.NgramMin, + "ngram_max": vectorizer.NgramMax, + "min_df": vectorizer.MinDF, + "max_df": 
vectorizer.MaxDF,
+				"vocab_cap": vectorizer.VocabCap,
+			},
+			"model_params": map[string]any{
+				"learning_rate": lr.LearningRate,
+				"lambda": lr.Lambda,
+				"iterations": lr.Iterations,
+				"tolerance": lr.Tolerance,
+			},
+			"recommended_threshold": recommendedThreshold,
+			"score_distributions": scoreDistributions,
+		},
+		Vectorizer: vectorizer.Vocabulary,
+		OrderedVocab: vectorizer.OrderedVocab,
+		Weights: weights,
+	}
+
+	return model, nil
+}
+
+// ============================================================================
+// ┏┳┓┏━╸╺┳╸┏━┓╻┏━╸┏━┓
+// ┃┃┃┣╸ ┃ ┣┳┛┃┃ ┗━┓
+// ╹ ╹┗━╸ ╹ ╹┗╸╹┗━╸┗━┛
+// ============================================================================
+
+// ClassificationMetrics holds the evaluation metrics
+type ClassificationMetrics struct {
+	TruePositives int
+	TrueNegatives int
+	FalsePositives int
+	FalseNegatives int
+	Accuracy float64
+	Precision float64
+	Recall float64
+	F1Score float64
+}
+
+// Calculate computes the metrics from raw counts
+func (m *ClassificationMetrics) Calculate() {
+	total := m.TruePositives + m.TrueNegatives + m.FalsePositives + m.FalseNegatives
+
+	if total > 0 {
+		m.Accuracy = float64(m.TruePositives+m.TrueNegatives) / float64(total)
+	}
+
+	if m.TruePositives+m.FalsePositives > 0 {
+		m.Precision = float64(m.TruePositives) / float64(m.TruePositives+m.FalsePositives)
+	}
+
+	if m.TruePositives+m.FalseNegatives > 0 {
+		m.Recall = float64(m.TruePositives) / float64(m.TruePositives+m.FalseNegatives)
+	}
+
+	if m.Precision+m.Recall > 0 {
+		m.F1Score = 2 * (m.Precision * m.Recall) / (m.Precision + m.Recall)
+	}
+}
+
+// findBestThreshold sweeps candidate thresholds over the validation set and
+// returns the one that maximizes Youden's J (sensitivity + specificity - 1),
+// along with score-distribution statistics for the model metadata.
+func (c *TrainCommand) findBestThreshold(
+	validationVectors [][]float64,
+	validationLabels []float64,
+	weights []float64,
+) (float64, map[string]any) {
+	if len(validationVectors) == 0 {
+		return 0.5, nil // Default if no validation data
+	}
+
+	scores := make([]float64, len(validationVectors))
+	for i, vector := range validationVectors {
+		score, err := core.PredictScore(vector, weights)
+		if err != nil {
+			// This should not happen with valid data, but as a fallback:
+			return 0.5, nil
+		}
+		scores[i] = score
+	}
+
+	// Collect score distributions by label
+	var posScores, negScores []float64
+	for i, score := range scores {
+		if validationLabels[i] == 1.0 {
+			posScores = append(posScores, score)
+		} else {
+			negScores = append(negScores, score)
+		}
+	}
+
+	// Compute stats for each class
+	posStats := computeScoreStats(posScores)
+	negStats := computeScoreStats(negScores)
+
+	// Calculate Cohen's d (effect size) to measure class separation in the learned space
+	posMean := posStats["mean"]
+	negMean := negStats["mean"]
+	posStd := posStats["std"]
+	negStd := negStats["std"]
+
+	var cohensD float64
+	if posStd > 0 && negStd > 0 {
+		pooledStd := math.Sqrt((posStd*posStd + negStd*negStd) / 2)
+		cohensD = math.Abs(posMean-negMean) / pooledStd
+	}
+
+	// Calculate separation ratio to understand how much the classes overlap on the score scale
+	totalRange := math.Max(posStats["max"], negStats["max"]) - math.Min(posStats["min"], negStats["min"])
+	overlapStart := math.Max(posStats["min"], negStats["min"])
+	overlapEnd := math.Min(posStats["max"], negStats["max"])
+	overlapRange := math.Max(0, overlapEnd-overlapStart)
+	separationRatio := 0.0
+	if totalRange > 0 {
+		separationRatio = (totalRange - overlapRange) / totalRange
+	}
+
+	// Find threshold that balances false positives and false
negatives using Youden's J. + // This metric (Sensitivity + Specificity - 1) equally weights both false positive + // and false negative rates. Why not F1? F1 biases toward precision when classes + // are imbalanced; a validation set of 10 positives and 1000 negatives would push + // the threshold too high. Youden's J treats both types of error equally, which + // better reflects real use: missing a relevant article (false negative) is as bad + // as showing an irrelevant one (false positive). + bestCombinedScore := -1.0 + bestThreshold := 0.5 + var bestMetrics ClassificationMetrics + + boolLabels := make([]bool, len(validationLabels)) + for i, l := range validationLabels { + boolLabels[i] = l == 1.0 + } + + for i := 5; i <= 95; i++ { + threshold := float64(i) / 100.0 + metrics := computeMetrics(scores, boolLabels, threshold) + + sensitivity := metrics.Recall // TPR: TP / (TP + FN) + specificity := 0.0 + if metrics.TrueNegatives+metrics.FalsePositives > 0 { + specificity = float64(metrics.TrueNegatives) / float64(metrics.TrueNegatives+metrics.FalsePositives) + } + youdenJ := sensitivity + specificity - 1.0 + + if youdenJ > bestCombinedScore { + bestCombinedScore = youdenJ + bestThreshold = threshold + bestMetrics = metrics + } + } + + distributions := map[string]any{ + "positive": posStats, + "negative": negStats, + "cohens_d": cohensD, + "separation_ratio": separationRatio, + "best_f1": bestMetrics.F1Score, + "best_precision": bestMetrics.Precision, + "best_recall": bestMetrics.Recall, + } + + return bestThreshold, distributions +} + +// computeScoreStats computes min, max, mean, and std for a slice of scores +func computeScoreStats(scores []float64) map[string]float64 { + if len(scores) == 0 { + return map[string]float64{ + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + } + } + + min, max := scores[0], scores[0] + sum := 0.0 + + for _, score := range scores { + if score < min { + min = score + } + if score > max { + max = score + } + sum += score + } + + mean := sum / float64(len(scores)) + + // Calculate standard deviation + variance := 0.0 + for _, score := range scores { + diff := score - mean + variance += diff * diff + } + variance /= float64(len(scores)) + std := math.Sqrt(variance) + + return map[string]float64{ + "min": min, + "max": max, + "mean": mean, + "std": std, + } +} + +// computeMetrics calculates classification metrics +func computeMetrics(scores []float64, labels []bool, threshold float64) ClassificationMetrics { + var metrics ClassificationMetrics + for i, score := range scores { + predicted := score > threshold + actual := labels[i] + + if predicted && actual { + metrics.TruePositives++ + } else if predicted && !actual { + metrics.FalsePositives++ + } else if !predicted && actual { + metrics.FalseNegatives++ + } else { + metrics.TrueNegatives++ + } + } + metrics.Calculate() + return metrics +} + +// ============================================================================ +// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓ +// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓ +// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛ +// ============================================================================ + +// shortURL formats a URL to be human-readable and not too long +func shortURL(urlStr string) string { + u, err := url.Parse(urlStr) + if err != nil { + return urlStr + } + + path := u.Path + if len(path) > 30 { + path = path[:30] + "..." 
+ } + + return u.Host + path +} diff --git a/cmds/train_test.go b/cmds/train_test.go new file mode 100644 index 0000000..8298494 --- /dev/null +++ b/cmds/train_test.go @@ -0,0 +1,66 @@ +package cmds + +import ( + "scholscan/core" + "strings" + "testing" +) + +// test RSS parsing +func TestParseRSSFeed(t *testing.T) { + rssXML := `<?xml version="1.0" encoding="UTF-8"?> +<rss version="2.0"> +<channel> +<title>Test Feed</title> +<item> +<title>Test Article 1</title> +<link>https://example.com/article1</link> +<description>This is a test article with some content.</description> +</item> +<item> +<title>Test Article 2</title> +<link>https://example.com/article2</link> +<content><![CDATA[<p>This is content with <b>HTML</b> tags.</p>]]></content> +</item> +</channel> +</rss>` + + articles, err := ParseRSSFeed([]byte(rssXML), "https://example.com/feed") + if err != nil { + t.Fatalf("Failed to parse RSS feed: %v", err) + } + + if len(articles) != 2 { + t.Fatalf("Expected 2 articles, got %d", len(articles)) + } + + if articles[0].Title != "Test Article 1" { + t.Errorf("Expected title 'Test Article 1', got '%s'", articles[0].Title) + } + if articles[0].URL != "https://example.com/article1" { + t.Errorf("Expected URL 'https://example.com/article1', got '%s'", articles[0].URL) + } + if articles[0].Content != "This is a test article with some content." { + t.Errorf("Expected content 'This is a test article with some content.', got '%s'", articles[0].Content) + } + + if articles[1].Title != "Test Article 2" { + t.Errorf("Expected title 'Test Article 2', got '%s'", articles[1].Title) + } + if articles[1].Content != "This is content with HTML tags." { + t.Errorf("Expected 'This is content with HTML tags.', got '%s'", articles[1].Content) + } +} + +func TestCleanFeedContent(t *testing.T) { + longInput := strings.Repeat("test content ", 500) // 6000+ bytes + result := core.CleanFeedContent(longInput) + + if len(result) <= 5000 { + t.Errorf("Expected content to be truncated to >5000 chars, got %d", len(result)) + } + + if !strings.HasSuffix(result, "...") { + t.Errorf("Expected truncated content to end with '...', got '%s'", result[len(result)-3:]) + } +} diff --git a/core/constants.go b/core/constants.go new file mode 100644 index 0000000..2dadac4 --- /dev/null +++ b/core/constants.go @@ -0,0 +1,21 @@ +// Default configuration constants. +// +// Timeouts are defensive: 30s for HTTP requests, 5s for graceful shutdown. +// Score threshold 0.5 is neutral; models should learn their own. +// MinTitleLength filters junk/broken titles (<15 chars rarely meaningful). +// ChunkSize 50 balances memory usage vs batch efficiency. +package core + +import "time" + +const ( + DefaultHTTPTimeout = 30 * time.Second + DefaultContextTimeout = 10 * time.Second + DefaultReadTimeout = 30 * time.Second + DefaultWriteTimeout = 30 * time.Second + DefaultIdleTimeout = 120 * time.Second + DefaultShutdownTimeout = 5 * time.Second + DefaultScoreThreshold = 0.5 + MinTitleLength = 15 + DefaultChunkSize = 50 +) diff --git a/core/http.go b/core/http.go new file mode 100644 index 0000000..8629676 --- /dev/null +++ b/core/http.go @@ -0,0 +1,196 @@ +// HTTP client with exponential backoff retry. +// +// Handles transient network failures, timeouts, and rate limiting. 
+// - Backoff: 500ms → 1s → 2s → 4s max +// - Jitter prevents thundering herd +// - Respects 429 Retry-After header +package core + +import ( + "context" + "errors" + "fmt" + "math/rand" + "net" + "net/http" + "os" + "strconv" + "strings" + "time" +) + + +// ============================================================================ +// ╻ ╻╺┳╸╺┳╸┏━┓ ┏━┓┏━╸╺┳╸┏━┓╻ ╻ +// ┣━┫ ┃ ┃ ┣━┛ ┣┳┛┣╸ ┃ ┣┳┛┗┳┛ +// ╹ ╹ ╹ ╹ ╹ ╹┗╸┗━╸ ╹ ╹┗╸ ╹ +// ============================================================================ + + +const PoliteUserAgent = "scholscan/1.0 (https://github.com/mrichman/scholscan; mailto:matt@mrichman.net)" + +var DefaultHTTPClient = &http.Client{ + Timeout: 30 * time.Second, +} + +var ( + retryMaxAttempts = 4 + retryInitialBackoff = 500 * time.Millisecond + retryMaxBackoff = 5 * time.Second +) + +// Makes HTTP request with exponential backoff retry +func DoRequestWithRetry( + ctx context.Context, + client *http.Client, + req *http.Request, +) (*http.Response, error) { + if client == nil { + client = DefaultHTTPClient + } + var lastErr error + backoff := retryInitialBackoff + + for attempt := 1; attempt <= retryMaxAttempts; attempt++ { + // Make the request cancellable + reqWithCtx := req.WithContext(ctx) + resp, err := client.Do(reqWithCtx) + if err == nil { + if isRetriableStatus(resp.StatusCode) { + retryAfter := parseRetryAfter(resp.Header.Get("Retry-After")) + _ = resp.Body.Close() + sleep := backoff + if retryAfter > sleep { + sleep = retryAfter + } + + // Add jitter to avoid thundering herd. + jitter := time.Duration(rand.Intn(int(backoff / 2))) + sleep += jitter + + // Make sleep cancellable + timer := time.NewTimer(sleep) + select { + case <-ctx.Done(): + timer.Stop() + return nil, ctx.Err() + case <-timer.C: + } + + backoff = minDuration(backoff*2, retryMaxBackoff) + continue + } + return resp, nil + } + // Check for context cancellation + if ctx.Err() != nil { + return nil, ctx.Err() + } + // Network error: retry on timeouts, context deadline, transient net errors, and HTTP/2 stream errors + if os.IsTimeout(err) || errors.Is(err, context.DeadlineExceeded) || isTransientNetError(err) || isHTTP2StreamErr(err) { + lastErr = err + + // Add jitter to avoid thundering herd. 
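+			// e.g. with backoff=2s the jitter lands in [0s, 1s), so the
+			// actual sleep is 2-3s rather than an exact doubling step.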
+ jitter := time.Duration(rand.Intn(int(backoff / 2))) + sleep := backoff + jitter + + // Make sleep cancellable + timer := time.NewTimer(sleep) + select { + case <-ctx.Done(): + timer.Stop() + return nil, ctx.Err() + case <-timer.C: + } + + backoff = minDuration(backoff*2, retryMaxBackoff) + continue + } + // Non-retriable error + return nil, err + } + if lastErr == nil { + lastErr = fmt.Errorf("request retries exhausted") + } + return nil, lastErr +} + + +// ============================================================================ +// ╻ ╻┏━╸╻ ┏━┓┏━╸┏━┓┏━┓ +// ┣━┫┣╸ ┃ ┣━┛┣╸ ┣┳┛┗━┓ +// ╹ ╹┗━╸┗━╸╹ ┗━╸╹┗╸┗━┛ +// ============================================================================ + + +func isRetriableStatus(code int) bool { + if code == http.StatusTooManyRequests { + return true + } + return code >= 500 && code != http.StatusNotImplemented +} + +func parseRetryAfter(v string) time.Duration { + if v == "" { + return 0 + } + if secs, err := strconv.Atoi(strings.TrimSpace(v)); err == nil && secs > 0 { + return time.Duration(secs) * time.Second + } + if t, err := http.ParseTime(v); err == nil { + if d := time.Until(t); d > 0 { + return d + } + } + return 0 +} + +func minDuration(a, b time.Duration) time.Duration { + if a < b { + return a + } + return b +} + +// isTransientNetError returns true for network errors which are commonly transient, +// such as timeouts and common connection reset/closed cases. +func isTransientNetError(err error) bool { + if err == nil { + return false + } + var ne net.Error + if errors.As(err, &ne) { + if ne.Timeout() { + return true + } + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "use of closed network connection"): + return true + case strings.Contains(msg, "connection reset by peer"): + return true + case strings.Contains(msg, "connection aborted"): + return true + case strings.Contains(msg, "broken pipe"): + return true + case strings.Contains(msg, "eof"): + // Treat unexpected EOFs as transient when occurring at transport level. + return true + default: + return false + } +} + +// isHTTP2StreamErr detects HTTP/2 stream-level errors which are often transient. +func isHTTP2StreamErr(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "stream error") || + strings.Contains(msg, "internal_error") || + strings.Contains(msg, "rst_stream") || + strings.Contains(msg, "goaway") || + strings.Contains(msg, "http2:") +} diff --git a/core/ml.go b/core/ml.go new file mode 100644 index 0000000..afdd2f3 --- /dev/null +++ b/core/ml.go @@ -0,0 +1,427 @@ +// ML implementation: TF-IDF + Logistic Regression for article filtering. +// +// Why title-only: Avoids content scraping overhead, titles are already informative. +// MinDF=2: Removes typos and rare terms that don't generalize. +// MaxDF=0.8: Removes common words that appear in >80% of documents. +// λ=0.001: Light L2 regularization to prevent overfitting on small datasets. 
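+//
+// Example: with 1,000 training titles, MaxDF=0.8 drops any term appearing in
+// more than 800 of them, and MinDF=2 drops terms that appear in only one.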
+// +// Public API: +// - TFIDFVectorizer.Fit(): Learn vocabulary from documents +// - TFIDFVectorizer.Transform(): Convert documents to TF-IDF vectors +// - LogisticRegression.Fit(): Train classifier on vectors +// - CreateVectorizerFromModel(): Reconstruct vectorizer from saved model +// - PredictScore(): Score article using trained weights +package core + +import ( + "fmt" + "math" + "regexp" + "sort" + "strings" +) + + +// ============================================================================ +// ╻ ╻┏━╸┏━╸╺┳╸┏━┓┏━┓╻┏━┓┏━╸┏━┓ +// ┃┏┛┣╸ ┃ ┃ ┃ ┃┣┳┛┃┗━┓┣╸ ┣┳┛ +// ┗┛ ┗━╸┗━╸ ╹ ┗━┛╹┗╸╹┗━┛┗━╸╹┗╸ +// ============================================================================ + + +var wordHyphenRegex = regexp.MustCompile("[^a-zA-Z0-9-]+") + +// StopWords: Common words that don't help distinguish articles. +// Why: Reduces noise and improves model generalization. +var stopWords = map[string]struct{}{ + // Single letters and symbols + "s": {}, "-": {}, "0": {}, "1": {}, "2": {}, "3": {}, "4": {}, "5": {}, "6": {}, "7": {}, "8": {}, "9": {}, + + // Common English stop words + "a": {}, "about": {}, "above": {}, "after": {}, "again": {}, "against": {}, "al": {}, "all": {}, "am": {}, "an": {}, "and": {}, "any": {}, "are": {}, "aren't": {}, "as": {}, "at": {}, "be": {}, "because": {}, "been": {}, "before": {}, "being": {}, "below": {}, "between": {}, "both": {}, "but": {}, "by": {}, "can't": {}, "cannot": {}, "could": {}, "couldn't": {}, "did": {}, "didn't": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "don't": {}, "down": {}, "during": {}, "each": {}, "et": {}, "few": {}, "for": {}, "from": {}, "further": {}, "had": {}, "hadn't": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "he's": {}, "her": {}, "here": {}, "here's": {}, "hers": {}, "herself": {}, "him": {}, "himself": {}, "his": {}, "how": {}, "how's": {}, "i": {}, "i'd": {}, "i'll": {}, "i'm": {}, "i've": {}, "if": {}, "in": {}, "into": {}, "is": {}, "isn't": {}, "it": {}, "it's": {}, "its": {}, "itself": {}, "let's": {}, "me": {}, "more": {}, "most": {}, "mustn't": {}, "my": {}, "myself": {}, "no": {}, "nor": {}, "not": {}, "of": {}, "off": {}, "on": {}, "once": {}, "only": {}, "or": {}, "other": {}, "ought": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "over": {}, "own": {}, "same": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "so": {}, "some": {}, "such": {}, "than": {}, "that": {}, "that's": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "there": {}, "there's": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "this": {}, "those": {}, "through": {}, "to": {}, "too": {}, "under": {}, "until": {}, "up": {}, "very": {}, "was": {}, "wasn't": {}, "we": {}, "we'd": {}, "we'll": {}, "we're": {}, "we've": {}, "were": {}, "weren't": {}, "what": {}, "what's": {}, "when": {}, "when's": {}, "where": {}, "where's": {}, "which": {}, "while": {}, "who": {}, "who's": {}, "whom": {}, "why": {}, "why's": {}, "with": {}, "won't": {}, "would": {}, "wouldn't": {}, "you": {}, "you'd": {}, "you'll": {}, "you're": {}, "you've": {}, "your": {}, "yours": {}, "yourself": {}, "yourselves": {}, +} + +type TFIDFVectorizer struct { + Vocabulary map[string]float64 + OrderedVocab []string + NgramMin int + NgramMax int + MinDF int // Minimum document frequency (absolute) + MaxDF float64 // Maximum document frequency (ratio) + VocabCap int +} + +func 
CreateVectorizerFromModel(model *ModelEnvelope) *TFIDFVectorizer { + return &TFIDFVectorizer{ + Vocabulary: model.Vectorizer, + OrderedVocab: model.OrderedVocab, + } +} + + +// Learns vocabulary and IDF from documents +func (v *TFIDFVectorizer) Fit(documents []string) { + numDocs := len(documents) + docFreqs := make(map[string]int) + + for _, doc := range documents { + unigrams := Tokenize(doc) + ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax) + seenInDoc := make(map[string]struct{}) + for _, ngram := range ngrams { + if _, seen := seenInDoc[ngram]; !seen { + docFreqs[ngram]++ + seenInDoc[ngram] = struct{}{} + } + } + } + + maxDocs := int(v.MaxDF * float64(numDocs)) + filteredVocab := make(map[string]int) + for term, freq := range docFreqs { + if freq >= v.MinDF && freq <= maxDocs { + filteredVocab[term] = freq + } + } + + if v.VocabCap > 0 && len(filteredVocab) > v.VocabCap { + type termFreq struct { + term string + freq int + } + terms := make([]termFreq, 0, len(filteredVocab)) + for term, freq := range filteredVocab { + terms = append(terms, termFreq{term, freq}) + } + sort.Slice(terms, func(i, j int) bool { + return terms[i].freq > terms[j].freq + }) + + cappedTerms := terms[:v.VocabCap] + filteredVocab = make(map[string]int, v.VocabCap) + for _, tf := range cappedTerms { + filteredVocab[tf.term] = tf.freq + } + } + + v.OrderedVocab = make([]string, 0, len(filteredVocab)) + for term := range filteredVocab { + v.OrderedVocab = append(v.OrderedVocab, term) + } + sort.Strings(v.OrderedVocab) // deterministic order + + v.Vocabulary = make(map[string]float64, len(v.OrderedVocab)) + for _, term := range v.OrderedVocab { + // IDF = log(total num of docs / num of docs with term) + idf := math.Log(float64(numDocs) / float64(filteredVocab[term])) + v.Vocabulary[term] = idf + } +} + +// Converts documents to TF-IDF vectors using learned vocabulary +func (v *TFIDFVectorizer) Transform(documents []string) [][]float64 { + vectors := make([][]float64, len(documents)) + + for i, doc := range documents { + unigrams := Tokenize(doc) + ngrams := generateNgrams(unigrams, v.NgramMin, v.NgramMax) + vector := make([]float64, len(v.OrderedVocab)) + + if len(ngrams) > 0 { + // tf: term frequency (normalized count of each n-gram in document) + tf := make(map[string]float64) + for _, ngram := range ngrams { + tf[ngram]++ + } + numNgrams := float64(len(ngrams)) + for ngram, count := range tf { + tf[ngram] = count / numNgrams + } + + for j, term := range v.OrderedVocab { + if tfValue, ok := tf[term]; ok { + // only score terms that were in our training vocabulary + if idfValue, inVocab := v.Vocabulary[term]; inVocab { + vector[j] = tfValue * idfValue + } + } + } + } + vectors[i] = vector + } + + return vectors +} + +func Tokenize(text string) []string { + text = strings.ToLower(text) + words := wordHyphenRegex.Split(text, -1) + tokens := make([]string, 0, len(words)) + for _, word := range words { + if word == "" { + continue + } + if _, isStopWord := stopWords[word]; isStopWord { + continue + } + tokens = append(tokens, word) + } + return tokens +} + +func generateNgrams(tokens []string, minN, maxN int) []string { + if minN <= 0 { + minN = 1 + } + if maxN < minN { + maxN = minN + } + + numTokens := len(tokens) + + estimatedCap := 0 + for n := minN; n <= maxN; n++ { + if numTokens >= n { + estimatedCap += numTokens - n + 1 + } + } + ngrams := make([]string, 0, estimatedCap) + + for n := minN; n <= maxN; n++ { + if numTokens < n { + continue + } + for i := 0; i <= numTokens-n; i++ { + ngrams = 
append(ngrams, strings.Join(tokens[i:i+n], " ")) + } + } + return ngrams +} + + +// ============================================================================ +// ┏━╸╻ ┏━┓┏━┓┏━┓╻┏━╸╻┏━╸┏━┓ +// ┃ ┃ ┣━┫┗━┓┗━┓┃┣╸ ┃┣╸ ┣┳┛ +// ┗━╸┗━╸╹ ╹┗━┛┗━┛╹╹ ╹┗━╸╹┗╸ +// ============================================================================ + + +// Binary logistic regression with L2 regularization +// Bias term stored separately (not regularized) +type LogisticRegression struct { + LearningRate float64 + Lambda float64 // L2 regularization parameter + Iterations int + Tolerance float64 // Convergence tolerance on loss improvement +} + +// validate checks and clamps hyperparams to reasonable bounds. +func (lr *LogisticRegression) Validate() *LogisticRegression { + const ( + defaultLearningRate = 0.5 + defaultIterations = 500 + defaultTolerance = 0.000001 + ) + + if lr.LearningRate <= 0 { + lr.LearningRate = defaultLearningRate + } + if lr.LearningRate > 10 { + lr.LearningRate = 10.0 + } + if lr.Lambda < 0 { + lr.Lambda = 0.0 + } + if lr.Iterations <= 0 { + lr.Iterations = defaultIterations + } + if lr.Tolerance <= 0 { + lr.Tolerance = defaultTolerance + } + return lr +} + +// Fit trains via SGD with L2 regularization on feature weights (not bias). +// Class weights reweight samples; unused in our pipeline (we downsample instead). +// Returns weights with bias as last element. +func (lr *LogisticRegression) Fit(vectors [][]float64, labels []float64, classWeights map[float64]float64) ([]float64, error) { + if len(vectors) == 0 { + return nil, fmt.Errorf("cannot train on empty dataset") + } + if len(vectors) != len(labels) { + return nil, fmt.Errorf( + "mismatch between number of vectors (%d) and labels (%d)", + len(vectors), + len(labels), + ) + } + + for i, y := range labels { + if y != 0 && y != 1 { + return nil, fmt.Errorf("invalid label at %d: %v (expected 0 or 1)", i, y) + } + } + + numFeatures := len(vectors[0]) + if numFeatures == 0 { + return nil, fmt.Errorf("cannot train with zero-length feature vectors") + } + for i := 1; i < len(vectors); i++ { + if len(vectors[i]) != numFeatures { + return nil, fmt.Errorf( + "inconsistent feature vector length at index %d: got %d, expected %d", + i, + len(vectors[i]), + numFeatures, + ) + } + } + useUniformWeights := classWeights == nil + if useUniformWeights { + classWeights = map[float64]float64{0.0: 1.0, 1.0: 1.0} + } + + numSamples := float64(len(vectors)) + var totalWeight float64 + if useUniformWeights { + totalWeight = numSamples + } else { + for _, y := range labels { + totalWeight += classWeights[y] + } + } + if totalWeight == 0 { + totalWeight = numSamples // Fallback + } + + weights := make([]float64, numFeatures) + var bias float64 + + prevLoss := math.MaxFloat64 + + for i := 0; i < lr.Iterations; i++ { + gradWeights := make([]float64, numFeatures) + var gradBias float64 + var currentLoss float64 + + for j, x := range vectors { + y := labels[j] + sampleWeight := classWeights[y] + + z, err := dot(weights, x) + if err != nil { + return nil, fmt.Errorf("error calculating dot product for vector %d: %w", j, err) + } + p := Sigmoid(z + bias) + + // Compute prediction error. This term gets multiplied by each feature value + // to accumulate gradients (higher error pushes weights harder). 
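+			// For the sigmoid + cross-entropy pairing this is simply
+			//   dL/dw_k = (p - y) * x_k,  dL/db = (p - y)
+			// per sample, before class weighting and averaging.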
+ errTerm := p - y + for k := 0; k < numFeatures; k++ { + gradWeights[k] += sampleWeight * errTerm * x[k] + } + gradBias += sampleWeight * errTerm + + cp := clamp(p) + currentLoss += sampleWeight * (-(y*math.Log(cp) + (1-y)*math.Log(1-cp))) + } + + // Update weights with L2 regularization (only on feature weights, not bias). + // This pulls weights toward zero, preventing overfitting on small datasets. + for k := 0; k < numFeatures; k++ { + regularizedGrad := (gradWeights[k] / totalWeight) + (lr.Lambda * weights[k]) + weights[k] -= lr.LearningRate * regularizedGrad + } + gradBias /= totalWeight + bias -= lr.LearningRate * gradBias + + // Check convergence: if loss change is below tolerance, we're done. + // We include the L2 penalty in total loss to assess true convergence. + avgLoss := currentLoss / totalWeight + var l2Penalty float64 + for _, w := range weights { + l2Penalty += w * w + } + totalLoss := avgLoss + 0.5*lr.Lambda*l2Penalty + if math.Abs(prevLoss-totalLoss) < lr.Tolerance { + break + } + prevLoss = totalLoss + } + + // bias is stored as the last element + return append(weights, bias), nil +} + +// PredictScore computes the probability for a single vec given weights. +// the last element of weights is the bias. +func PredictScore(vector []float64, weights []float64) (float64, error) { + if len(weights) == 0 { + return 0, fmt.Errorf("weights cannot be empty") + } + if len(vector) != len(weights)-1 { + return 0, fmt.Errorf( + "vector length mismatch: expected %d features, got %d", + len(weights)-1, + len(vector), + ) + } + + for i, v := range vector { + if math.IsNaN(v) || math.IsInf(v, 0) { + return 0, fmt.Errorf("invalid value at vector[%d]: %v", i, v) + } + } + for i, w := range weights { + if math.IsNaN(w) || math.IsInf(w, 0) { + return 0, fmt.Errorf("invalid value at weights[%d]: %v", i, w) + } + } + + featureWeights := weights[:len(weights)-1] + bias := weights[len(weights)-1] + + z, err := dot(featureWeights, vector) + if err != nil { + return 0, fmt.Errorf("failed to compute dot product: %w", err) + } + return Sigmoid(z + bias), nil +} + + +// ============================================================================ +// ┏┳┓┏━┓╺┳╸╻ ╻┏━┓ +// ┃┃┃┣━┫ ┃ ┣━┫┗━┓ +// ╹ ╹╹ ╹ ╹ ╹ ╹┗━┛ +// ============================================================================ + + +func Sigmoid(z float64) float64 { + if z >= 0 { + return 1.0 / (1.0 + math.Exp(-z)) + } + ez := math.Exp(z) + return ez / (1.0 + ez) +} + +func dot(a, b []float64) (float64, error) { + if len(a) != len(b) { + return 0, fmt.Errorf("vector length mismatch: %d != %d", len(a), len(b)) + } + var sum float64 + for i := range a { + sum += a[i] * b[i] + } + return sum, nil +} + +func clamp(p float64) float64 { + const probabilityClamp = 1e-15 + if p < probabilityClamp { + return probabilityClamp + } + if p > 1.0-probabilityClamp { + return 1.0 - probabilityClamp + } + return p +} diff --git a/core/model.go b/core/model.go new file mode 100644 index 0000000..28f4045 --- /dev/null +++ b/core/model.go @@ -0,0 +1,20 @@ +// Model envelope persists trained model to JSON. Contains Vectorizer for IDF values, +// OrderedVocab for feature ordering, and Weights for logistic regression. +// To score: recreate TFIDFVectorizer, transform, then PredictScore. 
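+//
+// A minimal scoring sketch (assumes the model was already decoded from JSON):
+//
+//	vec := CreateVectorizerFromModel(model)
+//	score, err := PredictScore(vec.Transform([]string{title})[0], model.Weights)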
+package core + +import ( + "time" +) + +// ModelEnvelope - complete trained model for scoring articles +type ModelEnvelope struct { + Algorithm string `json:"algorithm"` + Impl string `json:"impl"` + Version string `json:"version"` + CreatedAt time.Time `json:"created_at"` + Meta map[string]any `json:"meta"` + Vectorizer map[string]float64 `json:"vectorizer"` + OrderedVocab []string `json:"ordered_vocab"` + Weights []float64 `json:"weights"` +} diff --git a/core/scoring.go b/core/scoring.go new file mode 100644 index 0000000..9896c80 --- /dev/null +++ b/core/scoring.go @@ -0,0 +1,14 @@ +// Score conversion utilities. +// +// ScoreToScale: Maps probability (0-1) to user-friendly 1-10 scale. +// Why: Users understand "8/10" better than "0.82 probability". +package core + +import "math" + +// ScoreToScale turns probability into 1-10 display score +func ScoreToScale(rawScore, threshold float64) int { + k := 10.0 + adjustedScore := 1.0 / (1.0 + math.Exp(-k*(rawScore-threshold))) + return int(math.Round(1.0 + (adjustedScore * 9.0))) +} diff --git a/core/text.go b/core/text.go new file mode 100644 index 0000000..ef4f861 --- /dev/null +++ b/core/text.go @@ -0,0 +1,36 @@ +// Text processing for RSS feed content. +// Used for web UI previews and search indexing - not ML (title-only scoring). +package core + +import ( + "regexp" + "strings" +) + +// CleanFeedContent strips HTML, normalizes whitespace, truncates to 5KB +func CleanFeedContent(content string) string { + if content == "" { + return "" + } + + content = StripHTMLTags(content) + content = NormalizeSpace(content) + + maxLength := 5000 + if len(content) > maxLength { + content = content[:maxLength] + "..." + } + + return content +} + +// StripHTMLTags removes HTML tags +func StripHTMLTags(content string) string { + re := regexp.MustCompile(`<[^>]*>`) + return re.ReplaceAllString(content, "") +} + +// NormalizeSpace collapses whitespace and trims +func NormalizeSpace(s string) string { + return strings.Join(strings.Fields(strings.TrimSpace(s)), " ") +} diff --git a/core/types.go b/core/types.go new file mode 100644 index 0000000..3bfa311 --- /dev/null +++ b/core/types.go @@ -0,0 +1,84 @@ +// Core type definitions for article filtering. +// +// Article: Represents paper with metadata, URL, title, optional content. +// +// Score, LabelPositive, Classification for ML pipeline state. +// +// Config: Application settings (timeouts, user agent, enrich). +// Command: Interface for CLI subcommands (train, scan, serve). +package core + +import ( + "io" + "time" +) + +// Article represents a single article with enriched metadata and scoring. 
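+// Serialized as JSON; training input is JSONL, one object per line, e.g.:
+//
+//	{"title": "Example article title", "url": "https://example.com/paper"}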
+type Article struct { + // Basic article information + Title string `json:"title"` + Content string `json:"content,omitempty"` + URL string `json:"url"` + + // Enrichment metadata + FetchedAt *time.Time `json:"fetched_at,omitempty"` + PublishedAt *time.Time `json:"published_at,omitempty"` + Source string `json:"source,omitempty"` + + // Machine learning fields + Score *float64 `json:"score,omitempty"` + LabelPositive *bool `json:"label_positive,omitempty"` + Classification string `json:"classification,omitempty"` + + // Additional metadata + Authors []string `json:"authors,omitempty"` + Journal string `json:"journal,omitempty"` + Year *int `json:"year,omitempty"` + DOI string `json:"doi,omitempty"` + + // Raw extracted text from APIs or HTML + // Fields that may populate Title/Content + RawTitle string `json:"raw_title,omitempty"` + RawContent string `json:"raw_content,omitempty"` +} + +// Config represents the application configuration. +type Config struct { + // Default model and threshold + Defaults struct { + Model string `json:"model"` + Threshold *float64 `json:"threshold"` + EventsOut string `json:"events_out"` + } `json:"defaults"` + + // HTTP behavior + UserAgent string `json:"user_agent"` + ContactEmail string `json:"contact_email"` + + // Enrichment settings + Enrich struct { + MinTitleLength int `json:"min_title_length"` + ChunkSize int `json:"chunk_size"` + } `json:"enrich"` + + // API provider settings + Providers struct { + SemanticScholar struct { + APIKey string `json:"api_key"` + } `json:"semantic_scholar"` + } `json:"providers"` +} + +// Command defines the interface that all CLI subcommands must implement. +type Command interface { + // Name returns the command name (e.g., "train", "scan", "clean"). + Name() string + + // Init parses command-line arguments and initializes the command. + // It should return flag.ErrHelp if --help was requested. + Init(args []string) error + + // Run executes the command, reading from stdin and writing to stdout. + // The command should handle its own error reporting to stderr. 
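+	//
+	// A dispatch sketch (the real wiring lives in main.go; pickCommand and
+	// the argument slicing are illustrative only):
+	//
+	//	cmd := pickCommand(os.Args[1]) // hypothetical helper
+	//	if err := cmd.Init(os.Args[2:]); err == nil {
+	//		err = cmd.Run(os.Stdin, os.Stdout)
+	//	}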
+ Run(stdin io.Reader, stdout io.Writer) error +} @@ -0,0 +1,19 @@ +module scholscan + +go 1.25.1 + +require ( + github.com/PuerkitoBio/goquery v1.10.3 + github.com/mmcdole/gofeed v1.3.0 +) + +require ( + github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/stretchr/testify v1.10.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/text v0.28.0 // indirect +) @@ -0,0 +1,96 @@ +github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= +github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4= +github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE= +github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk= +github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod 
h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/justfile b/justfile new file mode 100644 index 0000000..eabf06c --- /dev/null +++ b/justfile @@ -0,0 +1,39 @@ +# ScholScan Go Implementation + +# Default recipe +default: + @just --list + +# Build the binary +build: + go build -o scholscan . + +# Install to system (optional) +install: + go install . + +# Run tests +test: + go test ./... + +# Clean cache (only works if running from project directory) +clean-cache: + ./scholscan clean + +# Format Go code +fmt: + go fmt ./... 
+
+# Run linter (requires golangci-lint)
+lint:
+	golangci-lint run
+
+# Example: Train model from articles and RSS feeds (provide your own paths)
+example-train articles feeds:
+	@mkdir -p /tmp/scholscan
+	./scholscan train {{articles}} --rss-feeds {{feeds}} > /tmp/scholscan/model.json
+	@echo "Model saved to /tmp/scholscan/model.json"
+
+# Example: Scan with trained model (provide your own paths)
+example-scan model url:
+	./scholscan scan --model {{model}} --url {{url}}
diff --git a/main.go b/main.go
new file mode 100644
--- /dev/null
+++ b/main.go
@@ -0,0 +1,83 @@
+// scholscan command-line tool.
+// This is the main entry point; commands are implemented in cmds/
+// and shared logic in core/.
+package main
+
+import (
+	"errors"
+	"flag"
+	"fmt"
+	"os"
+	"scholscan/cmds"
+	"scholscan/core"
+)
+
+func main() {
+	if len(os.Args) < 2 {
+		printHelp()
+		os.Exit(1)
+	}
+
+	cmdName := os.Args[1]
+	args := os.Args[2:]
+
+	// handle explicit help requests
+	if cmdName == "help" || cmdName == "--help" || cmdName == "-h" {
+		printHelp()
+		return
+	}
+
+	// map the subcommand name to its implementation
+	var cmd core.Command
+	switch cmdName {
+	case "train":
+		cmd = &cmds.TrainCommand{}
+	case "scan":
+		cmd = &cmds.ScanCommand{}
+	case "serve":
+		cmd = &cmds.ServeCommand{}
+	default:
+		fmt.Fprintf(os.Stderr, "Unknown command: %s\n\n", cmdName)
+		printHelp()
+		os.Exit(1)
+	}
+
+	// initialize the command, then run it
+	if err := cmd.Init(args); err != nil {
+		if errors.Is(err, flag.ErrHelp) {
+			os.Exit(0)
+		}
+		fmt.Fprintf(os.Stderr, "Error initializing %s command: %v\n", cmdName, err)
+		os.Exit(1)
+	}
+
+	if err := cmd.Run(os.Stdin, os.Stdout); err != nil {
+		fmt.Fprintf(os.Stderr, "Error running %s command: %v\n", cmdName, err)
+		os.Exit(1)
+	}
+}
+
+func printHelp() {
+	fmt.Printf(`scholscan <command> [arguments]
+
+A command-line tool for filtering articles based on learned user preferences.
+
+Commands:
+  train    Train a model from positives and RSS feeds
+  scan     Filter articles using a trained model
+  serve    Start HTTP server with filtered RSS and scoring API
+
+Usage:
+  scholscan train POSITIVES_FILE --rss-feeds RSS_FEEDS_FILE > model.json
+  scholscan scan --url RSS_URL --model MODEL > results.jsonl
+  scholscan serve --model MODEL --rss-world RSS_FEEDS_FILE  # Start server
+  scholscan serve --title "My Custom ScholScan"             # Custom title for web interface
+  scholscan help                                            # Show this help message
+
+Examples:
+  scholscan train positives.jsonl --rss-feeds rss_world.txt > model.json
+  scholscan scan --url "https://feeds.reuters.com/reuters/topNews" --model model.json
+  scholscan serve --port 8080 --model model.json --rss-world rss_world.txt
+
+`)
+}
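Side note on extending the CLI: a new subcommand only has to satisfy the three-method core.Command interface defined in core/types.go above and be registered in main()'s switch. A minimal sketch, using a hypothetical "version" command that is not part of this commit:

package cmds

import (
	"flag"
	"fmt"
	"io"
)

// VersionCommand is an illustrative subcommand showing the core.Command contract.
type VersionCommand struct {
	verbose bool
}

// Name returns the subcommand name used by main()'s dispatch switch.
func (c *VersionCommand) Name() string { return "version" }

// Init parses flags; flag.ContinueOnError makes Parse return flag.ErrHelp
// on --help, which main() treats as a clean exit.
func (c *VersionCommand) Init(args []string) error {
	fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	fs.BoolVar(&c.verbose, "verbose", false, "print extra build details")
	return fs.Parse(args)
}

// Run does the work, honoring the stdin/stdout contract of core.Command.
func (c *VersionCommand) Run(stdin io.Reader, stdout io.Writer) error {
	fmt.Fprintln(stdout, "scholscan v0.1.0")
	if c.verbose {
		fmt.Fprintln(stdout, "built from source; see git log for details")
	}
	return nil
}

main() would then gain a matching case in its switch: case "version": cmd = &cmds.VersionCommand{}.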

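For orientation before reading core/ml.go (not shown on this page): given a ModelEnvelope, scan-time scoring under the standard TF-IDF-plus-logistic-regression formulation reduces to a weighted dot product passed through a sigmoid. A rough sketch with illustrative names that do not match the actual core/ml.go API (n-gram handling and any bias term omitted):

package main

import (
	"fmt"
	"math"
	"strings"
)

// scoreTitle is an illustrative helper, not the real core/ml.go code:
// term frequency over lowercased title tokens, scaled by the stored IDF,
// dotted with the trained weights, squashed to a probability in (0, 1).
func scoreTitle(title string, idf map[string]float64, vocab []string, weights []float64) float64 {
	tf := make(map[string]float64)
	for _, tok := range strings.Fields(strings.ToLower(title)) {
		tf[tok]++
	}
	var z float64
	for i, term := range vocab { // weights[i] pairs with vocab[i], as in ModelEnvelope
		z += weights[i] * tf[term] * idf[term]
	}
	return 1.0 / (1.0 + math.Exp(-z)) // logistic link
}

func main() {
	// toy model, invented values
	vocab := []string{"quantum", "sensing"}
	idf := map[string]float64{"quantum": 1.2, "sensing": 1.5}
	weights := []float64{0.9, 0.4}
	fmt.Printf("p = %.3f\n", scoreTitle("Quantum sensing advances", idf, vocab, weights))
}

With these toy numbers the probability comes out around 0.84; ScoreToScale in core/scoring.go would then map that onto the 1-10 display scale.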