// Scan command: filters articles using a trained model.
//
// Takes articles from an RSS feed, plain text, or JSONL, scores them, and
// outputs those that pass the threshold. Processing happens in batches
// (default 50) so results stream out continuously.
package cmds

import (
	"bufio"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"github.com/mmcdole/gofeed"

	"scholscan/core"
)

// ============================================================================
// ┏━╸┏━┓┏┳┓┏┳┓┏━┓┏┓╻╺┳┓
// ┃ ┃ ┃┃┃┃┃┃┃┣━┫┃┗┫ ┃┃
// ┗━╸┗━┛╹ ╹╹ ╹╹ ╹╹ ╹╺┻┛
// ============================================================================

// ScanCommand scores articles with a trained model and outputs the ones whose
// score meets the threshold.
type ScanCommand struct {
	URL            string
	FromText       bool
	FromArticles   bool
	ModelPath      string
	Threshold      string
	MinTitleLength int
	ChunkSize      int
	EventsOut      string
	MetricsOut     string
	Verbose        bool
}

func (c *ScanCommand) Name() string { return "scan" }

func (c *ScanCommand) Init(args []string) error {
	fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	fs.Usage = func() {
		fmt.Fprint(fs.Output(), `Usage: scholscan scan [options]

Fetches articles, scores them with a model, and outputs those scoring at or
above the threshold.

Source options (exactly one required):
  --url             Fetch articles from RSS/Atom feed
  --from-text       Extract URLs from text on stdin
  --from-articles   Use Article JSONL from stdin directly

Model and filtering:
  --model           Path to trained model JSON file (required)
  --threshold       Score threshold (if not provided, uses model's recommended threshold)

Other options:
`)
		fs.PrintDefaults()
		fmt.Fprint(fs.Output(), `
Examples:
  scholscan scan --url "http://some.blog/rss.xml" --model model.json > interesting.jsonl
  echo "see https://example.com" | scholscan scan --from-text --model model.json
  cat articles.jsonl | scholscan scan --from-articles --model model.json
`)
	}

	fs.StringVar(&c.URL, "url", "", "RSS/Atom feed URL to fetch")
	fs.BoolVar(&c.FromText, "from-text", false, "Extract URLs from text on stdin")
	fs.BoolVar(&c.FromArticles, "from-articles", false, "Use Article JSONL from stdin")
	fs.StringVar(&c.ModelPath, "model", "", "Path to trained model JSON file (required)")
	fs.StringVar(&c.Threshold, "threshold", "", "Score threshold for filtering (if not provided, uses model's recommended threshold)")
	fs.IntVar(&c.MinTitleLength, "min-title-length", core.MinTitleLength, "Minimum title length to consider valid")
	fs.IntVar(&c.ChunkSize, "chunk-size", core.DefaultChunkSize, "Number of articles to process in each batch")
	fs.StringVar(&c.EventsOut, "events-out", "events.jsonl", "Write per-article events to a JSONL file")
	fs.StringVar(&c.MetricsOut, "metrics-out", "metrics.json", "Write summary metrics to a JSON file")
	fs.BoolVar(&c.Verbose, "verbose", false, "Show progress information")

	if err := fs.Parse(args); err != nil {
		return err
	}
	if fs.NArg() != 0 {
		return fmt.Errorf("unexpected arguments provided: %v", fs.Args())
	}

	// Exactly one source option is required.
	sourceCount := 0
	if c.URL != "" {
		sourceCount++
	}
	if c.FromText {
		sourceCount++
	}
	if c.FromArticles {
		sourceCount++
	}
	if sourceCount == 0 {
		return fmt.Errorf("exactly one source option must be specified: --url, --from-text, or --from-articles")
	}
	if sourceCount > 1 {
		return fmt.Errorf("only one source option may be specified: --url, --from-text, or --from-articles")
	}

	if c.ModelPath == "" {
		return fmt.Errorf("--model flag is required")
	}
	// Prevent directory traversal in the model path.
	if strings.Contains(filepath.Clean(c.ModelPath), "..") {
		return fmt.Errorf("invalid model path: directory traversal not allowed")
	}

	// A non-positive chunk size would make the processing loop spin forever.
	if c.ChunkSize <= 0 {
		return fmt.Errorf("--chunk-size must be positive")
	}

	if c.URL != "" {
		if _, err := url.Parse(c.URL); err != nil {
			return fmt.Errorf("invalid URL format: %w", err)
		}
	}
	return nil
}
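// The sketch below illustrates programmatic use of the command outside the
// CLI dispatcher. It is illustrative only, not part of any documented API
// surface; the caller is assumed to wire stdin and stdout itself:
//
//	cmd := &ScanCommand{}
//	if err := cmd.Init([]string{"--from-articles", "--model", "model.json"}); err != nil {
//		log.Fatal(err)
//	}
//	if err := cmd.Run(os.Stdin, os.Stdout); err != nil {
//		log.Fatal(err)
//	}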
// Run runs the scan: load the model, decide on a threshold, get articles,
// then score them in chunks. We bail out early on config problems but try to
// keep going even if some articles fail to fetch.
func (c *ScanCommand) Run(stdin io.Reader, stdout io.Writer) error {
	if c.Verbose {
		log.SetOutput(os.Stderr)
		log.Println("Starting scan workflow...")
		log.Printf("Source: %v", c.getSourceDescription())
		log.Printf("Model: %s", c.ModelPath)
	}

	model, err := c.loadModel()
	if err != nil {
		return fmt.Errorf("failed to load model: %w", err)
	}

	threshold, err := c.getThreshold(model)
	if err != nil {
		return fmt.Errorf("failed to determine threshold: %w", err)
	}
	if c.Verbose {
		log.Printf("Using threshold: %.3f", threshold)
	}

	var articles []*core.Article
	if c.FromArticles {
		articles, err = c.readArticlesFromStdin(stdin)
	} else {
		articles, err = c.fetchArticles(stdin)
	}
	if err != nil {
		return fmt.Errorf("failed to get articles: %w", err)
	}
	if c.Verbose {
		log.Printf("Processing %d articles", len(articles))
	}

	// Process articles in chunks.
	return c.processArticles(articles, model, threshold, stdout)
}

// ============================================================================
// ┏┳┓┏━┓╺┳┓┏━╸╻ ┏┓ ┏━╸┏━┓┏┓╻┏━╸╻┏━╸
// ┃┃┃┃ ┃ ┃┃┣╸ ┃ ┃╺╋╸ ┃ ┃ ┃┃┗┫┣╸ ┃┃╺┓
// ╹ ╹┗━┛╺┻┛┗━╸┗━╸ ┗━┛ ┗━╸┗━┛╹ ╹╹ ╹┗━┛
// ============================================================================

func (c *ScanCommand) getSourceDescription() string {
	if c.URL != "" {
		return fmt.Sprintf("RSS feed: %s", c.URL)
	}
	if c.FromText {
		return "text from stdin"
	}
	if c.FromArticles {
		return "articles from stdin"
	}
	return "unknown"
}

// loadModel reads and parses the model JSON file.
// The envelope contains weights, vocabulary, and optionally a recommended threshold.
func (c *ScanCommand) loadModel() (*core.ModelEnvelope, error) {
	f, err := os.Open(c.ModelPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open model file %s: %w", c.ModelPath, err)
	}
	defer f.Close()

	var model core.ModelEnvelope
	if err := json.NewDecoder(f).Decode(&model); err != nil {
		return nil, fmt.Errorf("failed to decode model: %w", err)
	}
	return &model, nil
}

// getThreshold resolves the score threshold: an explicit --threshold flag
// wins, then the model's recommended threshold, then the package default.
// A provided but unparsable flag value is an error rather than being
// silently ignored.
func (c *ScanCommand) getThreshold(model *core.ModelEnvelope) (float64, error) {
	if c.Threshold != "" {
		threshold, err := strconv.ParseFloat(c.Threshold, 64)
		if err != nil {
			return 0, fmt.Errorf("invalid threshold value %q: %w", c.Threshold, err)
		}
		return threshold, nil
	}
	if model.Meta != nil {
		if meta, ok := model.Meta["recommended_threshold"].(float64); ok {
			return meta, nil
		}
	}
	return core.DefaultScoreThreshold, nil
}
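// Threshold resolution, illustrated (the numeric values are hypothetical):
//
//	--threshold "0.7" given                        -> 0.7 (explicit flag wins)
//	no flag, meta "recommended_threshold" = 0.65   -> 0.65
//	no flag, no usable metadata                    -> core.DefaultScoreThreshold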
// ============================================================================
// ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸ ┏━┓┏━┓┏━╸┏━┓
// ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓┣┳┛┃ ┗━┓
// ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸ ┗━┛╹┗╸┗━╸┗━┛
// ============================================================================

// fetchArticles dispatches to the configured non-JSONL source. The stdin
// reader is threaded through so --from-text reads the same stream Run was
// given, which keeps the command testable.
func (c *ScanCommand) fetchArticles(stdin io.Reader) ([]*core.Article, error) {
	if c.FromText {
		return c.extractURLsFromText(stdin)
	}
	if c.URL != "" {
		return c.fetchRSSFeed(c.URL)
	}
	return nil, fmt.Errorf("no valid source specified")
}

// extractURLsFromText pulls URLs from plain text on stdin.
// We create minimal Article objects since only the URL is needed for scoring.
func (c *ScanCommand) extractURLsFromText(stdin io.Reader) ([]*core.Article, error) {
	var urls []string
	s := bufio.NewScanner(stdin)
	for s.Scan() {
		// Treat any whitespace-separated token with an http(s) scheme as a URL.
		for _, field := range strings.Fields(s.Text()) {
			if strings.HasPrefix(field, "http://") || strings.HasPrefix(field, "https://") {
				urls = append(urls, field)
			}
		}
	}

	// Create minimal Article objects for the extracted URLs.
	articles := make([]*core.Article, len(urls))
	for i, u := range urls {
		articles[i] = &core.Article{
			URL:     u,
			Title:   fmt.Sprintf("Article from %s", u),
			Content: "",
		}
	}
	return articles, s.Err()
}

// fetchRSSFeed fetches and parses a single RSS/Atom feed, bounded by
// core.DefaultHTTPTimeout. Items with short titles are skipped since they're
// usually noise or truncated.
func (c *ScanCommand) fetchRSSFeed(feedURL string) ([]*core.Article, error) {
	ctx, cancel := context.WithTimeout(context.Background(), core.DefaultHTTPTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, feedURL, nil)
	if err != nil {
		return nil, fmt.Errorf("error building request: %w", err)
	}
	req.Header.Set("User-Agent", core.PoliteUserAgent)

	client := &http.Client{Timeout: core.DefaultHTTPTimeout}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("error fetching %s: %w", feedURL, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d from %s", resp.StatusCode, feedURL)
	}

	// Parse the feed straight off the response body.
	fp := gofeed.NewParser()
	feed, err := fp.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("error parsing feed from %s: %w", feedURL, err)
	}

	var articles []*core.Article
	for _, item := range feed.Items {
		article := &core.Article{
			URL:   item.Link,
			Title: strings.TrimSpace(item.Title),
		}
		if len(article.Title) >= c.MinTitleLength {
			articles = append(articles, article)
		}
	}
	return articles, nil
}

// readArticlesFromStdin reads Article objects from JSONL on stdin.
// Malformed lines are skipped to allow partial processing of corrupted input.
// Reading line by line (rather than reusing a json.Decoder after a syntax
// error) is what makes skipping safe: a decoder left in an error state would
// return the same error forever.
func (c *ScanCommand) readArticlesFromStdin(stdin io.Reader) ([]*core.Article, error) {
	var articles []*core.Article
	s := bufio.NewScanner(stdin)
	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}
		var article core.Article
		if err := json.Unmarshal([]byte(line), &article); err != nil {
			// Skip malformed lines rather than aborting the whole stream.
			continue
		}
		if len(article.Title) >= c.MinTitleLength {
			articles = append(articles, &article)
		}
	}
	return articles, s.Err()
}
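// Example stdin for --from-articles, one JSON object per line. The field
// names below assume core.Article uses lowercase JSON tags; check the
// struct's actual tags before relying on them:
//
//	{"url":"https://example.com/a","title":"A sufficiently long title"}
//	{"url":"https://example.com/b","title":"Another long enough title"}
//
// Blank lines, malformed JSON, and articles whose titles are shorter than
// --min-title-length are all dropped.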
// ============================================================================
// ┏━┓┏━┓┏━┓┏━╸┏━╸┏━┓┏━┓ ┏━┓┏━┓╺┳╸╻┏━╸╻ ┏━╸┏━┓
// ┣━┛┣┳┛┃ ┃┃ ┣╸ ┗━┓┗━┓ ┣━┫┣┳┛ ┃ ┃┃ ┃ ┣╸ ┗━┓
// ╹ ╹┗╸┗━┛┗━╸┗━╸┗━┛┗━┛ ╹ ╹╹┗╸ ╹ ╹┗━╸┗━╸┗━╸┗━┛
// ============================================================================

// processArticles handles scoring and filtering in batches to keep memory
// usage predictable. Scoring errors don't crash the process: we log them and
// continue with the next article.
func (c *ScanCommand) processArticles(articles []*core.Article, model *core.ModelEnvelope, threshold float64, stdout io.Writer) error {
	vectorizer := core.CreateVectorizerFromModel(model)
	encoder := json.NewEncoder(stdout)

	// Process each batch.
	for i := 0; i < len(articles); i += c.ChunkSize {
		end := i + c.ChunkSize
		if end > len(articles) {
			end = len(articles)
		}
		chunk := articles[i:end]

		if c.Verbose {
			log.Printf("Processing chunk %d-%d of %d articles", i+1, end, len(articles))
		}

		// Vectorize the batch's titles, then score each vector.
		docs := make([]string, len(chunk))
		for j, article := range chunk {
			docs[j] = strings.TrimSpace(article.Title)
		}
		vectors := vectorizer.Transform(docs)

		scores := make([]float64, len(chunk))
		for j, vector := range vectors {
			score, err := core.PredictScore(vector, model.Weights)
			if err != nil {
				log.Printf("Error computing score for article %d: %v", i+j, err)
				scores[j] = 0.0
			} else {
				scores[j] = score
			}
		}

		// Emit every article whose score meets the threshold.
		for j, article := range chunk {
			score := scores[j]
			article.Score = &score
			if score >= threshold {
				if err := encoder.Encode(article); err != nil {
					log.Printf("Error encoding article: %v", err)
				}
			}
		}
	}

	if c.Verbose {
		log.Println("Scan complete")
	}
	return nil
}
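// Chunking example (worked arithmetic, not an additional code path): with
// 120 articles and the default chunk size of 50, the loop above scores the
// slices [0:50], [50:100], and [100:120], so matched articles are written to
// stdout after each batch rather than only once the whole input is scored.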