// scholfetch - URL to article converter for scholscan // takes URLs on stdin, outputs Article structs on stdout (JSONL) // logs everything to scholfetch.log package main import ( "bufio" "encoding/json" "flag" "fmt" "io" "log" "os" "strings" ) type Article struct { Title string `json:"title"` Content string `json:"content,omitempty"` // Optional - expensive to fetch URL string `json:"url"` Route string `json:"-"` // Internal: tracks which handler succeeded } type Result struct { Urls []string FailureIndices []int ArticlesWritten int Errors int } func main() { var withContent bool var verbose bool fs := flag.NewFlagSet("scholfetch", flag.ExitOnError) fs.Usage = func() { fmt.Fprintf(fs.Output(), `Usage: scholfetch [options] < urls.txt > articles.jsonl Converts URLs to Article JSONL format for scholscan processing. Default mode: Title-only extraction (fast) Optional mode: Full content extraction with --with-content Input: Text file with one URL per line (stdin) Output: Article JSONL (stdout) Options: `) fs.PrintDefaults() fmt.Fprint(fs.Output(), ` Examples: # Title-only mode (default) cat urls.txt | scholfetch > articles.jsonl # With full content cat urls.txt | scholfetch --with-content > articles.jsonl Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits. `) } fs.BoolVar(&withContent, "with-content", false, "Fetch full article content (slower)") fs.BoolVar(&verbose, "verbose", false, "Show progress information") // validate args and exit early on err if err := fs.Parse(os.Args[1:]); err != nil { fmt.Fprintf(os.Stderr, "Error: %v\n", err) os.Exit(1) } if fs.NArg() > 0 { fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", fs.Args()) os.Exit(1) } // set up logger var logger *log.Logger if verbose { logger = log.New(os.Stderr, "", log.LstdFlags) } else { logger = log.New(io.Discard, "", 0) } // config controls how URLs are handled and what data is extracted config := NewConfigWithLogger(logger) config.WithContent = withContent config.Verbose = verbose urls := readURLs(os.Stdin) // notify user about S2 key found/not if config.S2APIKey != "" { fmt.Fprintln(os.Stderr, "Semantic Scholar API key detected: using authenticated rate limits.") } else { fmt.Fprintln(os.Stderr, "Semantic Scholar API key not set: using public rate limits.") } // log file for processing info, sep from stderr to keep output clean logFile, err := os.Create("scholfetch.log") if err != nil { fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err) os.Exit(1) } defer logFile.Close() fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), withContent) fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log") encoder := json.NewEncoder(os.Stdout) // DO THE ACTUAL WORK result := ProcessURLsWithConfig(urls, config, encoder, logFile) // report final stats to stderr fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors) fmt.Fprintln(os.Stderr, "See scholfetch.log for details") } // readURLs parses stdin into a URL slice // filters out empty lines and comments (#) func readURLs(r io.Reader) []string { var urls []string scanner := bufio.NewScanner(r) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if line != "" && !strings.HasPrefix(line, "#") { urls = append(urls, line) } } return urls }