diff options
Diffstat (limited to 'main.go')
| -rw-r--r-- | main.go | 131 |
1 files changed, 131 insertions, 0 deletions
@@ -0,0 +1,131 @@ +// scholfetch - URL to article converter for scholscan +// takes URLs on stdin, outputs Article structs on stdout (JSONL) +// logs everything to scholfetch.log +package main + +import ( + "bufio" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "os" + "strings" +) + +type Article struct { + Title string `json:"title"` + Content string `json:"content,omitempty"` // Optional - expensive to fetch + URL string `json:"url"` + Route string `json:"-"` // Internal: tracks which handler succeeded +} + +type Result struct { + Urls []string + FailureIndices []int + ArticlesWritten int + Errors int +} + +func main() { + var withContent bool + var verbose bool + + fs := flag.NewFlagSet("scholfetch", flag.ExitOnError) + fs.Usage = func() { + fmt.Fprintf(fs.Output(), `Usage: scholfetch [options] < urls.txt > articles.jsonl + +Converts URLs to Article JSONL format for scholscan processing. + +Default mode: Title-only extraction (fast) +Optional mode: Full content extraction with --with-content + +Input: Text file with one URL per line (stdin) +Output: Article JSONL (stdout) + +Options: +`) + fs.PrintDefaults() + fmt.Fprint(fs.Output(), ` +Examples: + # Title-only mode (default) + cat urls.txt | scholfetch > articles.jsonl + + # With full content + cat urls.txt | scholfetch --with-content > articles.jsonl + +Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits. +`) + } + + fs.BoolVar(&withContent, "with-content", false, "Fetch full article content (slower)") + fs.BoolVar(&verbose, "verbose", false, "Show progress information") + + // validate args and exit early on err + if err := fs.Parse(os.Args[1:]); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if fs.NArg() > 0 { + fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", fs.Args()) + os.Exit(1) + } + + // set up logger + var logger *log.Logger + if verbose { + logger = log.New(os.Stderr, "", log.LstdFlags) + } else { + logger = log.New(io.Discard, "", 0) + } + + // config controls how URLs are handled and what data is extracted + config := NewConfigWithLogger(logger) + config.WithContent = withContent + config.Verbose = verbose + + urls := readURLs(os.Stdin) + + // notify user about S2 key found/not + if config.S2APIKey != "" { + fmt.Fprintln(os.Stderr, "Semantic Scholar API key detected: using authenticated rate limits.") + } else { + fmt.Fprintln(os.Stderr, "Semantic Scholar API key not set: using public rate limits.") + } + + // log file for processing info, sep from stderr to keep output clean + logFile, err := os.Create("scholfetch.log") + if err != nil { + fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err) + os.Exit(1) + } + defer logFile.Close() + + fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), withContent) + fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log") + + encoder := json.NewEncoder(os.Stdout) + + // DO THE ACTUAL WORK + result := ProcessURLsWithConfig(urls, config, encoder, logFile) + + // report final stats to stderr + fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors) + fmt.Fprintln(os.Stderr, "See scholfetch.log for details") +} + +// readURLs parses stdin into a URL slice +// filters out empty lines and comments (#) +func readURLs(r io.Reader) []string { + var urls []string + scanner := bufio.NewScanner(r) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" && !strings.HasPrefix(line, "#") { + urls = append(urls, line) + } + } + return urls +} |
