about | summary | refs | log | tree | commit | diff
path: root/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'main.go')
-rw-r--r-- main.go 131
1 files changed, 131 insertions, 0 deletions
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1c286ea
--- /dev/null
+++ b/main.go
@@ -0,0 +1,131 @@
+// scholfetch - URL to article converter for scholscan
+// takes URLs on stdin, outputs Article structs on stdout (JSONL)
+// logs everything to scholfetch.log
+package main
+
+import (
+ "bufio"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strings"
+)
+
+type Article struct { // one output record; main's usage text describes the stdout stream as Article JSONL
+ Title string `json:"title"` // always emitted (no omitempty)
+ Content string `json:"content,omitempty"` // left out of the JSON when empty; optional because it is expensive to fetch
+ URL string `json:"url"` // always emitted (no omitempty)
+ Route string `json:"-"` // never serialized (json:"-"); internal: tracks which handler succeeded
+}
+
+type Result struct { // run summary; main receives it from ProcessURLsWithConfig and prints two of its counters
+ Urls []string // presumably the input URLs of the run - confirm against ProcessURLsWithConfig
+ FailureIndices []int // presumably positions in Urls of the URLs that failed - TODO confirm
+ ArticlesWritten int // printed by main as "articles written"
+ Errors int // printed by main as "errors"
+}
+
+func main() { // entry point: parse flags, read URLs from stdin, hand them to ProcessURLsWithConfig, report on stderr
+ var withContent bool // bound to --with-content below
+ var verbose bool // bound to --verbose below
+
+ fs := flag.NewFlagSet("scholfetch", flag.ExitOnError) // ExitOnError: the flag package exits the process itself on a parse failure
+ fs.Usage = func() { // custom help: intro text, generated flag list, then examples
+ fmt.Fprintf(fs.Output(), `Usage: scholfetch [options] < urls.txt > articles.jsonl
+
+Converts URLs to Article JSONL format for scholscan processing.
+
+Default mode: Title-only extraction (fast)
+Optional mode: Full content extraction with --with-content
+
+Input: Text file with one URL per line (stdin)
+Output: Article JSONL (stdout)
+
+Options:
+`)
+ fs.PrintDefaults() // flag list generated from the BoolVar registrations
+ fmt.Fprint(fs.Output(), `
+Examples:
+ # Title-only mode (default)
+ cat urls.txt | scholfetch > articles.jsonl
+
+ # With full content
+ cat urls.txt | scholfetch --with-content > articles.jsonl
+
+Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits.
+`)
+ }
+
+ fs.BoolVar(&withContent, "with-content", false, "Fetch full article content (slower)")
+ fs.BoolVar(&verbose, "verbose", false, "Show progress information")
+
+ // Parse the command line. With flag.ExitOnError the flag package normally exits before Parse returns an error, so this branch is a defensive fallback.
+ if err := fs.Parse(os.Args[1:]); err != nil {
+ fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+ os.Exit(1)
+ }
+
+ if fs.NArg() > 0 { // positional arguments are rejected; URLs are read from stdin only
+ fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", fs.Args())
+ os.Exit(1)
+ }
+
+ // Logger: timestamped lines on stderr when --verbose is set, otherwise everything is discarded.
+ var logger *log.Logger
+ if verbose {
+ logger = log.New(os.Stderr, "", log.LstdFlags)
+ } else {
+ logger = log.New(io.Discard, "", 0)
+ }
+
+ // Build the run configuration (NewConfigWithLogger is defined outside this view) and copy the flag values into it.
+ config := NewConfigWithLogger(logger)
+ config.WithContent = withContent
+ config.Verbose = verbose
+
+ urls := readURLs(os.Stdin) // consumes stdin to EOF before any processing starts
+
+ // Tell the user on stderr whether config.S2APIKey is set (presumably filled from S2_API_KEY, as the usage text suggests - confirm in NewConfigWithLogger).
+ if config.S2APIKey != "" {
+ fmt.Fprintln(os.Stderr, "Semantic Scholar API key detected: using authenticated rate limits.")
+ } else {
+ fmt.Fprintln(os.Stderr, "Semantic Scholar API key not set: using public rate limits.")
+ }
+
+ // Processing details go to scholfetch.log in the working directory, separate from stderr; os.Create truncates an existing file.
+ logFile, err := os.Create("scholfetch.log")
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err)
+ os.Exit(1)
+ }
+ defer logFile.Close() // closed when main returns; the Close error is not checked
+
+ fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), withContent)
+ fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log")
+
+ encoder := json.NewEncoder(os.Stdout) // each Encode call writes one JSON value followed by a newline
+
+ // Do the actual work (ProcessURLsWithConfig is defined outside this view).
+ result := ProcessURLsWithConfig(urls, config, encoder, logFile)
+
+ // Report final stats on stderr. main then returns normally, so the exit status does not reflect result.Errors.
+ fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors)
+ fmt.Fprintln(os.Stderr, "See scholfetch.log for details")
+}
+
+// readURLs reads r line by line and returns every non-blank line that does not start with '#', trimmed of surrounding whitespace.
+// NOTE(review): scanner.Err() is never checked, so a read error or a line over bufio.Scanner's default token limit ends the list early without any report.
+func readURLs(r io.Reader) []string {
+ var urls []string // stays nil when no line qualifies
+ scanner := bufio.NewScanner(r) // default split function: one token per input line
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text()) // trim before filtering so indented comments and whitespace-only lines are caught
+ if line != "" && !strings.HasPrefix(line, "#") { // keep everything except blank lines and '#' comments
+ urls = append(urls, line)
+ }
+ }
+ return urls
+}