aboutsummaryrefslogtreecommitdiff
path: root/main.go
blob: 1c286ea1242191bbac5b06820d9be337ad5c13e4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
// Command scholfetch converts URLs into articles for scholscan.
// It reads URLs from stdin (one per line) and writes Article records to
// stdout as JSONL. Processing details are logged to scholfetch.log.
package main

import (
	"bufio"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"strings"
)

// Article is one output record; the struct tags define its JSON shape.
type Article struct {
	// Title is emitted under the "title" key.
	Title string `json:"title"`

	// Content is optional because it is expensive to fetch. The "content"
	// key is left out of the JSON entirely when the value is empty.
	Content string `json:"content,omitempty"`

	// URL is emitted under the "url" key.
	URL string `json:"url"`

	// Route is internal bookkeeping for which handler succeeded. It is
	// never serialized.
	Route string `json:"-"`
}

// Result summarizes a processing run.
// NOTE(review): main reads ArticlesWritten and Errors from the value returned
// by ProcessURLsWithConfig, so this is presumably that return type - confirm.
type Result struct {
	// NOTE(review): presumably the input URLs and the positions of the ones
	// that failed; neither field is read in main - confirm against the
	// code that fills them in.
	Urls           []string
	FailureIndices []int

	// Counters with the same names are printed in main's final
	// "Finished: ..." line.
	ArticlesWritten int
	Errors          int
}

// main is the scholfetch entry point.
//
// It parses the --with-content and --verbose flags, reads the URL list from
// stdin, and passes the URLs to ProcessURLsWithConfig together with a JSON
// encoder bound to stdout and the freshly created scholfetch.log file. Status
// messages and the final summary go to stderr so stdout carries only the
// encoder's output. The process exits with status 1 when positional arguments
// are present or the log file cannot be created.
func main() {
	// Help text printed before and after the generated flag list.
	const usageIntro = `Usage: scholfetch [options] < urls.txt > articles.jsonl

Converts URLs to Article JSONL format for scholscan processing.

Default mode: Title-only extraction (fast)
Optional mode: Full content extraction with --with-content

Input:  Text file with one URL per line (stdin)
Output: Article JSONL (stdout)

Options:
`
	const usageOutro = `
Examples:
  # Title-only mode (default)
  cat urls.txt | scholfetch > articles.jsonl
  
  # With full content
  cat urls.txt | scholfetch --with-content > articles.jsonl

Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits.
`

	flags := flag.NewFlagSet("scholfetch", flag.ExitOnError)
	withContent := flags.Bool("with-content", false, "Fetch full article content (slower)")
	verbose := flags.Bool("verbose", false, "Show progress information")
	flags.Usage = func() {
		out := flags.Output()
		fmt.Fprint(out, usageIntro)
		flags.PrintDefaults()
		fmt.Fprint(out, usageOutro)
	}

	// With flag.ExitOnError the flag package exits on a bad flag by itself,
	// so this branch is only a safeguard.
	if err := flags.Parse(os.Args[1:]); err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}

	// Positional arguments are not supported; URLs come from stdin only.
	if extra := flags.Args(); len(extra) > 0 {
		fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", extra)
		os.Exit(1)
	}

	// The logger is silent unless --verbose asks for stderr output.
	logger := log.New(io.Discard, "", 0)
	if *verbose {
		logger = log.New(os.Stderr, "", log.LstdFlags)
	}

	// The config carries the flag values to the URL processing code.
	cfg := NewConfigWithLogger(logger)
	cfg.WithContent = *withContent
	cfg.Verbose = *verbose

	urls := readURLs(os.Stdin)

	// Tell the user which Semantic Scholar rate limits apply.
	keyNotice := "Semantic Scholar API key not set: using public rate limits."
	if cfg.S2APIKey != "" {
		keyNotice = "Semantic Scholar API key detected: using authenticated rate limits."
	}
	fmt.Fprintln(os.Stderr, keyNotice)

	// Processing details go to a file rather than stderr to keep the
	// terminal output short.
	logFile, err := os.Create("scholfetch.log")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err)
		os.Exit(1)
	}
	defer logFile.Close()

	fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), *withContent)
	fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log")

	// All of the real work happens here.
	result := ProcessURLsWithConfig(urls, cfg, json.NewEncoder(os.Stdout), logFile)

	// Final statistics, again on stderr.
	fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors)
	fmt.Fprintln(os.Stderr, "See scholfetch.log for details")
}

// readURLs collects URLs from r, one per line.
//
// Each line is trimmed of surrounding whitespace. Blank lines and lines that
// start with '#' after trimming are skipped. The result is nil when no URL
// was found.
//
// If scanning stops early (a read error, or a line longer than
// bufio.Scanner's token limit), the URLs gathered so far are still returned
// and a warning is written to stderr, so a truncated input list does not go
// unnoticed.
func readURLs(r io.Reader) []string {
	var urls []string
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		urls = append(urls, line)
	}
	// Scan returns false on both EOF and failure; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "Warning: stopped reading URLs after %d entries: %v\n", len(urls), err)
	}
	return urls
}