1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
// Command scholfetch converts URLs into articles for scholscan.
// It reads URLs from stdin and writes Article structs to stdout as JSONL.
// Processing details are logged to scholfetch.log.
package main
import (
"bufio"
"encoding/json"
"flag"
"fmt"
"io"
"log"
"os"
"strings"
)
// Article is the record scholfetch produces for a URL; its JSON form is
// governed by the struct tags below.
type Article struct {
	// Title is written under the "title" key.
	Title string `json:"title"`
	// Content is written under the "content" key and left out of the JSON
	// when empty. It is optional because it is expensive to fetch.
	Content string `json:"content,omitempty"`
	// URL is written under the "url" key.
	URL string `json:"url"`
	// Route is internal bookkeeping recording which handler succeeded; the
	// "-" tag keeps it out of the JSON output.
	Route string `json:"-"`
}
// Result carries the outcome of a processing run. main reads ArticlesWritten
// and Errors from it to print the final summary.
type Result struct {
	// Urls presumably holds the input URLs of the run; confirm against the
	// code that fills it in.
	Urls []string
	// FailureIndices presumably indexes into Urls for inputs that failed;
	// confirm against the code that fills it in.
	FailureIndices []int
	// ArticlesWritten is reported as the number of articles written.
	ArticlesWritten int
	// Errors is reported as the number of errors.
	Errors int
}
// main is the scholfetch entry point. It parses the command-line flags,
// builds the configuration, reads the URL list from stdin, creates
// scholfetch.log, hands everything to ProcessURLsWithConfig (which receives a
// JSON encoder on stdout), and finally prints a summary to stderr.
func main() {
	// Usage text printed before the generated flag defaults.
	const usageIntro = `Usage: scholfetch [options] < urls.txt > articles.jsonl
Converts URLs to Article JSONL format for scholscan processing.
Default mode: Title-only extraction (fast)
Optional mode: Full content extraction with --with-content
Input: Text file with one URL per line (stdin)
Output: Article JSONL (stdout)
Options:
`
	// Usage text printed after the generated flag defaults.
	const usageExamples = `
Examples:
# Title-only mode (default)
cat urls.txt | scholfetch > articles.jsonl
# With full content
cat urls.txt | scholfetch --with-content > articles.jsonl
Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits.
`

	flags := flag.NewFlagSet("scholfetch", flag.ExitOnError)
	fetchContent := flags.Bool("with-content", false, "Fetch full article content (slower)")
	chatty := flags.Bool("verbose", false, "Show progress information")
	flags.Usage = func() {
		out := flags.Output()
		fmt.Fprint(out, usageIntro)
		flags.PrintDefaults()
		fmt.Fprint(out, usageExamples)
	}

	// Reject bad flags and stray positional arguments before doing any work.
	if err := flags.Parse(os.Args[1:]); err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}
	if extra := flags.Args(); len(extra) > 0 {
		fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", extra)
		os.Exit(1)
	}

	// The logger is silent unless --verbose was given, in which case it
	// writes timestamped lines to stderr.
	logger := log.New(io.Discard, "", 0)
	if *chatty {
		logger = log.New(os.Stderr, "", log.LstdFlags)
	}

	// The configuration carries the flag values to the processing code.
	cfg := NewConfigWithLogger(logger)
	cfg.WithContent = *fetchContent
	cfg.Verbose = *chatty

	urls := readURLs(os.Stdin)

	// Tell the user which Semantic Scholar rate limits apply.
	keyNotice := "Semantic Scholar API key not set: using public rate limits."
	if cfg.S2APIKey != "" {
		keyNotice = "Semantic Scholar API key detected: using authenticated rate limits."
	}
	fmt.Fprintln(os.Stderr, keyNotice)

	// Processing details go to a dedicated file so stderr stays readable.
	logFile, err := os.Create("scholfetch.log")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err)
		os.Exit(1)
	}
	defer logFile.Close()

	fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), *fetchContent)
	fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log")

	// The actual work: results are encoded onto stdout.
	result := ProcessURLsWithConfig(urls, cfg, json.NewEncoder(os.Stdout), logFile)

	// Final statistics for the user.
	fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors)
	fmt.Fprintln(os.Stderr, "See scholfetch.log for details")
}
// readURLs collects the URLs found in r, one per line.
//
// Each line is trimmed of surrounding whitespace; blank lines and lines that
// start with '#' after trimming (comments) are skipped. The result is nil when
// no URL is found.
//
// Lines of up to 1 MiB are accepted. If reading stops early (an I/O error or
// a line beyond that limit), the URLs gathered so far are returned and a
// warning is printed to stderr so the truncation is not silent.
func readURLs(r io.Reader) []string {
	// bufio.Scanner's default token limit (64 KiB) would end the scan at the
	// first longer line and drop every URL after it, so raise the ceiling.
	const maxLineBytes = 1 << 20

	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	var urls []string
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		urls = append(urls, line)
	}

	// Scan returns false on both EOF and failure; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "Warning: stopped reading URLs early: %v\n", err)
	}
	return urls
}
|