1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
// CLIENT LAYER - HTTP AND RATE LIMITING
//
// manages HTTP requests with retry logic and API-specific rate limits.
//
// RATE LIMITS:
// - arXiv: 1 second between requests (enforced to be safe)
// - Semantic Scholar: 100ms between requests (configurable via API key)
//
// STRATEGY:
// - retries on network failures and HTTP 429
// - exponential backoff between attempts: 1s, 2s, 4s, ... (the default of 3 total attempts waits 1s, then 2s)
// - all delays respect context cancellation
package main
import (
"context"
"net/http"
"os"
"time"
)
// HTTPClient bundles a standard *http.Client with the settings shared by every
// outgoing request: the default User-Agent, the per-API pause lengths and the
// attempt budget used by Do.
type HTTPClient struct {
	// client performs the actual network round trips.
	client *http.Client
	// userAgent is applied by Do to requests that carry no User-Agent header.
	userAgent string
	// arxivDelay and s2Delay are the pauses taken by RateLimitArxiv and
	// RateLimitS2 respectively.
	arxivDelay, s2Delay time.Duration
	// maxRetries bounds the total number of attempts Do makes per request.
	maxRetries int
}
// NewHTTPClient returns an HTTPClient populated with the default settings:
// a 30-second timeout on the underlying http.Client, the scholfetch
// User-Agent, a 1s arXiv pause, a 100ms Semantic Scholar pause and a budget
// of 3 attempts per request.
func NewHTTPClient() *HTTPClient {
	inner := new(http.Client)
	inner.Timeout = 30 * time.Second

	wrapper := new(HTTPClient)
	wrapper.client = inner
	wrapper.userAgent = "scholfetch/1.0 (+https://samsci.com)"
	wrapper.arxivDelay = time.Second
	wrapper.s2Delay = 100 * time.Millisecond
	wrapper.maxRetries = 3
	return wrapper
}
// doError is the error type for failures that Do reports itself, as opposed
// to errors handed back by the underlying transport.
type doError string

// Error implements the error interface.
func (e doError) Error() string { return string(e) }

const (
	// errRateLimitExceeded is returned by Do when the final attempt was still
	// answered with HTTP 429.
	errRateLimitExceeded = doError("rate limited: server still returned HTTP 429 on the final attempt")
	// errNoAttempts is returned by Do when maxRetries permits no attempt at all.
	errNoAttempts = doError("no request attempted: maxRetries must be at least 1")
)

// Do sends req, retrying on transport errors and HTTP 429 responses.
//
// The default User-Agent is applied when req carries none. The request is
// attempted at most c.maxRetries times in total. Before each retry the call
// waits 1s, 2s, 4s, ... and returns the context's error immediately if
// req.Context() is done first. If the request has a GetBody function, the
// body is re-created before each retry so the payload is sent again in full.
//
// Any response other than 429 is returned as-is (including other 4xx/5xx
// statuses); the caller owns it and must close its body. When no response can
// be returned the error is always non-nil: the last transport error, or
// errRateLimitExceeded if the final attempt was a 429, or errNoAttempts if
// maxRetries is not positive.
func (c *HTTPClient) Do(req *http.Request) (*http.Response, error) {
	// Requests built by hand may have a nil header map; Set would panic on it.
	if req.Header == nil {
		req.Header = make(http.Header)
	}
	// Only fill in the default User-Agent; a caller-supplied one wins.
	if req.Header.Get("User-Agent") == "" {
		req.Header.Set("User-Agent", c.userAgent)
	}

	// Starts non-nil so a non-positive maxRetries can never yield (nil, nil).
	var lastErr error = errNoAttempts
	for attempt := 0; attempt < c.maxRetries; attempt++ {
		if attempt > 0 {
			// Exponential backoff: 1s, 2s, 4s, ...
			backoff := time.Duration(1<<uint(attempt-1)) * time.Second
			timer := time.NewTimer(backoff)
			select {
			case <-timer.C:
			case <-req.Context().Done():
				timer.Stop()
				return nil, req.Context().Err()
			}

			// The previous attempt may have consumed the body; rewind it.
			if req.GetBody != nil {
				body, err := req.GetBody()
				if err != nil {
					return nil, err
				}
				req.Body = body
			}
		}

		resp, err := c.client.Do(req)
		if err != nil {
			lastErr = err
			continue
		}

		// Retry on 429 (rate limit) but hand every other status to the caller.
		if resp.StatusCode == http.StatusTooManyRequests {
			resp.Body.Close()
			lastErr = errRateLimitExceeded
			continue
		}
		return resp, nil
	}
	return nil, lastErr
}
// RateLimitArxiv blocks for the configured arXiv delay before returning nil.
// If ctx is done before the delay elapses, it returns ctx.Err() instead.
func (c *HTTPClient) RateLimitArxiv(ctx context.Context) error {
	pause := time.NewTimer(c.arxivDelay)
	defer pause.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-pause.C:
		return nil
	}
}
// RateLimitS2 blocks for the configured Semantic Scholar delay before
// returning nil. If ctx is done before the delay elapses, it returns
// ctx.Err() instead.
func (c *HTTPClient) RateLimitS2(ctx context.Context) error {
	wait := time.NewTimer(c.s2Delay)
	defer wait.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-wait.C:
	}
	return nil
}
// Config carries the settings and shared dependencies for scholfetch.
type Config struct {
	// WithContent and Verbose are feature switches; NewConfig leaves both off.
	// NOTE(review): their exact effect is decided by code outside this type.
	WithContent, Verbose bool
	// Logger is left nil by NewConfig and set by NewConfigWithLogger.
	Logger Logger
	// HTTP is the request client; NewConfig fills it with NewHTTPClient().
	HTTP *HTTPClient
	// ArxivBatch defaults to 50 in NewConfig; presumably the number of arXiv
	// entries requested per call (confirm against the arXiv fetch code).
	ArxivBatch int
	// S2APIKey is read from the S2_API_KEY environment variable by NewConfig.
	S2APIKey string
}
// Logger is the minimal logging dependency that can be injected through
// Config. Any type with a matching Printf method satisfies it, including the
// standard library's *log.Logger.
type Logger interface {
	// Printf writes a message built from a fmt-style format string and its
	// arguments.
	Printf(format string, args ...interface{})
}
// NewConfig returns a Config with the default settings: content fetching and
// verbose output off, no logger, a fresh default HTTPClient, an arXiv batch
// size of 50, and the Semantic Scholar API key taken from the S2_API_KEY
// environment variable (empty when unset).
func NewConfig() *Config {
	// WithContent, Verbose and Logger keep their zero values.
	defaults := new(Config)
	defaults.HTTP = NewHTTPClient()
	defaults.ArxivBatch = 50
	defaults.S2APIKey = os.Getenv("S2_API_KEY")
	return defaults
}
// NewConfigWithLogger returns the defaults produced by NewConfig with the
// Logger field set to logger.
func NewConfigWithLogger(logger Logger) *Config {
	conf := NewConfig()
	conf.Logger = logger
	return conf
}
|