// client.go

// CLIENT LAYER - HTTP AND RATE LIMITING
//
// manages HTTP requests with retry logic and API-specific rate limits.
// 
// RATE LIMITS:
// - arXiv: 1 second between requests (enforced to be safe)
// - Semantic Scholar: 100ms between requests (configurable via API key)
//
// STRATEGY:
// - retries on network failures and HTTP 429
// - exponential backoff between attempts: 1s, then 2s (3 attempts by default)
// - all delays respect context cancellation
package main

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"time"
)

// HTTPClient wraps an *http.Client and carries the settings shared by every
// outgoing request: the default User-Agent, the per-API pacing delays, and the
// retry budget used by Do.
type HTTPClient struct {
	client     *http.Client  // underlying client that sends each attempt
	userAgent  string        // User-Agent applied by Do when the request has none
	arxivDelay time.Duration // pause performed by RateLimitArxiv
	s2Delay    time.Duration // pause performed by RateLimitS2
	maxRetries int           // maximum number of attempts Do makes per request
}

// NewHTTPClient returns an HTTPClient configured with the package defaults:
// a 30-second timeout on the underlying http.Client, the scholfetch
// User-Agent, a 1s arXiv delay, a 100ms Semantic Scholar delay, and up to
// 3 attempts per request.
func NewHTTPClient() *HTTPClient {
	const (
		requestTimeout = 30 * time.Second
		defaultAgent   = "scholfetch/1.0 (+https://samsci.com)"
	)

	hc := new(HTTPClient)
	hc.client = &http.Client{Timeout: requestTimeout}
	hc.userAgent = defaultAgent
	hc.arxivDelay = time.Second
	hc.s2Delay = 100 * time.Millisecond
	hc.maxRetries = 3
	return hc
}

// Do sends req, retrying on transport errors and HTTP 429 (rate limit)
// responses.
//
// If req carries no User-Agent header, the client's default is set on it
// (req is modified in place). At most c.maxRetries attempts are made, and
// always at least one. Before each retry Do waits with exponential backoff
// (1s, 2s, 4s, ...); the wait is abandoned with the context's error as soon as
// the request's context is done.
//
// A request that has a body is only retried when req.GetBody can supply a
// fresh copy of it; otherwise Do gives up after the failed attempt.
//
// Any response other than 429 is returned as-is and the caller must close its
// body. When no attempt succeeds, Do returns a nil response together with a
// non-nil error: the last transport error, or a rate-limit error when the
// final attempt was answered with 429.
func (c *HTTPClient) Do(req *http.Request) (*http.Response, error) {
	// Requests built by hand (not via http.NewRequest) may have a nil header
	// map, and Header.Set on a nil map panics.
	if req.Header == nil {
		req.Header = make(http.Header)
	}
	if req.Header.Get("User-Agent") == "" {
		req.Header.Set("User-Agent", c.userAgent)
	}

	// A zero or negative retry budget must still send the request once
	// instead of falling through to a (nil, nil) return.
	attempts := c.maxRetries
	if attempts < 1 {
		attempts = 1
	}
	ctx := req.Context()

	var lastErr error
	for attempt := 0; attempt < attempts; attempt++ {
		if attempt > 0 {
			// The previous attempt consumed the body; it has to be recreated
			// before the request can be sent again.
			if req.Body != nil && req.Body != http.NoBody {
				if req.GetBody == nil {
					// Body cannot be replayed: report the failure we have.
					break
				}
				body, err := req.GetBody()
				if err != nil {
					return nil, fmt.Errorf("rewinding request body for retry: %w", err)
				}
				req.Body = body
			}

			// Exponential backoff: 1s before the 2nd attempt, 2s before the
			// 3rd, and so on.
			backoff := time.Duration(1<<uint(attempt-1)) * time.Second
			timer := time.NewTimer(backoff)
			select {
			case <-timer.C:
			case <-ctx.Done():
				timer.Stop()
				return nil, ctx.Err()
			}
		}

		resp, err := c.client.Do(req)
		if err != nil {
			lastErr = err
			continue
		}

		// Only 429 is retried; every other status is the caller's to handle.
		if resp.StatusCode == http.StatusTooManyRequests {
			resp.Body.Close()
			// Record an error so that exhausting the budget on 429 never
			// yields a nil response with a nil error.
			lastErr = fmt.Errorf("rate limited: HTTP 429 after %d attempt(s)", attempt+1)
			continue
		}

		return resp, nil
	}

	return nil, lastErr
}

// RateLimitArxiv blocks for the configured arXiv delay and then returns nil.
// If ctx is done before the delay elapses, it stops waiting and returns
// ctx.Err().
func (c *HTTPClient) RateLimitArxiv(ctx context.Context) error {
	pause := time.NewTimer(c.arxivDelay)
	defer pause.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-pause.C:
		return nil
	}
}

// RateLimitS2 blocks for the configured Semantic Scholar delay and then
// returns nil. If ctx is done before the delay elapses, it stops waiting and
// returns ctx.Err().
func (c *HTTPClient) RateLimitS2(ctx context.Context) error {
	pause := time.NewTimer(c.s2Delay)
	defer pause.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-pause.C:
		return nil
	}
}

// Config collects the runtime settings for scholfetch.
type Config struct {
	// WithContent and Verbose are boolean options; NewConfig sets both to
	// false.
	WithContent bool
	Verbose     bool

	// Logger receives formatted log output. NewConfig leaves it nil.
	Logger Logger

	// HTTP is the HTTP client wrapper; NewConfig fills it via NewHTTPClient.
	HTTP *HTTPClient

	// ArxivBatch is set to 50 by NewConfig.
	// NOTE(review): presumably the number of arXiv entries per request —
	// confirm against the callers.
	ArxivBatch int

	// S2APIKey is read by NewConfig from the S2_API_KEY environment variable.
	S2APIKey string
}

// Logger is the minimal logging interface accepted by Config, so that callers
// can inject their own implementation. Its single method has the same
// signature as (*log.Logger).Printf, which means a *log.Logger satisfies it.
type Logger interface {
	// Printf logs a message described by format and the arguments v.
	Printf(format string, v ...interface{})
}

// NewConfig returns a Config populated with defaults: WithContent and Verbose
// false, a fresh HTTPClient from NewHTTPClient, ArxivBatch 50, and S2APIKey
// taken from the S2_API_KEY environment variable (empty when unset).
// Logger is left nil.
func NewConfig() *Config {
	// WithContent, Verbose and Logger keep their zero values.
	cfg := new(Config)
	cfg.HTTP = NewHTTPClient()
	cfg.ArxivBatch = 50
	cfg.S2APIKey = os.Getenv("S2_API_KEY")
	return cfg
}

// NewConfigWithLogger returns the same defaults as NewConfig with the Logger
// field set to logger. The logger is stored as given, including nil.
func NewConfigWithLogger(logger Logger) *Config {
	withLogger := NewConfig()
	withLogger.Logger = logger
	return withLogger
}