| -rw-r--r-- | .gitignore         |  10 |
| -rw-r--r-- | README.md          |  83 |
| -rw-r--r-- | arxiv.go           | 196 |
| -rw-r--r-- | client.go          | 133 |
| -rw-r--r-- | go.mod             |  18 |
| -rw-r--r-- | go.sum             |  96 |
| -rw-r--r-- | html.go            | 198 |
| -rw-r--r-- | justfile           |  20 |
| -rw-r--r-- | main.go            | 131 |
| -rw-r--r-- | processor.go       | 295 |
| -rw-r--r-- | routes.go          |  75 |
| -rw-r--r-- | scholar.go         | 217 |
| -rw-r--r-- | scholfetch_test.go | 193 |
| -rw-r--r-- | util.go            |  14 |
14 files changed, 1679 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4a79922
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# Built binaries
+scholfetch
+
+# Test files
+test_urls.txt
+
+# Environment and configuration files
+.env*
+config.*
+secrets.*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9190dc1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,83 @@
+# ScholFetch
+
+URL → Article metadata (JSONL) converter. Fetches titles only by default, for speed.
+
+## Overview
+
+ScholFetch extracts academic article metadata from URLs.
+It supports arXiv, Semantic Scholar, and generic HTML sources.
+The tool outputs structured JSONL suitable for downstream processing by ScholScan (see below).
+
+## Usage
+```bash
+cat urls.txt | scholfetch > articles.jsonl
+# or:
+cat urls.txt | scholfetch --with-content > articles.jsonl
+```
+
+## Monitoring Progress
+
+ScholFetch writes a structured log file `scholfetch.log` during processing. Monitor it in another terminal:
+
+```bash
+tail -f scholfetch.log
+```
+
+## Semantic Scholar API key
+
+Get higher rate limits by setting your S2 API key (*not required*):
+
+```bash
+export S2_API_KEY="your-key-here"
+cat urls.txt | scholfetch > articles.jsonl
+```
+
+Get your free key at: https://www.semanticscholar.org/product/api
+
+ScholFetch reports on startup whether the key was detected.
+
+## Integration with ScholScan
+
+Once you have structured article data, pipe it to [ScholScan](https://git.samsci.com/scholscan) for ML-based filtering:
+
+```bash
+# Get articles from URLs
+cat urls.txt | scholfetch > articles.jsonl
+
+# Train a classification model
+scholscan train articles.jsonl --rss-feeds feeds.txt > model.json
+
+# Score articles from an RSS feed
+scholscan scan --model model.json --url "https://example.com/feed.rss" > results.jsonl
+```
+
+ScholFetch extracts and enriches article metadata, while ScholScan handles classification. Together they provide a complete pipeline for filtering academic literature.
+
+## Input/Output
+- Input: URLs (one per line) on stdin
+- Output: JSONL with `title` and `url` fields (stdout)
+- Add `--with-content` to include a `content` field
+
+## How it works
+
+URLs are routed by pattern (arXiv IDs → arXiv API, DOIs → Semantic Scholar, everything else → HTML scraping).
+Requests are batched in chunks of 50 for efficiency; if a batch fails, ScholFetch falls back to individual requests. Each API is rate limited separately.
+
+## Code
+
+- `main.go` - reads stdin, sets up flags and output
+- `routes.go` - picks the handler (arxiv/s2/html) for each URL
+- `processor.go` - batching and fallback logic
+- `arxiv.go`, `scholar.go`, `html.go` - the actual extractors
+- `client.go` - HTTP client with retries and rate limiting
+
+## Build and Development
+
+```bash
+just build
+just test
+```
+
+## Roadmap
+
+Future work could integrate Crossref and PubMed fairly easily, especially for the title-only approach.
diff --git a/arxiv.go b/arxiv.go
new file mode 100644
index 0000000..6e7fad5
--- /dev/null
+++ b/arxiv.go
@@ -0,0 +1,196 @@
+// ARXIV HANDLER
+//
+// Uses arXiv's API to fetch article metadata.
+// +// STRATEGY: +// - single requests and batched requests supported +// - uses gofeed to parse Atom XML responses +// - rate limited to 1 request per second (conservative) +// - handles both old (math-ph/0301015) and new (2109.05857) ID formats +package main + +import ( + "bytes" + "context" + "fmt" + "io" + "net/http" + "strings" + + "github.com/mmcdole/gofeed" +) + +const ( + arxivQueryFmt = "http://export.arxiv.org/api/query?id_list=%s" +) + +// fetchArxiv fetches content for a single arXiv article. +func fetchArxiv(ctx context.Context, config *Config, urlStr string) (*Article, error) { + arxivID, err := getArxivIdentifier(urlStr) + if err != nil || arxivID == "" { + return nil, fmt.Errorf("arXiv: invalid URL format, expected arxiv.org/abs/ID: %s", urlStr) + } + + // rate limit + if err := config.HTTP.RateLimitArxiv(ctx); err != nil { + return nil, err + } + + apiURL := fmt.Sprintf(arxivQueryFmt, arxivID) + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to construct arXiv request: %w", err) + } + req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch arXiv feed for ID %s: %w", arxivID, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("arXiv API returned non-200 status for ID %s: %s", arxivID, resp.Status) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read arXiv response body for ID %s: %w", arxivID, err) + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(bytes.NewReader(body)) + if err != nil { + return nil, fmt.Errorf("failed to parse arXiv feed for ID %s: %w", arxivID, err) + } + + if len(feed.Items) == 0 { + return nil, fmt.Errorf("no items found in arXiv feed for ID %s", arxivID) + } + + item := feed.Items[0] + title := normalizeSpace(item.Title) + content := normalizeSpace(item.Description) + + if config.Verbose { + config.Logger.Printf("arXiv single fetch result: ID=%s, Title=%s", arxivID, title) + } + + // del content if not requested + if !config.WithContent { + content = "" + } + + if title == "" { + return nil, fmt.Errorf("no title found for arXiv ID %s", arxivID) + } + + return &Article{ + Title: title, + Content: content, + URL: urlStr, + }, nil +} + +// fetchArxivBatch fetches metadata for a list of arXiv URLs in batches. 
+func fetchArxivBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { + if len(urls) == 0 { + return nil, nil + } + + idToURL := make(map[string]string) + batchIDs := make([]string, 0, len(urls)) + + for _, urlStr := range urls { + id, err := getArxivIdentifier(urlStr) + if err != nil { + continue + } + batchIDs = append(batchIDs, id) + stripped := stripArxivVersion(id) + idToURL[stripped] = urlStr + } + + if len(batchIDs) == 0 { + return nil, nil + } + + var articles []*Article + + for i := 0; i < len(batchIDs); i += config.ArxivBatch { + end := i + config.ArxivBatch + if end > len(batchIDs) { + end = len(batchIDs) + } + + // rate limit + if err := config.HTTP.RateLimitArxiv(ctx); err != nil { + return nil, err + } + + chunk := batchIDs[i:end] + apiURL := fmt.Sprintf(arxivQueryFmt, strings.Join(chunk, ",")) + + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + continue + } + req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") + + resp, err := config.HTTP.Do(req) + if err != nil { + continue + } + + if resp.StatusCode != http.StatusOK { + resp.Body.Close() + continue + } + + body, err := io.ReadAll(resp.Body) + resp.Body.Close() + if err != nil { + continue + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(bytes.NewReader(body)) + if err != nil { + continue + } + + for _, item := range feed.Items { + id, err := getArxivIdentifier(item.GUID) + if err != nil || id == "" { + id, err = getArxivIdentifier(item.Link) + if err != nil || id == "" { + continue + } + } + + title := normalizeSpace(item.Title) + if title == "" { + continue + } + + baseID := stripArxivVersion(id) + originalURL, exists := idToURL[baseID] + if !exists { + continue + } + + content := "" + if config.WithContent { + content = normalizeSpace(item.Description) + } + + articles = append(articles, &Article{ + Title: title, + Content: content, + URL: originalURL, + }) + } + } + + return articles, nil +} diff --git a/client.go b/client.go new file mode 100644 index 0000000..39a3e34 --- /dev/null +++ b/client.go @@ -0,0 +1,133 @@ +// CLIENT LAYER - HTTP AND RATE LIMITING +// +// manages HTTP requests with retry logic and API-specific rate limits. +// +// RATE LIMITS: +// - arXiv: 1 second between requests (enforced to be safe) +// - Semantic Scholar: 100ms between requests (configurable via API key) +// +// STRATEGY: +// - retries on network failures and HTTP 429 +// - exponential backoff: 1s, 2s, 4s +// - all delays respect context cancellation +package main + +import ( + "context" + "net/http" + "os" + "time" +) + +// HTTPClient wraps an HTTP client with common behavior like user agent, +// rate limiting, and retry logic. +type HTTPClient struct { + client *http.Client + userAgent string + arxivDelay time.Duration + s2Delay time.Duration + maxRetries int +} + +// NewHTTPClient creates a new HTTP client wrapper with defaults. +func NewHTTPClient() *HTTPClient { + return &HTTPClient{ + client: &http.Client{ + Timeout: 30 * time.Second, + }, + userAgent: "scholfetch/1.0 (+https://samsci.com)", + arxivDelay: 1 * time.Second, + s2Delay: 100 * time.Millisecond, + maxRetries: 3, + } +} + +// Do performs an HTTP request with retry logic. +// retries on network errors and 429 (rate limit) responses. 
+func (c *HTTPClient) Do(req *http.Request) (*http.Response, error) { + // Set user agent if not already set + if req.Header.Get("User-Agent") == "" { + req.Header.Set("User-Agent", c.userAgent) + } + + var lastErr error + for attempt := 0; attempt < c.maxRetries; attempt++ { + if attempt > 0 { + // Exponential backoff: 1s, 2s, 4s + backoff := time.Duration(1<<uint(attempt-1)) * time.Second + select { + case <-time.After(backoff): + case <-req.Context().Done(): + return nil, req.Context().Err() + } + } + + resp, err := c.client.Do(req) + if err != nil { + lastErr = err + continue + } + + // Retry on 429 (rate limit) but not other errors + if resp.StatusCode == http.StatusTooManyRequests { + resp.Body.Close() + lastErr = nil // Reset error for retryable status code + continue + } + + return resp, nil + } + + return nil, lastErr +} + +// RateLimitArxiv adds a delay for arXiv API requests. +func (c *HTTPClient) RateLimitArxiv(ctx context.Context) error { + select { + case <-time.After(c.arxivDelay): + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +// RateLimitS2 adds a delay for Semantic Scholar API requests. +func (c *HTTPClient) RateLimitS2(ctx context.Context) error { + select { + case <-time.After(c.s2Delay): + return nil + case <-ctx.Done(): + return ctx.Err() + } +} + +// config for scholfetch. +type Config struct { + WithContent bool + Verbose bool + Logger Logger + HTTP *HTTPClient + ArxivBatch int + S2APIKey string +} + +// Logger interface for dependency injection +type Logger interface { + Printf(format string, v ...interface{}) +} + +func NewConfig() *Config { + return &Config{ + WithContent: false, + Verbose: false, + HTTP: NewHTTPClient(), + ArxivBatch: 50, + S2APIKey: os.Getenv("S2_API_KEY"), + } +} + +func NewConfigWithLogger(logger Logger) *Config { + cfg := NewConfig() + cfg.Logger = logger + return cfg +} @@ -0,0 +1,18 @@ +module scholfetch + +go 1.25.5 + +require ( + github.com/PuerkitoBio/goquery v1.11.0 + github.com/mmcdole/gofeed v1.3.0 +) + +require ( + github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/text v0.31.0 // indirect +) @@ -0,0 +1,96 @@ +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4= +github.com/mmcdole/gofeed v1.3.0/go.mod 
h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE= +github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk= +github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync 
v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools 
v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -0,0 +1,198 @@ +// RAW HTML HANDLER +// +// Fallback handler for URLs that don't match arXiv or Semantic Scholar patterns. +// +// STRATEGY: +// - progressive extraction tries multiple metadata sources in order +// - JSON-LD structured data first (highest quality) +// - citation meta tags (scholarly articles) +// - open Graph (social media) +// - basic HTML tags (last resort) +package main + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + + "github.com/PuerkitoBio/goquery" +) + +// fetchRawHTML attempts to fetch article content by parsing HTML metadata. +func fetchRawHTML(ctx context.Context, config *Config, urlStr string) (*Article, error) { + req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) + if err != nil { + return nil, fmt.Errorf("failed to build request for %s: %w", urlStr, err) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch %s: %w", urlStr, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d when fetching %s", resp.StatusCode, urlStr) + } + + doc, err := goquery.NewDocumentFromReader(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML from %s: %w", urlStr, err) + } + + // extract title using various strategies + strategies := []func(*goquery.Document) string{ + extractTitleFromJSONLD, + extractTitleFromCitationMeta, + extractTitleFromOpenGraph, + func(d *goquery.Document) string { return normalizeSpace(d.Find("title").First().Text()) }, + } + + var title string + for _, strategy := range strategies { + title = strategy(doc) + if title != "" { + break + } + } + + if title == "" { + return nil, fmt.Errorf("no title found for %s", urlStr) + } + + article := &Article{ + URL: urlStr, + Title: title, + } + + // only fetch content if requested + if config.WithContent { + contentStrategies := []func(*goquery.Document) string{ + extractContentFromJSONLD, + extractContentFromCitationMeta, + extractContentFromOpenGraph, + extractContentFromBasicMeta, + } + + var content string + for _, strategy := range contentStrategies { + content = strategy(doc) + if content != "" && len(content) > 50 { + break + } + } + + if content != "" { + article.Content = content + } + } + + return article, nil +} + +func extractTitleFromJSONLD(doc *goquery.Document) string { + var title string + doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { + if title != "" { + return + } + var data map[string]interface{} + if json.Unmarshal([]byte(s.Text()), &data) == nil { + t := getStringFromJSONLD(data, []string{"name", "headline", "title"}) + if t != "" { + title = normalizeSpace(t) + } + } + }) + return title +} + +func extractContentFromJSONLD(doc *goquery.Document) string { + var content string + doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { + if content != "" { + return + } + var data map[string]interface{} + if json.Unmarshal([]byte(s.Text()), &data) == nil { + 
d := getStringFromJSONLD(data, []string{"description", "abstract", "summary"}) + if d != "" { + content = normalizeSpace(d) + if len(content) > 5000 { + content = content[:5000] + "..." + } + } + } + }) + return content +} + +func extractTitleFromCitationMeta(doc *goquery.Document) string { + title, _ := doc.Find("meta[name='citation_title']").Attr("content") + return normalizeSpace(title) +} + +func extractContentFromCitationMeta(doc *goquery.Document) string { + content, _ := doc.Find("meta[name='citation_abstract']").Attr("content") + return normalizeSpace(content) +} + +func extractTitleFromOpenGraph(doc *goquery.Document) string { + title, _ := doc.Find("meta[property='og:title']").Attr("content") + return normalizeSpace(title) +} + +func extractContentFromOpenGraph(doc *goquery.Document) string { + content, _ := doc.Find("meta[property='og:description']").Attr("content") + return normalizeSpace(content) +} + +func extractContentFromBasicMeta(doc *goquery.Document) string { + contentRaw, _ := doc.Find("meta[name='description']").Attr("content") + content := normalizeSpace(contentRaw) + + // if meta description is too short, try to extract from the body + if len(content) < 100 { + selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"} + for _, selector := range selectors { + if contentText := extractCleanText(doc, selector); len(contentText) > len(content) { + content = contentText + break + } + } + } + + if len(content) > 5000 { + content = content[:5000] + } + + if len(content) < 50 { + return "" + } + + return content +} + +func extractCleanText(doc *goquery.Document, selector string) string { + element := doc.Find(selector).First() + if element.Length() == 0 { + return "" + } + element.Find("script, style, nav, header, footer, aside").Remove() + text := element.Text() + text = normalizeSpace(text) + if len(text) > 5000 { + text = text[:5000] + } + return text +} + +func getStringFromJSONLD(data map[string]interface{}, fields []string) string { + for _, field := range fields { + if val, ok := data[field].(string); ok && val != "" { + return val + } + } + return "" +} diff --git a/justfile b/justfile new file mode 100644 index 0000000..2e0f285 --- /dev/null +++ b/justfile @@ -0,0 +1,20 @@ +# ScholFetch - URL to article metadata converter + +default: + @just --list + +# Build the binary +build: + go build -o scholfetch . + +# Run tests +test: + go test ./... + +# Format code +fmt: + go fmt ./... + +# Run linter (requires golangci-lint) +lint: + golangci-lint run @@ -0,0 +1,131 @@ +// scholfetch - URL to article converter for scholscan +// takes URLs on stdin, outputs Article structs on stdout (JSONL) +// logs everything to scholfetch.log +package main + +import ( + "bufio" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "os" + "strings" +) + +type Article struct { + Title string `json:"title"` + Content string `json:"content,omitempty"` // Optional - expensive to fetch + URL string `json:"url"` + Route string `json:"-"` // Internal: tracks which handler succeeded +} + +type Result struct { + Urls []string + FailureIndices []int + ArticlesWritten int + Errors int +} + +func main() { + var withContent bool + var verbose bool + + fs := flag.NewFlagSet("scholfetch", flag.ExitOnError) + fs.Usage = func() { + fmt.Fprintf(fs.Output(), `Usage: scholfetch [options] < urls.txt > articles.jsonl + +Converts URLs to Article JSONL format for scholscan processing. 
+ +Default mode: Title-only extraction (fast) +Optional mode: Full content extraction with --with-content + +Input: Text file with one URL per line (stdin) +Output: Article JSONL (stdout) + +Options: +`) + fs.PrintDefaults() + fmt.Fprint(fs.Output(), ` +Examples: + # Title-only mode (default) + cat urls.txt | scholfetch > articles.jsonl + + # With full content + cat urls.txt | scholfetch --with-content > articles.jsonl + +Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits. +`) + } + + fs.BoolVar(&withContent, "with-content", false, "Fetch full article content (slower)") + fs.BoolVar(&verbose, "verbose", false, "Show progress information") + + // validate args and exit early on err + if err := fs.Parse(os.Args[1:]); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if fs.NArg() > 0 { + fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", fs.Args()) + os.Exit(1) + } + + // set up logger + var logger *log.Logger + if verbose { + logger = log.New(os.Stderr, "", log.LstdFlags) + } else { + logger = log.New(io.Discard, "", 0) + } + + // config controls how URLs are handled and what data is extracted + config := NewConfigWithLogger(logger) + config.WithContent = withContent + config.Verbose = verbose + + urls := readURLs(os.Stdin) + + // notify user about S2 key found/not + if config.S2APIKey != "" { + fmt.Fprintln(os.Stderr, "Semantic Scholar API key detected: using authenticated rate limits.") + } else { + fmt.Fprintln(os.Stderr, "Semantic Scholar API key not set: using public rate limits.") + } + + // log file for processing info, sep from stderr to keep output clean + logFile, err := os.Create("scholfetch.log") + if err != nil { + fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err) + os.Exit(1) + } + defer logFile.Close() + + fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), withContent) + fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log") + + encoder := json.NewEncoder(os.Stdout) + + // DO THE ACTUAL WORK + result := ProcessURLsWithConfig(urls, config, encoder, logFile) + + // report final stats to stderr + fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors) + fmt.Fprintln(os.Stderr, "See scholfetch.log for details") +} + +// readURLs parses stdin into a URL slice +// filters out empty lines and comments (#) +func readURLs(r io.Reader) []string { + var urls []string + scanner := bufio.NewScanner(r) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" && !strings.HasPrefix(line, "#") { + urls = append(urls, line) + } + } + return urls +} diff --git a/processor.go b/processor.go new file mode 100644 index 0000000..1079e3d --- /dev/null +++ b/processor.go @@ -0,0 +1,295 @@ +// PROCESSING PIPELINE +// +// Handles batch processing of URLs with rate limiting and fallback strategies. 
+// +// DESIGN: +// - fixed chunk size (50) to balance API efficiency and error recovery +// - batching for arxiv/s2 APIs, individual fallback on batch failure +// - sep handlers for each route type (arxiv, s2, rawhtml) +// - JSONL logging of every attempt (success/failure) with timestamps +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "os" + "time" +) + +type ProcessResult struct { + ArticlesWritten int + Errors int +} + +type URLLogEntry struct { + Time string `json:"time"` + URL string `json:"url"` + Success int `json:"success"` + API string `json:"api"` + Error string `json:"error,omitempty"` +} + +func logArticleAttempt(logEncoder *json.Encoder, url, api string, err error) error { + success := 0 + errMsg := "" + if err == nil { + success = 1 + } else { + errMsg = err.Error() + } + return logEncoder.Encode(URLLogEntry{ + Time: time.Now().Format(time.RFC3339), + URL: url, + Success: success, + API: api, + Error: errMsg, + }) +} + +func logEncodingFailure(logEncoder *json.Encoder, url string, err error) error { + return logEncoder.Encode(URLLogEntry{ + Time: time.Now().Format(time.RFC3339), + URL: url, + Success: 0, + API: "", + Error: fmt.Sprintf("encoding error: %v", err), + }) +} + +// ProcessURLsWithConfig orchestrates the entire processing pipeline +// chunks URLs to balance API efficiency with error recovery +func ProcessURLsWithConfig(urls []string, config *Config, encoder *json.Encoder, logFile io.Writer) ProcessResult { + result := ProcessResult{} + ctx := context.Background() + logEncoder := json.NewEncoder(logFile) + + chunkSize := 50 + + processedCount := 0 + + // process URLs in chunks + for i := 0; i < len(urls); i += chunkSize { + end := i + chunkSize + if end > len(urls) { + end = len(urls) + } + + chunk := urls[i:end] + + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing chunk %d-%d of %d URLs", i+1, end, len(urls)) + } + + // do the work + chunkResult := processChunk(ctx, chunk, config, encoder, logEncoder) + + result.ArticlesWritten += chunkResult.ArticlesWritten + result.Errors += chunkResult.Errors + processedCount += len(chunk) + + if config.Verbose && config.Logger != nil { + fmt.Fprintf(os.Stderr, "Processed %d articles...\n", processedCount) + } + } + + return result +} + +// processChunk handles routing, batching, and fallback for a given chunk of URLs. +func processChunk(ctx context.Context, urls []string, config *Config, encoder *json.Encoder, logEncoder *json.Encoder) ProcessResult { + result := ProcessResult{} + + // create temporary articles for routing and processing + articles := make([]*Article, len(urls)) + for i, url := range urls { + articles[i] = &Article{URL: url} + } + + // 1. toute all articles in the chunk + for _, article := range articles { + routeArticle(article) + } + + // 2. group by type for batching + arxivURLs := []string{} + s2URLs := []string{} + htmlURLs := []string{} + + for _, article := range articles { + switch article.Route { + case "arxiv": + arxivURLs = append(arxivURLs, article.URL) + case "s2": + s2URLs = append(s2URLs, article.URL) + default: + htmlURLs = append(htmlURLs, article.URL) + } + } + + // 3. 
process each type (lim to chunk size) + if len(arxivURLs) > 0 { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing %d arXiv URLs in chunk", len(arxivURLs)) + } + n, err := processArxiv(ctx, arxivURLs, encoder, config, logEncoder) + result.ArticlesWritten += n + if err != nil { + result.Errors++ + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error processing arXiv URLs: %v", err) + } + } + } + + if len(s2URLs) > 0 { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing %d Semantic Scholar URLs in chunk", len(s2URLs)) + } + n, err := processSemanticScholar(ctx, s2URLs, encoder, config, logEncoder) + result.ArticlesWritten += n + if err != nil { + result.Errors++ + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error processing S2 URLs: %v", err) + } + } + } + + if len(htmlURLs) > 0 { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing %d raw HTML URLs in chunk", len(htmlURLs)) + } + n, err := processHTML(ctx, htmlURLs, encoder, config, logEncoder) + result.ArticlesWritten += n + if err != nil { + result.Errors++ + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error processing HTML URLs: %v", err) + } + } + } + + return result +} + +func processArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + articles, err := fetchArxivBatch(ctx, config, urls) + if err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("arXiv batch failed: %v, falling back to individual processing", err) + } + return processIndividualArxiv(ctx, urls, encoder, config, logEncoder) + } + + written := 0 + for _, article := range articles { + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, article.URL, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, article.URL, "arxiv", nil) + } + } + return written, nil +} + +func processSemanticScholar(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + articles, err := fetchSemanticScholarBatch(ctx, config, urls) + if err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("S2 batch failed: %v, falling back to individual processing", err) + } + return processIndividualS2(ctx, urls, encoder, config, logEncoder) + } + + written := 0 + for _, article := range articles { + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, article.URL, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, article.URL, "s2", nil) + } + } + return written, nil +} + +func processHTML(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + written := 0 + for _, url := range urls { + article, err := fetchRawHTML(ctx, config, url) + if err != nil { + _ = logArticleAttempt(logEncoder, url, "", err) + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error fetching HTML %s: %v", url, err) + } + continue + } + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = 
logEncodingFailure(logEncoder, url, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, url, "html", nil) + } + } + return written, nil +} + +func processIndividualArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + written := 0 + for _, url := range urls { + article, err := fetchArxiv(ctx, config, url) + if err != nil { + _ = logArticleAttempt(logEncoder, url, "", err) + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error fetching arXiv %s: %v", url, err) + } + continue + } + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, url, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, url, "arxiv", nil) + } + } + return written, nil +} + +func processIndividualS2(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + written := 0 + for _, url := range urls { + article, err := fetchSemanticScholar(ctx, config, url) + if err != nil { + _ = logArticleAttempt(logEncoder, url, "", err) + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error fetching S2 %s: %v", url, err) + } + continue + } + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, url, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, url, "s2", nil) + } + } + return written, nil +} diff --git a/routes.go b/routes.go new file mode 100644 index 0000000..39bb7a9 --- /dev/null +++ b/routes.go @@ -0,0 +1,75 @@ +// ROUTING STRATEGY +// +// Routes URLs to the appropriate extraction handler. The order matters: +// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API +// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata +// 3. rawhtml - fallback for direct publisher URLs, generic extraction +package main + +import ( + "fmt" + "net/url" + "regexp" + "strings" +) + +var ( + // regex to extract arXiv identifier from various arXiv URLs. + // supports new (2109.05857) and old (math-ph/0301015) formats. + arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+/[0-9]{7}|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`) + + // regex to find a DOI in a string. + doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`) +) + +// route determines the primary enrichment strategy for a URL. +// returns the route str: "arxiv", "s2", or "rawhtml". +func Route(urlStr string) string { + parsedURL, err := url.Parse(urlStr) + if err != nil { + return "rawhtml" // fallback if URL is unparseable + } + + hostname := parsedURL.Hostname() + + // 1. arXiv.org or arXiv ID pattern in URL + if hostname == "arxiv.org" || strings.Contains(urlStr, "arxiv.org/abs/") { + if _, err := getArxivIdentifier(urlStr); err == nil { + return "arxiv" + } + } + + // 2. direct DOI link from doi.org + if hostname == "doi.org" { + return "s2" + } + + // 3. DOI present in URL path (e.g. some publisher sites) + if doi := getDOI(urlStr); doi != "" { + return "s2" + } + + // 4. fallback to rawhtml + return "rawhtml" +} + +// routeArticle determines the route for an article and sets the Route field. 
+func routeArticle(article *Article) { + article.Route = Route(article.URL) +} + +func getArxivIdentifier(articleURL string) (string, error) { + matches := arxivIdentifierRegex.FindStringSubmatch(articleURL) + if len(matches) > 1 { + return matches[1], nil + } + return "", fmt.Errorf("no arXiv identifier found") +} + +func getDOI(text string) string { + matches := doiRegex.FindStringSubmatch(text) + if len(matches) > 1 { + return matches[1] + } + return "" +} diff --git a/scholar.go b/scholar.go new file mode 100644 index 0000000..ad1e5e0 --- /dev/null +++ b/scholar.go @@ -0,0 +1,217 @@ +// SEMANTIC SCHOLAR HANDLER +// +// Uses S2's Graph API to fetch paper metadata via DOI. +// +// STRATEGY: +// - requires valid DOI in URL or DOI.org redirect +// - batch API for efficiency (up to 500 papers per request) +// - positional matching: response[i] maps to URLs[i] +// - rate limited to 100ms per request (configurable with API key) +// +// AUTH: +// - S2_API_KEY environment variable increases rate limits +// - Without key: public limits apply +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" +) + +const ( + semScholarPaperDOIFmtTitle = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title" + semScholarPaperDOIFmtFull = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title,abstract" + semScholarBatchURLTitle = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title" + semScholarBatchURLFull = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title,abstract" +) + +// escapeDOI URL-encodes a DOI for safe use in API endpoints. +// DOIs contain forward slashes which must be escaped for the URL path. +// Example: "10.1234/abcd5678" -> "10.1234/abcd5678" (already safe in this case) +func escapeDOI(doi string) string { + parts := strings.SplitN(doi, "/", 2) + if len(parts) != 2 { + return url.PathEscape(doi) + } + return url.PathEscape(parts[0]) + "/" + url.PathEscape(parts[1]) +} + +// S2BatchResponseItem represents a Semantic Scholar batch API response item +type S2BatchResponseItem struct { + PaperID string `json:"paperId"` + Title string `json:"title"` + Abstract string `json:"abstract"` +} + +// fetchSemanticScholar fetches content for a single DOI via Semantic Scholar. 
+func fetchSemanticScholar(ctx context.Context, config *Config, urlStr string) (*Article, error) { + doi := getDOI(urlStr) + if doi == "" { + return nil, fmt.Errorf("Semantic Scholar: URL doesn't contain valid DOI: %s", urlStr) + } + + // rate limit + if err := config.HTTP.RateLimitS2(ctx); err != nil { + return nil, err + } + + escapedDOI := escapeDOI(doi) + + // choose the appropriate URL based on whether we need content + var apiURL string + if config.WithContent { + apiURL = fmt.Sprintf(semScholarPaperDOIFmtFull, escapedDOI) + } else { + apiURL = fmt.Sprintf(semScholarPaperDOIFmtTitle, escapedDOI) + } + + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to construct Semantic Scholar request: %w", err) + } + req.Header.Set("Accept", "application/json") + if config.S2APIKey != "" { + req.Header.Set("x-api-key", config.S2APIKey) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to make request to Semantic Scholar API: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("Semantic Scholar API returned non-200 status for DOI %s: %s", doi, resp.Status) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read Semantic Scholar API response body: %w", err) + } + + var s2 struct { + Title string `json:"title"` + Abstract string `json:"abstract"` + } + if err := json.Unmarshal(body, &s2); err != nil { + return nil, fmt.Errorf("failed to unmarshal Semantic Scholar JSON for DOI %s: %w", doi, err) + } + + title := normalizeSpace(s2.Title) + content := normalizeSpace(s2.Abstract) + + // del content if not requested + if !config.WithContent { + content = "" + } + + if title == "" { + return nil, fmt.Errorf("no title found for DOI %s", doi) + } + + return &Article{ + Title: title, + Content: content, + URL: urlStr, + }, nil +} + +// fetchSemanticScholarBatch fetches a batch of papers from the S2 API. 
+func fetchSemanticScholarBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { + if len(urls) == 0 { + return nil, nil + } + + // rate limit + if err := config.HTTP.RateLimitS2(ctx); err != nil { + return nil, err + } + + // extract DOIs from URLs, maintaining order for pos matching + validURLs := make([]string, 0, len(urls)) + s2IDs := make([]string, 0, len(urls)) + + for _, urlStr := range urls { + doi := getDOI(urlStr) + if doi != "" { + validURLs = append(validURLs, urlStr) + s2IDs = append(s2IDs, "DOI:"+doi) + } + } + + if len(s2IDs) == 0 { + return nil, nil + } + + requestBody, err := json.Marshal(map[string][]string{"ids": s2IDs}) + if err != nil { + return nil, fmt.Errorf("failed to marshal S2 batch request body: %w", err) + } + + // choose the appropriate URL based on whether we need content + var batchURL string + if config.WithContent { + batchURL = semScholarBatchURLFull + } else { + batchURL = semScholarBatchURLTitle + } + + req, err := http.NewRequestWithContext(ctx, "POST", batchURL, bytes.NewReader(requestBody)) + if err != nil { + return nil, fmt.Errorf("failed to create S2 batch request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + if config.S2APIKey != "" { + req.Header.Set("x-api-key", config.S2APIKey) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("S2 batch request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("S2 batch API returned non-200 status: %s", resp.Status) + } + + var responseItems []*S2BatchResponseItem + if err := json.NewDecoder(resp.Body).Decode(&responseItems); err != nil { + return nil, fmt.Errorf("failed to decode S2 batch response: %w", err) + } + + var articles []*Article + // match responses positionally to input URLs + for i, item := range responseItems { + if i >= len(validURLs) { + break + } + if item == nil { + continue + } + + title := normalizeSpace(item.Title) + if title != "" { + content := normalizeSpace(item.Abstract) + + // skip content if not requested + if !config.WithContent { + content = "" + } + + articles = append(articles, &Article{ + Title: title, + Content: content, + URL: validURLs[i], + }) + } + } + + return articles, nil +} diff --git a/scholfetch_test.go b/scholfetch_test.go new file mode 100644 index 0000000..59adae7 --- /dev/null +++ b/scholfetch_test.go @@ -0,0 +1,193 @@ +package main + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +type TestLogger struct { + messages []string +} + +func (l *TestLogger) Printf(format string, v ...interface{}) { + l.messages = append(l.messages, fmt.Sprintf(format, v...)) +} + +func TestHTTPClientDefaults(t *testing.T) { + client := NewHTTPClient() + + if client.userAgent != "scholfetch/1.0 (+https://samsci.com)" { + t.Errorf("Expected default user agent, got %s", client.userAgent) + } + + if client.arxivDelay != 1*time.Second { + t.Errorf("Expected arxiv delay of 1s, got %v", client.arxivDelay) + } + + if client.maxRetries != 3 { + t.Errorf("Expected max retries of 3, got %d", client.maxRetries) + } +} + +func TestRateLimiting(t *testing.T) { + client := NewHTTPClient() + client.arxivDelay = 10 * time.Millisecond // Speed up test + client.s2Delay = 5 * time.Millisecond + + // Test arxiv rate limiting + start := time.Now() + err := client.RateLimitArxiv(context.Background()) + if err != nil { + t.Fatalf("RateLimitArxiv failed: %v", err) + } + duration := time.Since(start) + if duration < 
10*time.Millisecond { + t.Errorf("Expected arxiv delay of ~10ms, got %v", duration) + } + + // Test S2 rate limiting + start = time.Now() + err = client.RateLimitS2(context.Background()) + if err != nil { + t.Fatalf("RateLimitS2 failed: %v", err) + } + duration = time.Since(start) + if duration < 5*time.Millisecond { + t.Errorf("Expected S2 delay of ~5ms, got %v", duration) + } +} + +func TestHTTPIPRequest(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("test response")) + })) + defer server.Close() + + client := NewHTTPClient() + req, _ := http.NewRequest("GET", server.URL, nil) + + resp, err := client.Do(req) + if err != nil { + t.Fatalf("Request failed: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + t.Errorf("Expected status 200, got %d", resp.StatusCode) + } +} + +func TestURLRouting(t *testing.T) { + tests := map[string]string{ + "https://arxiv.org/abs/2301.00001": "arxiv", + "https://arxiv.org/pdf/2301.00001.pdf": "arxiv", + "http://arxiv.org/abs/2301.00001v2": "arxiv", + "https://api.semanticscholar.org/DOI:10.1234": "rawhtml", + "https://doi.org/10.1234/abcd5678": "s2", + "https://example.com/paper": "rawhtml", + "https://pubmed.ncbi.nlm.nih.gov/12345678/": "rawhtml", + } + + for url, expected := range tests { + result := Route(url) + if result != expected { + t.Errorf("Route(%s) = %s, expected %s", url, result, expected) + } + } +} + +func TestConfigDefaults(t *testing.T) { + config := NewConfig() + + if config.WithContent != false { + t.Error("Expected WithContent=false by default") + } + + if config.Verbose != false { + t.Error("Expected Verbose=false by default") + } + + if config.ArxivBatch != 50 { + t.Errorf("Expected ArxivBatch=50, got %d", config.ArxivBatch) + } + + if config.HTTP == nil { + t.Error("Expected HTTP client to be initialized") + } +} + +func TestConfigWithLogger(t *testing.T) { + logger := &TestLogger{} + config := NewConfigWithLogger(logger) + + if config.Logger != logger { + t.Error("Logger not set correctly") + } +} + +func TestArxivURLParsing(t *testing.T) { + tests := map[string]string{ + "https://arxiv.org/abs/2301.00001": "2301.00001", + "http://arxiv.org/abs/2301.00001v2": "2301.00001v2", + "https://arxiv.org/pdf/2301.00001.pdf": "2301.00001", + "https://example.com/not-arxiv": "", + } + + for url, expected := range tests { + result, _ := getArxivIdentifier(url) + if result != expected { + t.Errorf("getArxivIdentifier(%s) = %s, expected %s", url, result, expected) + } + } +} + +func TestDOIParsing(t *testing.T) { + tests := map[string]string{ + "https://doi.org/10.1234/abcd5678": "10.1234/abcd5678", + "https://api.semanticscholar.org/DOI:10.1234": "", + "https://example.com/no-doi": "", + } + + for url, expected := range tests { + result := getDOI(url) + if result == expected { + t.Logf("✓ getDOI(%s) = %s", url, result) + } else { + t.Errorf("getDOI(%s) = %s, expected %s", url, result, expected) + } + } +} + +func TestBatchURLRouting(t *testing.T) { + urls := []string{ + "https://arxiv.org/abs/2301.00001", + "https://doi.org/10.1234/test1", + "https://example.com/paper1", + "https://arxiv.org/pdf/2301.00002.pdf", + "https://doi.org/10.5678/test2", + } + + routeCounts := make(map[string]int) + for _, url := range urls { + route := Route(url) + routeCounts[route]++ + } + + expected := map[string]int{ + "arxiv": 2, + "s2": 2, + "rawhtml": 1, + } + + for route, expectedCount := range expected { + if 
routeCounts[route] != expectedCount { + t.Errorf("Expected %d URLs for route %s, got %d", + expectedCount, route, routeCounts[route]) + } + } +}
\ No newline at end of file
@@ -0,0 +1,14 @@
+package main
+
+import (
+	"regexp"
+	"strings"
+)
+
+func normalizeSpace(s string) string {
+	return strings.Join(strings.Fields(s), " ")
+}
+
+func stripArxivVersion(id string) string {
+	return regexp.MustCompile(`v\d+$`).ReplaceAllString(id, "")
+}
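The README above describes the output contract only in prose: one JSON object per line with `title`, an optional `content`, and `url`, matching the `Article` struct in `main.go`. As an editorial illustration (not part of this commit), here is a minimal sketch of a downstream consumer written against that contract; the file name `consume.go` is hypothetical.

```go
// consume.go - hypothetical downstream consumer for scholfetch output.
// Reads Article JSONL from stdin and prints a title/URL summary.
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
)

// article mirrors the JSON shape written by scholfetch's Article struct.
type article struct {
	Title   string `json:"title"`
	Content string `json:"content,omitempty"`
	URL     string `json:"url"`
}

func main() {
	scanner := bufio.NewScanner(os.Stdin)
	// Allow long lines: abstracts fetched with --with-content can exceed
	// the default 64 KiB scanner buffer.
	scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024)

	count := 0
	for scanner.Scan() {
		var a article
		if err := json.Unmarshal(scanner.Bytes(), &a); err != nil {
			fmt.Fprintf(os.Stderr, "skipping malformed line: %v\n", err)
			continue
		}
		count++
		fmt.Printf("%s\t%s\n", a.Title, a.URL)
	}
	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "read error: %v\n", err)
		os.Exit(1)
	}
	fmt.Fprintf(os.Stderr, "read %d articles\n", count)
}
```

Under those assumptions it would be run as `cat urls.txt | scholfetch | go run consume.go`.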
