author    Sam Scholten  2025-12-15 19:35:46 +1000
committer Sam Scholten  2025-12-15 19:35:57 +1000
commit    3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 (patch)
tree      42b1f0e0a346a1cf087df90e29a100edbd66b3eb
Init v0.1.0 (HEAD, main)
-rw-r--r--  .gitignore           10
-rw-r--r--  README.md            83
-rw-r--r--  arxiv.go            196
-rw-r--r--  client.go           133
-rw-r--r--  go.mod               18
-rw-r--r--  go.sum               96
-rw-r--r--  html.go             198
-rw-r--r--  justfile             20
-rw-r--r--  main.go             131
-rw-r--r--  processor.go        295
-rw-r--r--  routes.go            75
-rw-r--r--  scholar.go          217
-rw-r--r--  scholfetch_test.go  193
-rw-r--r--  util.go              14
14 files changed, 1679 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4a79922
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# Built binaries
+scholfetch
+
+# Test files
+test_urls.txt
+
+# Environment and configuration files
+.env*
+config.*
+secrets.*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9190dc1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,83 @@
+# ScholFetch
+
+URL → Article metadata (JSONL) converter. Fetches titles only by default, for speed.
+
+## Overview
+
+ScholFetch extracts academic article metadata from URLs.
+It supports arXiv, Semantic Scholar, and generic HTML sources.
+The tool outputs structured JSONL suitable for downstream processing by ScholScan (see below).
+
+## Usage
+```bash
+cat urls.txt | scholfetch > articles.jsonl
+# or:
+cat urls.txt | scholfetch --with-content > articles.jsonl
+```
+
+## Monitoring Progress
+
+ScholFetch writes a structured log file `scholfetch.log` during processing. Monitor it in another terminal:
+
+```bash
+tail -f scholfetch.log
+```
+
+## Semantic Scholar API key
+
+Get higher rate limits by setting your S2 API key (*not required*):
+
+```bash
+export S2_API_KEY="your-key-here"
+cat urls.txt | scholfetch > articles.jsonl
+```
+
+Get your free key at: https://www.semanticscholar.org/product/api
+
+ScholFetch reports on startup whether the key was detected.
+
+## Integration with ScholScan
+
+Once you have structured article data, pipe it to [ScholScan](https://git.samsci.com/scholscan) for ML-based filtering:
+
+```bash
+# Get articles from URLs
+cat urls.txt | scholfetch > articles.jsonl
+
+# Train a classification model
+scholscan train articles.jsonl --rss-feeds feeds.txt > model.json
+
+# Score articles from an RSS feed
+scholscan scan --model model.json --url "https://example.com/feed.rss" > results.jsonl
+```
+
+ScholFetch extracts and enriches article metadata, while ScholScan handles classification. Together they provide a complete pipeline for filtering academic literature.
+
+## Input/Output
+- Input: URLs (one per line) on stdin
+- Output: JSONL with `title` and `url` fields (stdout); see the example below
+- Add `--with-content` to include a `content` field
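+
+For example, a title-only output line might look like this (illustrative, not real output):
+
+```json
+{"title":"Attention Is All You Need","url":"https://arxiv.org/abs/1706.03762"}
+```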
+
+## How it works
+
+URLs are routed by pattern: arXiv IDs → arXiv API, DOIs → Semantic Scholar, everything else → HTML scrape.
+Requests are batched in chunks of 50 for efficiency; if a batch fails, ScholFetch falls back to individual requests. Each API is rate limited independently.
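+
+For example, three sample inputs would route as follows (the DOI is illustrative):
+
+```text
+https://arxiv.org/abs/2109.05857  → arxiv   (arXiv API)
+https://doi.org/10.1234/abcd5678  → s2      (Semantic Scholar)
+https://example.com/paper         → rawhtml (HTML scrape)
+```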
+
+## Code
+
+- `main.go` - reads stdin, sets up flags/output
+- `routes.go` - determines which handler (arxiv/s2/html) for each URL
+- `processor.go` - batching, fallback logic
+- `arxiv.go`, `scholar.go`, `html.go` - the actual extractors
+- `client.go` - HTTP client with retries and rate limiting
+
+## Build and Development
+
+```bash
+just build
+just test
+```
+
+## Roadmap
+
+Future work could integrate Crossref and PubMed fairly easily, especially for the title-only approach.
diff --git a/arxiv.go b/arxiv.go
new file mode 100644
index 0000000..6e7fad5
--- /dev/null
+++ b/arxiv.go
@@ -0,0 +1,196 @@
+// ARXIV HANDLER
+//
+// Uses arXiv's API to fetch article metadata.
+//
+// STRATEGY:
+// - single requests and batched requests supported
+// - uses gofeed to parse Atom XML responses
+// - rate limited to 1 request per second (conservative)
+// - handles both old (math-ph/0301015) and new (2109.05857) ID formats
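+//
+// EXAMPLE (shape of a batched query URL, using the sample IDs above):
+//   http://export.arxiv.org/api/query?id_list=math-ph/0301015,2109.05857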
+package main
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+
+ "github.com/mmcdole/gofeed"
+)
+
+const (
+ arxivQueryFmt = "http://export.arxiv.org/api/query?id_list=%s"
+)
+
+// fetchArxiv fetches content for a single arXiv article.
+func fetchArxiv(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+ arxivID, err := getArxivIdentifier(urlStr)
+ if err != nil || arxivID == "" {
+ return nil, fmt.Errorf("arXiv: invalid URL format, expected arxiv.org/abs/ID: %s", urlStr)
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitArxiv(ctx); err != nil {
+ return nil, err
+ }
+
+ apiURL := fmt.Sprintf(arxivQueryFmt, arxivID)
+ req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to construct arXiv request: %w", err)
+ }
+ req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8")
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch arXiv feed for ID %s: %w", arxivID, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("arXiv API returned non-200 status for ID %s: %s", arxivID, resp.Status)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("failed to read arXiv response body for ID %s: %w", arxivID, err)
+ }
+
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(bytes.NewReader(body))
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse arXiv feed for ID %s: %w", arxivID, err)
+ }
+
+ if len(feed.Items) == 0 {
+ return nil, fmt.Errorf("no items found in arXiv feed for ID %s", arxivID)
+ }
+
+ item := feed.Items[0]
+ title := normalizeSpace(item.Title)
+ content := normalizeSpace(item.Description)
+
+ if config.Verbose {
+ config.Logger.Printf("arXiv single fetch result: ID=%s, Title=%s", arxivID, title)
+ }
+
+ // drop content if it was not requested
+ if !config.WithContent {
+ content = ""
+ }
+
+ if title == "" {
+ return nil, fmt.Errorf("no title found for arXiv ID %s", arxivID)
+ }
+
+ return &Article{
+ Title: title,
+ Content: content,
+ URL: urlStr,
+ }, nil
+}
+
+// fetchArxivBatch fetches metadata for a list of arXiv URLs in batches.
+func fetchArxivBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) {
+ if len(urls) == 0 {
+ return nil, nil
+ }
+
+ idToURL := make(map[string]string)
+ batchIDs := make([]string, 0, len(urls))
+
+ for _, urlStr := range urls {
+ id, err := getArxivIdentifier(urlStr)
+ if err != nil {
+ continue
+ }
+ batchIDs = append(batchIDs, id)
+ stripped := stripArxivVersion(id)
+ idToURL[stripped] = urlStr
+ }
+
+ if len(batchIDs) == 0 {
+ return nil, nil
+ }
+
+ var articles []*Article
+
+ for i := 0; i < len(batchIDs); i += config.ArxivBatch {
+ end := i + config.ArxivBatch
+ if end > len(batchIDs) {
+ end = len(batchIDs)
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitArxiv(ctx); err != nil {
+ return nil, err
+ }
+
+ chunk := batchIDs[i:end]
+ apiURL := fmt.Sprintf(arxivQueryFmt, strings.Join(chunk, ","))
+
+ req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
+ if err != nil {
+ continue
+ }
+ req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8")
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ continue
+ }
+
+ if resp.StatusCode != http.StatusOK {
+ resp.Body.Close()
+ continue
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ resp.Body.Close()
+ if err != nil {
+ continue
+ }
+
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(bytes.NewReader(body))
+ if err != nil {
+ continue
+ }
+
+ for _, item := range feed.Items {
+ id, err := getArxivIdentifier(item.GUID)
+ if err != nil || id == "" {
+ id, err = getArxivIdentifier(item.Link)
+ if err != nil || id == "" {
+ continue
+ }
+ }
+
+ title := normalizeSpace(item.Title)
+ if title == "" {
+ continue
+ }
+
+ baseID := stripArxivVersion(id)
+ originalURL, exists := idToURL[baseID]
+ if !exists {
+ continue
+ }
+
+ content := ""
+ if config.WithContent {
+ content = normalizeSpace(item.Description)
+ }
+
+ articles = append(articles, &Article{
+ Title: title,
+ Content: content,
+ URL: originalURL,
+ })
+ }
+ }
+
+ return articles, nil
+}
diff --git a/client.go b/client.go
new file mode 100644
index 0000000..39a3e34
--- /dev/null
+++ b/client.go
@@ -0,0 +1,133 @@
+// CLIENT LAYER - HTTP AND RATE LIMITING
+//
+// manages HTTP requests with retry logic and API-specific rate limits.
+//
+// RATE LIMITS:
+// - arXiv: 1 second between requests (enforced to be safe)
+// - Semantic Scholar: 100ms between requests (an API key raises the server-side limits)
+//
+// STRATEGY:
+// - retries on network failures and HTTP 429
+// - exponential backoff between retries (1s, then 2s, doubling)
+// - all delays respect context cancellation
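+//
+// EXAMPLE (default schedule when every attempt fails):
+//   attempt 1 → immediate, attempt 2 → after 1s, attempt 3 → after 2s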
+package main
+
+import (
+ "context"
+ "net/http"
+ "os"
+ "time"
+)
+
+// HTTPClient wraps an HTTP client with common behavior like user agent,
+// rate limiting, and retry logic.
+type HTTPClient struct {
+ client *http.Client
+ userAgent string
+ arxivDelay time.Duration
+ s2Delay time.Duration
+ maxRetries int
+}
+
+// NewHTTPClient creates a new HTTP client wrapper with defaults.
+func NewHTTPClient() *HTTPClient {
+ return &HTTPClient{
+ client: &http.Client{
+ Timeout: 30 * time.Second,
+ },
+ userAgent: "scholfetch/1.0 (+https://samsci.com)",
+ arxivDelay: 1 * time.Second,
+ s2Delay: 100 * time.Millisecond,
+ maxRetries: 3,
+ }
+}
+
+// Do performs an HTTP request with retry logic.
+// retries on network errors and 429 (rate limit) responses.
+func (c *HTTPClient) Do(req *http.Request) (*http.Response, error) {
+ // Set user agent if not already set
+ if req.Header.Get("User-Agent") == "" {
+ req.Header.Set("User-Agent", c.userAgent)
+ }
+
+ var lastErr error
+ for attempt := 0; attempt < c.maxRetries; attempt++ {
+ if attempt > 0 {
+ // Exponential backoff: 1s, 2s, ... (doubles with each retry)
+ backoff := time.Duration(1<<uint(attempt-1)) * time.Second
+ select {
+ case <-time.After(backoff):
+ case <-req.Context().Done():
+ return nil, req.Context().Err()
+ }
+ }
+
+ resp, err := c.client.Do(req)
+ if err != nil {
+ lastErr = err
+ continue
+ }
+
+ // Retry on 429 (rate limit) but not on other status codes.
+ // Record an error so exhausting all retries never returns (nil, nil).
+ if resp.StatusCode == http.StatusTooManyRequests {
+ resp.Body.Close()
+ lastErr = fmt.Errorf("rate limited (HTTP 429) on attempt %d", attempt+1)
+ continue
+ }
+
+ return resp, nil
+ }
+
+ return nil, lastErr
+}
+
+// RateLimitArxiv adds a delay for arXiv API requests.
+func (c *HTTPClient) RateLimitArxiv(ctx context.Context) error {
+ select {
+ case <-time.After(c.arxivDelay):
+ return nil
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+}
+
+// RateLimitS2 adds a delay for Semantic Scholar API requests.
+func (c *HTTPClient) RateLimitS2(ctx context.Context) error {
+ select {
+ case <-time.After(c.s2Delay):
+ return nil
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+}
+
+// config for scholfetch.
+type Config struct {
+ WithContent bool
+ Verbose bool
+ Logger Logger
+ HTTP *HTTPClient
+ ArxivBatch int
+ S2APIKey string
+}
+
+// Logger interface for dependency injection
+type Logger interface {
+ Printf(format string, v ...interface{})
+}
+
+func NewConfig() *Config {
+ return &Config{
+ WithContent: false,
+ Verbose: false,
+ HTTP: NewHTTPClient(),
+ ArxivBatch: 50,
+ S2APIKey: os.Getenv("S2_API_KEY"),
+ }
+}
+
+func NewConfigWithLogger(logger Logger) *Config {
+ cfg := NewConfig()
+ cfg.Logger = logger
+ return cfg
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..de16fa5
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,18 @@
+module scholfetch
+
+go 1.25.5
+
+require (
+ github.com/PuerkitoBio/goquery v1.11.0
+ github.com/mmcdole/gofeed v1.3.0
+)
+
+require (
+ github.com/andybalholm/cascadia v1.3.3 // indirect
+ github.com/json-iterator/go v1.1.12 // indirect
+ github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
+ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+ github.com/modern-go/reflect2 v1.0.2 // indirect
+ golang.org/x/net v0.47.0 // indirect
+ golang.org/x/text v0.31.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..5933b8c
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,96 @@
+github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
+github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
+github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
+github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
+github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
+golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
+golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
+golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
+golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
+golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
+golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
+golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
+golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
+golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
+golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
+golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
+golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/html.go b/html.go
new file mode 100644
index 0000000..0995865
--- /dev/null
+++ b/html.go
@@ -0,0 +1,198 @@
+// RAW HTML HANDLER
+//
+// Fallback handler for URLs that don't match arXiv or Semantic Scholar patterns.
+//
+// STRATEGY:
+// - progressive extraction tries multiple metadata sources in order
+// - JSON-LD structured data first (highest quality)
+// - citation meta tags (scholarly articles)
+// - Open Graph tags (social media)
+// - basic HTML tags (last resort)
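+//
+// EXAMPLE (a JSON-LD block this handler can read; values are illustrative):
+//   <script type="application/ld+json">
+//     {"headline": "Some Paper Title", "abstract": "One-paragraph summary ..."}
+//   </script>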
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "net/http"
+
+ "github.com/PuerkitoBio/goquery"
+)
+
+// fetchRawHTML attempts to fetch article content by parsing HTML metadata.
+func fetchRawHTML(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+ req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to build request for %s: %w", urlStr, err)
+ }
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch %s: %w", urlStr, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d when fetching %s", resp.StatusCode, urlStr)
+ }
+
+ doc, err := goquery.NewDocumentFromReader(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse HTML from %s: %w", urlStr, err)
+ }
+
+ // extract title using various strategies
+ strategies := []func(*goquery.Document) string{
+ extractTitleFromJSONLD,
+ extractTitleFromCitationMeta,
+ extractTitleFromOpenGraph,
+ func(d *goquery.Document) string { return normalizeSpace(d.Find("title").First().Text()) },
+ }
+
+ var title string
+ for _, strategy := range strategies {
+ title = strategy(doc)
+ if title != "" {
+ break
+ }
+ }
+
+ if title == "" {
+ return nil, fmt.Errorf("no title found for %s", urlStr)
+ }
+
+ article := &Article{
+ URL: urlStr,
+ Title: title,
+ }
+
+ // only fetch content if requested
+ if config.WithContent {
+ contentStrategies := []func(*goquery.Document) string{
+ extractContentFromJSONLD,
+ extractContentFromCitationMeta,
+ extractContentFromOpenGraph,
+ extractContentFromBasicMeta,
+ }
+
+ var content string
+ for _, strategy := range contentStrategies {
+ content = strategy(doc)
+ if content != "" && len(content) > 50 {
+ break
+ }
+ }
+
+ if content != "" {
+ article.Content = content
+ }
+ }
+
+ return article, nil
+}
+
+func extractTitleFromJSONLD(doc *goquery.Document) string {
+ var title string
+ doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
+ if title != "" {
+ return
+ }
+ var data map[string]interface{}
+ if json.Unmarshal([]byte(s.Text()), &data) == nil {
+ t := getStringFromJSONLD(data, []string{"name", "headline", "title"})
+ if t != "" {
+ title = normalizeSpace(t)
+ }
+ }
+ })
+ return title
+}
+
+func extractContentFromJSONLD(doc *goquery.Document) string {
+ var content string
+ doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
+ if content != "" {
+ return
+ }
+ var data map[string]interface{}
+ if json.Unmarshal([]byte(s.Text()), &data) == nil {
+ d := getStringFromJSONLD(data, []string{"description", "abstract", "summary"})
+ if d != "" {
+ content = normalizeSpace(d)
+ if len(content) > 5000 {
+ content = content[:5000] + "..."
+ }
+ }
+ }
+ })
+ return content
+}
+
+func extractTitleFromCitationMeta(doc *goquery.Document) string {
+ title, _ := doc.Find("meta[name='citation_title']").Attr("content")
+ return normalizeSpace(title)
+}
+
+func extractContentFromCitationMeta(doc *goquery.Document) string {
+ content, _ := doc.Find("meta[name='citation_abstract']").Attr("content")
+ return normalizeSpace(content)
+}
+
+func extractTitleFromOpenGraph(doc *goquery.Document) string {
+ title, _ := doc.Find("meta[property='og:title']").Attr("content")
+ return normalizeSpace(title)
+}
+
+func extractContentFromOpenGraph(doc *goquery.Document) string {
+ content, _ := doc.Find("meta[property='og:description']").Attr("content")
+ return normalizeSpace(content)
+}
+
+func extractContentFromBasicMeta(doc *goquery.Document) string {
+ contentRaw, _ := doc.Find("meta[name='description']").Attr("content")
+ content := normalizeSpace(contentRaw)
+
+ // if meta description is too short, try to extract from the body
+ if len(content) < 100 {
+ selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"}
+ for _, selector := range selectors {
+ if contentText := extractCleanText(doc, selector); len(contentText) > len(content) {
+ content = contentText
+ break
+ }
+ }
+ }
+
+ if len(content) > 5000 {
+ content = content[:5000]
+ }
+
+ if len(content) < 50 {
+ return ""
+ }
+
+ return content
+}
+
+func extractCleanText(doc *goquery.Document, selector string) string {
+ element := doc.Find(selector).First()
+ if element.Length() == 0 {
+ return ""
+ }
+ element.Find("script, style, nav, header, footer, aside").Remove()
+ text := element.Text()
+ text = normalizeSpace(text)
+ if len(text) > 5000 {
+ text = text[:5000]
+ }
+ return text
+}
+
+func getStringFromJSONLD(data map[string]interface{}, fields []string) string {
+ for _, field := range fields {
+ if val, ok := data[field].(string); ok && val != "" {
+ return val
+ }
+ }
+ return ""
+}
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..2e0f285
--- /dev/null
+++ b/justfile
@@ -0,0 +1,20 @@
+# ScholFetch - URL to article metadata converter
+
+default:
+ @just --list
+
+# Build the binary
+build:
+ go build -o scholfetch .
+
+# Run tests
+test:
+ go test ./...
+
+# Format code
+fmt:
+ go fmt ./...
+
+# Run linter (requires golangci-lint)
+lint:
+ golangci-lint run
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1c286ea
--- /dev/null
+++ b/main.go
@@ -0,0 +1,131 @@
+// scholfetch - URL to article converter for scholscan
+// takes URLs on stdin, outputs Article structs on stdout (JSONL)
+// logs everything to scholfetch.log
+package main
+
+import (
+ "bufio"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strings"
+)
+
+type Article struct {
+ Title string `json:"title"`
+ Content string `json:"content,omitempty"` // Optional - expensive to fetch
+ URL string `json:"url"`
+ Route string `json:"-"` // Internal: which handler this URL is routed to
+}
+
+func main() {
+ var withContent bool
+ var verbose bool
+
+ fs := flag.NewFlagSet("scholfetch", flag.ExitOnError)
+ fs.Usage = func() {
+ fmt.Fprintf(fs.Output(), `Usage: scholfetch [options] < urls.txt > articles.jsonl
+
+Converts URLs to Article JSONL format for scholscan processing.
+
+Default mode: Title-only extraction (fast)
+Optional mode: Full content extraction with --with-content
+
+Input: Text file with one URL per line (stdin)
+Output: Article JSONL (stdout)
+
+Options:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Examples:
+ # Title-only mode (default)
+ cat urls.txt | scholfetch > articles.jsonl
+
+ # With full content
+ cat urls.txt | scholfetch --with-content > articles.jsonl
+
+Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits.
+`)
+ }
+
+ fs.BoolVar(&withContent, "with-content", false, "Fetch full article content (slower)")
+ fs.BoolVar(&verbose, "verbose", false, "Show progress information")
+
+ // validate args and exit early on err
+ if err := fs.Parse(os.Args[1:]); err != nil {
+ fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+ os.Exit(1)
+ }
+
+ if fs.NArg() > 0 {
+ fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", fs.Args())
+ os.Exit(1)
+ }
+
+ // set up logger
+ var logger *log.Logger
+ if verbose {
+ logger = log.New(os.Stderr, "", log.LstdFlags)
+ } else {
+ logger = log.New(io.Discard, "", 0)
+ }
+
+ // config controls how URLs are handled and what data is extracted
+ config := NewConfigWithLogger(logger)
+ config.WithContent = withContent
+ config.Verbose = verbose
+
+ urls := readURLs(os.Stdin)
+
+ // notify user about S2 key found/not
+ if config.S2APIKey != "" {
+ fmt.Fprintln(os.Stderr, "Semantic Scholar API key detected: using authenticated rate limits.")
+ } else {
+ fmt.Fprintln(os.Stderr, "Semantic Scholar API key not set: using public rate limits.")
+ }
+
+ // log file for per-URL processing info, kept separate from stderr to keep terminal output clean
+ logFile, err := os.Create("scholfetch.log")
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err)
+ os.Exit(1)
+ }
+ defer logFile.Close()
+
+ fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), withContent)
+ fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log")
+
+ encoder := json.NewEncoder(os.Stdout)
+
+ // DO THE ACTUAL WORK
+ result := ProcessURLsWithConfig(urls, config, encoder, logFile)
+
+ // report final stats to stderr
+ fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors)
+ fmt.Fprintln(os.Stderr, "See scholfetch.log for details")
+}
+
+// readURLs reads one URL per line from r,
+// skipping blank lines and comments (#).
+func readURLs(r io.Reader) []string {
+ var urls []string
+ scanner := bufio.NewScanner(r)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ urls = append(urls, line)
+ }
+ }
+ return urls
+}
diff --git a/processor.go b/processor.go
new file mode 100644
index 0000000..1079e3d
--- /dev/null
+++ b/processor.go
@@ -0,0 +1,295 @@
+// PROCESSING PIPELINE
+//
+// Handles batch processing of URLs with rate limiting and fallback strategies.
+//
+// DESIGN:
+// - fixed chunk size (50) to balance API efficiency and error recovery
+// - batching for arxiv/s2 APIs, individual fallback on batch failure
+// - separate handlers for each route type (arxiv, s2, rawhtml)
+// - JSONL logging of every attempt (success/failure) with timestamps
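+//
+// EXAMPLE (one scholfetch.log entry; values are illustrative):
+//   {"time":"2025-12-15T19:35:46+10:00","url":"https://arxiv.org/abs/2109.05857","success":1,"api":"arxiv"}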
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "os"
+ "time"
+)
+
+type ProcessResult struct {
+ ArticlesWritten int
+ Errors int
+}
+
+type URLLogEntry struct {
+ Time string `json:"time"`
+ URL string `json:"url"`
+ Success int `json:"success"`
+ API string `json:"api"`
+ Error string `json:"error,omitempty"`
+}
+
+func logArticleAttempt(logEncoder *json.Encoder, url, api string, err error) error {
+ success := 0
+ errMsg := ""
+ if err == nil {
+ success = 1
+ } else {
+ errMsg = err.Error()
+ }
+ return logEncoder.Encode(URLLogEntry{
+ Time: time.Now().Format(time.RFC3339),
+ URL: url,
+ Success: success,
+ API: api,
+ Error: errMsg,
+ })
+}
+
+func logEncodingFailure(logEncoder *json.Encoder, url string, err error) error {
+ return logEncoder.Encode(URLLogEntry{
+ Time: time.Now().Format(time.RFC3339),
+ URL: url,
+ Success: 0,
+ API: "",
+ Error: fmt.Sprintf("encoding error: %v", err),
+ })
+}
+
+// ProcessURLsWithConfig orchestrates the entire processing pipeline
+// chunks URLs to balance API efficiency with error recovery
+func ProcessURLsWithConfig(urls []string, config *Config, encoder *json.Encoder, logFile io.Writer) ProcessResult {
+ result := ProcessResult{}
+ ctx := context.Background()
+ logEncoder := json.NewEncoder(logFile)
+
+ chunkSize := 50
+
+ processedCount := 0
+
+ // process URLs in chunks
+ for i := 0; i < len(urls); i += chunkSize {
+ end := i + chunkSize
+ if end > len(urls) {
+ end = len(urls)
+ }
+
+ chunk := urls[i:end]
+
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing chunk %d-%d of %d URLs", i+1, end, len(urls))
+ }
+
+ // do the work
+ chunkResult := processChunk(ctx, chunk, config, encoder, logEncoder)
+
+ result.ArticlesWritten += chunkResult.ArticlesWritten
+ result.Errors += chunkResult.Errors
+ processedCount += len(chunk)
+
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processed %d of %d URLs", processedCount, len(urls))
+ }
+ }
+
+ return result
+}
+
+// processChunk handles routing, batching, and fallback for a given chunk of URLs.
+func processChunk(ctx context.Context, urls []string, config *Config, encoder *json.Encoder, logEncoder *json.Encoder) ProcessResult {
+ result := ProcessResult{}
+
+ // create temporary articles for routing and processing
+ articles := make([]*Article, len(urls))
+ for i, url := range urls {
+ articles[i] = &Article{URL: url}
+ }
+
+ // 1. route all articles in the chunk
+ for _, article := range articles {
+ routeArticle(article)
+ }
+
+ // 2. group by type for batching
+ arxivURLs := []string{}
+ s2URLs := []string{}
+ htmlURLs := []string{}
+
+ for _, article := range articles {
+ switch article.Route {
+ case "arxiv":
+ arxivURLs = append(arxivURLs, article.URL)
+ case "s2":
+ s2URLs = append(s2URLs, article.URL)
+ default:
+ htmlURLs = append(htmlURLs, article.URL)
+ }
+ }
+
+ // 3. process each route type (groups are bounded by the chunk size)
+ if len(arxivURLs) > 0 {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing %d arXiv URLs in chunk", len(arxivURLs))
+ }
+ n, err := processArxiv(ctx, arxivURLs, encoder, config, logEncoder)
+ result.ArticlesWritten += n
+ if err != nil {
+ result.Errors++
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error processing arXiv URLs: %v", err)
+ }
+ }
+ }
+
+ if len(s2URLs) > 0 {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing %d Semantic Scholar URLs in chunk", len(s2URLs))
+ }
+ n, err := processSemanticScholar(ctx, s2URLs, encoder, config, logEncoder)
+ result.ArticlesWritten += n
+ if err != nil {
+ result.Errors++
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error processing S2 URLs: %v", err)
+ }
+ }
+ }
+
+ if len(htmlURLs) > 0 {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing %d raw HTML URLs in chunk", len(htmlURLs))
+ }
+ n, err := processHTML(ctx, htmlURLs, encoder, config, logEncoder)
+ result.ArticlesWritten += n
+ if err != nil {
+ result.Errors++
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error processing HTML URLs: %v", err)
+ }
+ }
+ }
+
+ return result
+}
+
+func processArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ articles, err := fetchArxivBatch(ctx, config, urls)
+ if err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("arXiv batch failed: %v, falling back to individual processing", err)
+ }
+ return processIndividualArxiv(ctx, urls, encoder, config, logEncoder)
+ }
+
+ written := 0
+ for _, article := range articles {
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, article.URL, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, article.URL, "arxiv", nil)
+ }
+ }
+ return written, nil
+}
+
+func processSemanticScholar(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ articles, err := fetchSemanticScholarBatch(ctx, config, urls)
+ if err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("S2 batch failed: %v, falling back to individual processing", err)
+ }
+ return processIndividualS2(ctx, urls, encoder, config, logEncoder)
+ }
+
+ written := 0
+ for _, article := range articles {
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, article.URL, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, article.URL, "s2", nil)
+ }
+ }
+ return written, nil
+}
+
+func processHTML(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ written := 0
+ for _, url := range urls {
+ article, err := fetchRawHTML(ctx, config, url)
+ if err != nil {
+ _ = logArticleAttempt(logEncoder, url, "", err)
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error fetching HTML %s: %v", url, err)
+ }
+ continue
+ }
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, url, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, url, "html", nil)
+ }
+ }
+ return written, nil
+}
+
+func processIndividualArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ written := 0
+ for _, url := range urls {
+ article, err := fetchArxiv(ctx, config, url)
+ if err != nil {
+ _ = logArticleAttempt(logEncoder, url, "", err)
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error fetching arXiv %s: %v", url, err)
+ }
+ continue
+ }
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, url, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, url, "arxiv", nil)
+ }
+ }
+ return written, nil
+}
+
+func processIndividualS2(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ written := 0
+ for _, url := range urls {
+ article, err := fetchSemanticScholar(ctx, config, url)
+ if err != nil {
+ _ = logArticleAttempt(logEncoder, url, "", err)
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error fetching S2 %s: %v", url, err)
+ }
+ continue
+ }
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, url, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, url, "s2", nil)
+ }
+ }
+ return written, nil
+}
diff --git a/routes.go b/routes.go
new file mode 100644
index 0000000..39bb7a9
--- /dev/null
+++ b/routes.go
@@ -0,0 +1,75 @@
+// ROUTING STRATEGY
+//
+// Routes URLs to the appropriate extraction handler. The order matters:
+// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API
+// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata
+// 3. rawhtml - fallback for direct publisher URLs, generic extraction
+package main
+
+import (
+ "fmt"
+ "net/url"
+ "regexp"
+ "strings"
+)
+
+var (
+ // regex to extract an arXiv identifier from various arXiv URLs.
+ // supports new (2109.05857) and old (math-ph/0301015) formats,
+ // each with an optional version suffix (e.g. v2).
+ arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+/[0-9]{7}(?:v[0-9]+)?|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`)
+
+ // regex to find a DOI in a string.
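+ // e.g. extracts "10.1234/abcd5678" from "https://doi.org/10.1234/abcd5678".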
+ doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`)
+)
+
+// Route determines the primary enrichment strategy for a URL.
+// It returns one of: "arxiv", "s2", or "rawhtml".
+func Route(urlStr string) string {
+ parsedURL, err := url.Parse(urlStr)
+ if err != nil {
+ return "rawhtml" // fallback if URL is unparseable
+ }
+
+ hostname := parsedURL.Hostname()
+
+ // 1. arXiv.org or arXiv ID pattern in URL
+ if hostname == "arxiv.org" || strings.Contains(urlStr, "arxiv.org/abs/") {
+ if _, err := getArxivIdentifier(urlStr); err == nil {
+ return "arxiv"
+ }
+ }
+
+ // 2. direct DOI link from doi.org
+ if hostname == "doi.org" {
+ return "s2"
+ }
+
+ // 3. DOI present in URL path (e.g. some publisher sites)
+ if doi := getDOI(urlStr); doi != "" {
+ return "s2"
+ }
+
+ // 4. fallback to rawhtml
+ return "rawhtml"
+}
+
+// routeArticle determines the route for an article and sets the Route field.
+func routeArticle(article *Article) {
+ article.Route = Route(article.URL)
+}
+
+func getArxivIdentifier(articleURL string) (string, error) {
+ matches := arxivIdentifierRegex.FindStringSubmatch(articleURL)
+ if len(matches) > 1 {
+ return matches[1], nil
+ }
+ return "", fmt.Errorf("no arXiv identifier found")
+}
+
+func getDOI(text string) string {
+ matches := doiRegex.FindStringSubmatch(text)
+ if len(matches) > 1 {
+ return matches[1]
+ }
+ return ""
+}
diff --git a/scholar.go b/scholar.go
new file mode 100644
index 0000000..ad1e5e0
--- /dev/null
+++ b/scholar.go
@@ -0,0 +1,217 @@
+// SEMANTIC SCHOLAR HANDLER
+//
+// Uses S2's Graph API to fetch paper metadata via DOI.
+//
+// STRATEGY:
+// - requires valid DOI in URL or DOI.org redirect
+// - batch API for efficiency (up to 500 papers per request)
+// - positional matching: response[i] maps to URLs[i]
+// - rate limited to one request per 100ms (an API key raises limits)
+//
+// AUTH:
+// - S2_API_KEY environment variable increases rate limits
+// - Without key: public limits apply
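+//
+// EXAMPLE (batch request body; DOIs are illustrative):
+//   {"ids": ["DOI:10.1234/abcd5678", "DOI:10.5678/test2"]}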
+package main
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "strings"
+)
+
+const (
+ semScholarPaperDOIFmtTitle = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title"
+ semScholarPaperDOIFmtFull = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title,abstract"
+ semScholarBatchURLTitle = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title"
+ semScholarBatchURLFull = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title,abstract"
+)
+
+// escapeDOI URL-encodes a DOI for safe use in API endpoint paths.
+// The first slash (separating prefix and suffix) stays literal; each side is
+// percent-encoded, so any further slashes are escaped.
+// Example: "10.1234/ab/cd" -> "10.1234/ab%2Fcd"
+func escapeDOI(doi string) string {
+ parts := strings.SplitN(doi, "/", 2)
+ if len(parts) != 2 {
+ return url.PathEscape(doi)
+ }
+ return url.PathEscape(parts[0]) + "/" + url.PathEscape(parts[1])
+}
+
+// S2BatchResponseItem represents a Semantic Scholar batch API response item
+type S2BatchResponseItem struct {
+ PaperID string `json:"paperId"`
+ Title string `json:"title"`
+ Abstract string `json:"abstract"`
+}
+
+// fetchSemanticScholar fetches content for a single DOI via Semantic Scholar.
+func fetchSemanticScholar(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+ doi := getDOI(urlStr)
+ if doi == "" {
+ return nil, fmt.Errorf("Semantic Scholar: URL doesn't contain valid DOI: %s", urlStr)
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitS2(ctx); err != nil {
+ return nil, err
+ }
+
+ escapedDOI := escapeDOI(doi)
+
+ // choose the appropriate URL based on whether we need content
+ var apiURL string
+ if config.WithContent {
+ apiURL = fmt.Sprintf(semScholarPaperDOIFmtFull, escapedDOI)
+ } else {
+ apiURL = fmt.Sprintf(semScholarPaperDOIFmtTitle, escapedDOI)
+ }
+
+ req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to construct Semantic Scholar request: %w", err)
+ }
+ req.Header.Set("Accept", "application/json")
+ if config.S2APIKey != "" {
+ req.Header.Set("x-api-key", config.S2APIKey)
+ }
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("failed to make request to Semantic Scholar API: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("Semantic Scholar API returned non-200 status for DOI %s: %s", doi, resp.Status)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("failed to read Semantic Scholar API response body: %w", err)
+ }
+
+ var s2 struct {
+ Title string `json:"title"`
+ Abstract string `json:"abstract"`
+ }
+ if err := json.Unmarshal(body, &s2); err != nil {
+ return nil, fmt.Errorf("failed to unmarshal Semantic Scholar JSON for DOI %s: %w", doi, err)
+ }
+
+ title := normalizeSpace(s2.Title)
+ content := normalizeSpace(s2.Abstract)
+
+ // drop content if it was not requested
+ if !config.WithContent {
+ content = ""
+ }
+
+ if title == "" {
+ return nil, fmt.Errorf("no title found for DOI %s", doi)
+ }
+
+ return &Article{
+ Title: title,
+ Content: content,
+ URL: urlStr,
+ }, nil
+}
+
+// fetchSemanticScholarBatch fetches a batch of papers from the S2 API.
+func fetchSemanticScholarBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) {
+ if len(urls) == 0 {
+ return nil, nil
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitS2(ctx); err != nil {
+ return nil, err
+ }
+
+ // extract DOIs from URLs, maintaining order for pos matching
+ validURLs := make([]string, 0, len(urls))
+ s2IDs := make([]string, 0, len(urls))
+
+ for _, urlStr := range urls {
+ doi := getDOI(urlStr)
+ if doi != "" {
+ validURLs = append(validURLs, urlStr)
+ s2IDs = append(s2IDs, "DOI:"+doi)
+ }
+ }
+
+ if len(s2IDs) == 0 {
+ return nil, nil
+ }
+
+ requestBody, err := json.Marshal(map[string][]string{"ids": s2IDs})
+ if err != nil {
+ return nil, fmt.Errorf("failed to marshal S2 batch request body: %w", err)
+ }
+
+ // choose the appropriate URL based on whether we need content
+ var batchURL string
+ if config.WithContent {
+ batchURL = semScholarBatchURLFull
+ } else {
+ batchURL = semScholarBatchURLTitle
+ }
+
+ req, err := http.NewRequestWithContext(ctx, "POST", batchURL, bytes.NewReader(requestBody))
+ if err != nil {
+ return nil, fmt.Errorf("failed to create S2 batch request: %w", err)
+ }
+ req.Header.Set("Content-Type", "application/json")
+ if config.S2APIKey != "" {
+ req.Header.Set("x-api-key", config.S2APIKey)
+ }
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("S2 batch request failed: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("S2 batch API returned non-200 status: %s", resp.Status)
+ }
+
+ var responseItems []*S2BatchResponseItem
+ if err := json.NewDecoder(resp.Body).Decode(&responseItems); err != nil {
+ return nil, fmt.Errorf("failed to decode S2 batch response: %w", err)
+ }
+
+ var articles []*Article
+ // match responses positionally to input URLs
+ for i, item := range responseItems {
+ if i >= len(validURLs) {
+ break
+ }
+ if item == nil {
+ continue
+ }
+
+ title := normalizeSpace(item.Title)
+ if title != "" {
+ content := normalizeSpace(item.Abstract)
+
+ // skip content if not requested
+ if !config.WithContent {
+ content = ""
+ }
+
+ articles = append(articles, &Article{
+ Title: title,
+ Content: content,
+ URL: validURLs[i],
+ })
+ }
+ }
+
+ return articles, nil
+}
diff --git a/scholfetch_test.go b/scholfetch_test.go
new file mode 100644
index 0000000..59adae7
--- /dev/null
+++ b/scholfetch_test.go
@@ -0,0 +1,193 @@
+package main
+
+import (
+ "context"
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+ "time"
+)
+
+type TestLogger struct {
+ messages []string
+}
+
+func (l *TestLogger) Printf(format string, v ...interface{}) {
+ l.messages = append(l.messages, fmt.Sprintf(format, v...))
+}
+
+func TestHTTPClientDefaults(t *testing.T) {
+ client := NewHTTPClient()
+
+ if client.userAgent != "scholfetch/1.0 (+https://samsci.com)" {
+ t.Errorf("Expected default user agent, got %s", client.userAgent)
+ }
+
+ if client.arxivDelay != 1*time.Second {
+ t.Errorf("Expected arxiv delay of 1s, got %v", client.arxivDelay)
+ }
+
+ if client.maxRetries != 3 {
+ t.Errorf("Expected max retries of 3, got %d", client.maxRetries)
+ }
+}
+
+func TestRateLimiting(t *testing.T) {
+ client := NewHTTPClient()
+ client.arxivDelay = 10 * time.Millisecond // Speed up test
+ client.s2Delay = 5 * time.Millisecond
+
+ // Test arxiv rate limiting
+ start := time.Now()
+ err := client.RateLimitArxiv(context.Background())
+ if err != nil {
+ t.Fatalf("RateLimitArxiv failed: %v", err)
+ }
+ duration := time.Since(start)
+ if duration < 10*time.Millisecond {
+ t.Errorf("Expected arxiv delay of ~10ms, got %v", duration)
+ }
+
+ // Test S2 rate limiting
+ start = time.Now()
+ err = client.RateLimitS2(context.Background())
+ if err != nil {
+ t.Fatalf("RateLimitS2 failed: %v", err)
+ }
+ duration = time.Since(start)
+ if duration < 5*time.Millisecond {
+ t.Errorf("Expected S2 delay of ~5ms, got %v", duration)
+ }
+}
+
+func TestHTTPRequest(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusOK)
+ _, _ = w.Write([]byte("test response"))
+ }))
+ defer server.Close()
+
+ client := NewHTTPClient()
+ req, _ := http.NewRequest("GET", server.URL, nil)
+
+ resp, err := client.Do(req)
+ if err != nil {
+ t.Fatalf("Request failed: %v", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != 200 {
+ t.Errorf("Expected status 200, got %d", resp.StatusCode)
+ }
+}
+
+func TestURLRouting(t *testing.T) {
+ tests := map[string]string{
+ "https://arxiv.org/abs/2301.00001": "arxiv",
+ "https://arxiv.org/pdf/2301.00001.pdf": "arxiv",
+ "http://arxiv.org/abs/2301.00001v2": "arxiv",
+ "https://api.semanticscholar.org/DOI:10.1234": "rawhtml",
+ "https://doi.org/10.1234/abcd5678": "s2",
+ "https://example.com/paper": "rawhtml",
+ "https://pubmed.ncbi.nlm.nih.gov/12345678/": "rawhtml",
+ }
+
+ for url, expected := range tests {
+ result := Route(url)
+ if result != expected {
+ t.Errorf("Route(%s) = %s, expected %s", url, result, expected)
+ }
+ }
+}
+
+func TestConfigDefaults(t *testing.T) {
+ config := NewConfig()
+
+ if config.WithContent != false {
+ t.Error("Expected WithContent=false by default")
+ }
+
+ if config.Verbose != false {
+ t.Error("Expected Verbose=false by default")
+ }
+
+ if config.ArxivBatch != 50 {
+ t.Errorf("Expected ArxivBatch=50, got %d", config.ArxivBatch)
+ }
+
+ if config.HTTP == nil {
+ t.Error("Expected HTTP client to be initialized")
+ }
+}
+
+func TestConfigWithLogger(t *testing.T) {
+ logger := &TestLogger{}
+ config := NewConfigWithLogger(logger)
+
+ if config.Logger != logger {
+ t.Error("Logger not set correctly")
+ }
+}
+
+func TestArxivURLParsing(t *testing.T) {
+ tests := map[string]string{
+ "https://arxiv.org/abs/2301.00001": "2301.00001",
+ "http://arxiv.org/abs/2301.00001v2": "2301.00001v2",
+ "https://arxiv.org/pdf/2301.00001.pdf": "2301.00001",
+ "https://example.com/not-arxiv": "",
+ }
+
+ for url, expected := range tests {
+ result, _ := getArxivIdentifier(url)
+ if result != expected {
+ t.Errorf("getArxivIdentifier(%s) = %s, expected %s", url, result, expected)
+ }
+ }
+}
+
+func TestDOIParsing(t *testing.T) {
+ tests := map[string]string{
+ "https://doi.org/10.1234/abcd5678": "10.1234/abcd5678",
+ "https://api.semanticscholar.org/DOI:10.1234": "",
+ "https://example.com/no-doi": "",
+ }
+
+ for url, expected := range tests {
+ result := getDOI(url)
+ if result != expected {
+ t.Errorf("getDOI(%s) = %s, expected %s", url, result, expected)
+ }
+ }
+}
+
+func TestBatchURLRouting(t *testing.T) {
+ urls := []string{
+ "https://arxiv.org/abs/2301.00001",
+ "https://doi.org/10.1234/test1",
+ "https://example.com/paper1",
+ "https://arxiv.org/pdf/2301.00002.pdf",
+ "https://doi.org/10.5678/test2",
+ }
+
+ routeCounts := make(map[string]int)
+ for _, url := range urls {
+ route := Route(url)
+ routeCounts[route]++
+ }
+
+ expected := map[string]int{
+ "arxiv": 2,
+ "s2": 2,
+ "rawhtml": 1,
+ }
+
+ for route, expectedCount := range expected {
+ if routeCounts[route] != expectedCount {
+ t.Errorf("Expected %d URLs for route %s, got %d",
+ expectedCount, route, routeCounts[route])
+ }
+ }
+}
\ No newline at end of file
diff --git a/util.go b/util.go
new file mode 100644
index 0000000..f7d94e5
--- /dev/null
+++ b/util.go
@@ -0,0 +1,14 @@
+package main
+
+import (
+ "regexp"
+ "strings"
+)
+
+// normalizeSpace collapses all whitespace runs in s to single spaces
+// and trims leading/trailing whitespace.
+func normalizeSpace(s string) string {
+ return strings.Join(strings.Fields(s), " ")
+}
+
+// arxivVersionRegex matches a trailing arXiv version suffix like "v2".
+// Compiled once at package init rather than on every call.
+var arxivVersionRegex = regexp.MustCompile(`v\d+$`)
+
+// stripArxivVersion removes the version suffix from an arXiv ID.
+func stripArxivVersion(id string) string {
+ return arxivVersionRegex.ReplaceAllString(id, "")
+}