author    Sam Scholten  2025-12-15 19:35:46 +1000
committer Sam Scholten  2025-12-15 19:35:57 +1000
commit    3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 (patch)
tree      42b1f0e0a346a1cf087df90e29a100edbd66b3eb
Init v0.1.0 (HEAD, main)
-rw-r--r--  .gitignore           10
-rw-r--r--  README.md            83
-rw-r--r--  arxiv.go            196
-rw-r--r--  client.go           133
-rw-r--r--  go.mod               18
-rw-r--r--  go.sum               96
-rw-r--r--  html.go             198
-rw-r--r--  justfile             20
-rw-r--r--  main.go             131
-rw-r--r--  processor.go        295
-rw-r--r--  routes.go            75
-rw-r--r--  scholar.go          217
-rw-r--r--  scholfetch_test.go  193
-rw-r--r--  util.go              14
14 files changed, 1679 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4a79922
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# Built binaries
+scholfetch
+
+# Test files
+test_urls.txt
+
+# Environment and configuration files
+.env*
+config.*
+secrets.*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9190dc1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,83 @@
+# ScholFetch
+
+URL → Article metadata (JSONL) converter. Fetches titles only by default, for speed.
+
+## Overview
+
+ScholFetch extracts academic article metadata from URLs.
+It supports arXiv, Semantic Scholar, and generic HTML sources.
+The tool outputs structured JSONL suitable for downstream processing by ScholScan (see below).
+
+## Usage
+```bash
+cat urls.txt | scholfetch > articles.jsonl
+# or:
+cat urls.txt | scholfetch --with-content > articles.jsonl
+```
+
+## Monitoring Progress
+
+ScholFetch writes a structured log file `scholfetch.log` during processing. Monitor it in another terminal:
+
+```bash
+tail -f scholfetch.log
+```
+
+## Semantic Scholar API key
+
+Get higher rate limits by setting your S2 API key (*not required*):
+
+```bash
+export S2_API_KEY="your-key-here"
+cat urls.txt | scholfetch > articles.jsonl
+```
+
+Get your free key at: https://www.semanticscholar.org/product/api
+
+ScholFetch reports on startup whether the key was detected.
+
+## Integration with ScholScan
+
+Once you have structured article data, pipe it to [ScholScan](https://git.samsci.com/scholscan) for ML-based filtering:
+
+```bash
+# Get articles from URLs
+cat urls.txt | scholfetch > articles.jsonl
+
+# Train a classification model
+scholscan train articles.jsonl --rss-feeds feeds.txt > model.json
+
+# Score articles from an RSS feed
+scholscan scan --model model.json --url "https://example.com/feed.rss" > results.jsonl
+```
+
+ScholFetch extracts and enriches article metadata, while ScholScan handles classification. Together they provide a complete pipeline for filtering academic literature.
+
+## Input/Output
+- Input: URLs (one per line) on stdin
+- Output: JSONL with `title` and `url` fields (stdout); see the example below
+- Add `--with-content` to include a `content` field
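+
+For example, a title-only output line might look like this (illustrative, not real output):
+
+```json
+{"title":"Attention Is All You Need","url":"https://arxiv.org/abs/1706.03762"}
+```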
+
+## How it works
+
+URLs are routed by pattern: arXiv IDs → arXiv API, DOIs → Semantic Scholar, everything else → HTML scrape.
+Requests are batched in chunks of 50 for efficiency; if a batch fails, ScholFetch falls back to individual requests. Each API is rate limited independently.
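+
+For example, three sample inputs would route as follows (the DOI is illustrative):
+
+```text
+https://arxiv.org/abs/2109.05857  → arxiv   (arXiv API)
+https://doi.org/10.1234/abcd5678  → s2      (Semantic Scholar)
+https://example.com/paper         → rawhtml (HTML scrape)
+```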
+
+## Code
+
+- `main.go` - reads stdin, sets up flags/output
+- `routes.go` - determines which handler (arxiv/s2/html) for each URL
+- `processor.go` - batching, fallback logic
+- `arxiv.go`, `scholar.go`, `html.go` - the actual extractors
+- `client.go` - HTTP client with retries and rate limiting
+
+## Build and Development
+
+```bash
+just build
+just test
+```
+
+## Roadmap
+
+Future work could integrate Crossref and PubMed fairly easily, especially for the title-only approach.
diff --git a/arxiv.go b/arxiv.go
new file mode 100644
index 0000000..6e7fad5
--- /dev/null
+++ b/arxiv.go
@@ -0,0 +1,196 @@
+// ARXIV HANDLER
+//
+// Uses arXiv's API to fetch article metadata.
+//
+// STRATEGY:
+// - single requests and batched requests supported
+// - uses gofeed to parse Atom XML responses
+// - rate limited to 1 request per second (conservative)
+// - handles both old (math-ph/0301015) and new (2109.05857) ID formats
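+//
+// EXAMPLE (shape of a batched query URL, using the sample IDs above):
+//   http://export.arxiv.org/api/query?id_list=math-ph/0301015,2109.05857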
+package main
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+
+ "github.com/mmcdole/gofeed"
+)
+
+const (
+ arxivQueryFmt = "http://export.arxiv.org/api/query?id_list=%s"
+)
+
+// fetchArxiv fetches content for a single arXiv article.
+func fetchArxiv(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+ arxivID, err := getArxivIdentifier(urlStr)
+ if err != nil || arxivID == "" {
+ return nil, fmt.Errorf("arXiv: invalid URL format, expected arxiv.org/abs/ID: %s", urlStr)
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitArxiv(ctx); err != nil {
+ return nil, err
+ }
+
+ apiURL := fmt.Sprintf(arxivQueryFmt, arxivID)
+ req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to construct arXiv request: %w", err)
+ }
+ req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8")
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch arXiv feed for ID %s: %w", arxivID, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("arXiv API returned non-200 status for ID %s: %s", arxivID, resp.Status)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("failed to read arXiv response body for ID %s: %w", arxivID, err)
+ }
+
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(bytes.NewReader(body))
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse arXiv feed for ID %s: %w", arxivID, err)
+ }
+
+ if len(feed.Items) == 0 {
+ return nil, fmt.Errorf("no items found in arXiv feed for ID %s", arxivID)
+ }
+
+ item := feed.Items[0]
+ title := normalizeSpace(item.Title)
+ content := normalizeSpace(item.Description)
+
+ if config.Verbose {
+ config.Logger.Printf("arXiv single fetch result: ID=%s, Title=%s", arxivID, title)
+ }
+
+ // drop content if it was not requested
+ if !config.WithContent {
+ content = ""
+ }
+
+ if title == "" {
+ return nil, fmt.Errorf("no title found for arXiv ID %s", arxivID)
+ }
+
+ return &Article{
+ Title: title,
+ Content: content,
+ URL: urlStr,
+ }, nil
+}
+
+// fetchArxivBatch fetches metadata for a list of arXiv URLs in batches.
+func fetchArxivBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) {
+ if len(urls) == 0 {
+ return nil, nil
+ }
+
+ idToURL := make(map[string]string)
+ batchIDs := make([]string, 0, len(urls))
+
+ for _, urlStr := range urls {
+ id, err := getArxivIdentifier(urlStr)
+ if err != nil {
+ continue
+ }
+ batchIDs = append(batchIDs, id)
+ stripped := stripArxivVersion(id)
+ idToURL[stripped] = urlStr
+ }
+
+ if len(batchIDs) == 0 {
+ return nil, nil
+ }
+
+ var articles []*Article
+
+ for i := 0; i < len(batchIDs); i += config.ArxivBatch {
+ end := i + config.ArxivBatch
+ if end > len(batchIDs) {
+ end = len(batchIDs)
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitArxiv(ctx); err != nil {
+ return nil, err
+ }
+
+ chunk := batchIDs[i:end]
+ apiURL := fmt.Sprintf(arxivQueryFmt, strings.Join(chunk, ","))
+
+ req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
+ if err != nil {
+ continue
+ }
+ req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8")
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ continue
+ }
+
+ if resp.StatusCode != http.StatusOK {
+ resp.Body.Close()
+ continue
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ resp.Body.Close()
+ if err != nil {
+ continue
+ }
+
+ fp := gofeed.NewParser()
+ feed, err := fp.Parse(bytes.NewReader(body))
+ if err != nil {
+ continue
+ }
+
+ for _, item := range feed.Items {
+ id, err := getArxivIdentifier(item.GUID)
+ if err != nil || id == "" {
+ id, err = getArxivIdentifier(item.Link)
+ if err != nil || id == "" {
+ continue
+ }
+ }
+
+ title := normalizeSpace(item.Title)
+ if title == "" {
+ continue
+ }
+
+ baseID := stripArxivVersion(id)
+ originalURL, exists := idToURL[baseID]
+ if !exists {
+ continue
+ }
+
+ content := ""
+ if config.WithContent {
+ content = normalizeSpace(item.Description)
+ }
+
+ articles = append(articles, &Article{
+ Title: title,
+ Content: content,
+ URL: originalURL,
+ })
+ }
+ }
+
+ return articles, nil
+}
diff --git a/client.go b/client.go
new file mode 100644
index 0000000..39a3e34
--- /dev/null
+++ b/client.go
@@ -0,0 +1,133 @@
+// CLIENT LAYER - HTTP AND RATE LIMITING
+//
+// manages HTTP requests with retry logic and API-specific rate limits.
+//
+// RATE LIMITS:
+// - arXiv: 1 second between requests (enforced to be safe)
+// - Semantic Scholar: 100ms between requests (an API key raises the server-side limits)
+//
+// STRATEGY:
+// - retries on network failures and HTTP 429
+// - exponential backoff between retries (1s, then 2s, doubling)
+// - all delays respect context cancellation
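+//
+// EXAMPLE (default schedule when every attempt fails):
+//   attempt 1 → immediate, attempt 2 → after 1s, attempt 3 → after 2s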
+package main
+
+import (
+ "context"
+ "net/http"
+ "os"
+ "time"
+)
+
+// HTTPClient wraps an HTTP client with common behavior like user agent,
+// rate limiting, and retry logic.
+type HTTPClient struct {
+ client *http.Client
+ userAgent string
+ arxivDelay time.Duration
+ s2Delay time.Duration
+ maxRetries int
+}
+
+// NewHTTPClient creates a new HTTP client wrapper with defaults.
+func NewHTTPClient() *HTTPClient {
+ return &HTTPClient{
+ client: &http.Client{
+ Timeout: 30 * time.Second,
+ },
+ userAgent: "scholfetch/1.0 (+https://samsci.com)",
+ arxivDelay: 1 * time.Second,
+ s2Delay: 100 * time.Millisecond,
+ maxRetries: 3,
+ }
+}
+
+// Do performs an HTTP request with retry logic.
+// retries on network errors and 429 (rate limit) responses.
+func (c *HTTPClient) Do(req *http.Request) (*http.Response, error) {
+ // Set user agent if not already set
+ if req.Header.Get("User-Agent") == "" {
+ req.Header.Set("User-Agent", c.userAgent)
+ }
+
+ var lastErr error
+ for attempt := 0; attempt < c.maxRetries; attempt++ {
+ if attempt > 0 {
+ // Exponential backoff: 1s, 2s, ... (doubles with each retry)
+ backoff := time.Duration(1<<uint(attempt-1)) * time.Second
+ select {
+ case <-time.After(backoff):
+ case <-req.Context().Done():
+ return nil, req.Context().Err()
+ }
+ }
+
+ resp, err := c.client.Do(req)
+ if err != nil {
+ lastErr = err
+ continue
+ }
+
+ // Retry on 429 (rate limit) but not on other status codes.
+ // Record an error so exhausting all retries never returns (nil, nil).
+ if resp.StatusCode == http.StatusTooManyRequests {
+ resp.Body.Close()
+ lastErr = fmt.Errorf("rate limited (HTTP 429) on attempt %d", attempt+1)
+ continue
+ }
+
+ return resp, nil
+ }
+
+ return nil, lastErr
+}
+
+// RateLimitArxiv adds a delay for arXiv API requests.
+func (c *HTTPClient) RateLimitArxiv(ctx context.Context) error {
+ select {
+ case <-time.After(c.arxivDelay):
+ return nil
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+}
+
+// RateLimitS2 adds a delay for Semantic Scholar API requests.
+func (c *HTTPClient) RateLimitS2(ctx context.Context) error {
+ select {
+ case <-time.After(c.s2Delay):
+ return nil
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+}
+
+// config for scholfetch.
+type Config struct {
+ WithContent bool
+ Verbose bool
+ Logger Logger
+ HTTP *HTTPClient
+ ArxivBatch int
+ S2APIKey string
+}
+
+// Logger interface for dependency injection
+type Logger interface {
+ Printf(format string, v ...interface{})
+}
+
+func NewConfig() *Config {
+ return &Config{
+ WithContent: false,
+ Verbose: false,
+ HTTP: NewHTTPClient(),
+ ArxivBatch: 50,
+ S2APIKey: os.Getenv("S2_API_KEY"),
+ }
+}
+
+func NewConfigWithLogger(logger Logger) *Config {
+ cfg := NewConfig()
+ cfg.Logger = logger
+ return cfg
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..de16fa5
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,18 @@
+module scholfetch
+
+go 1.25.5
+
+require (
+ github.com/PuerkitoBio/goquery v1.11.0
+ github.com/mmcdole/gofeed v1.3.0
+)
+
+require (
+ github.com/andybalholm/cascadia v1.3.3 // indirect
+ github.com/json-iterator/go v1.1.12 // indirect
+ github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
+ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+ github.com/modern-go/reflect2 v1.0.2 // indirect
+ golang.org/x/net v0.47.0 // indirect
+ golang.org/x/text v0.31.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..5933b8c
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,96 @@
+github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw=
+github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ=
+github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
+github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
+github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
+github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
+golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
+golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
+golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
+golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
+golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
+golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
+golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
+golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
+golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
+golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
+golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
+golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/html.go b/html.go
new file mode 100644
index 0000000..0995865
--- /dev/null
+++ b/html.go
@@ -0,0 +1,198 @@
+// RAW HTML HANDLER
+//
+// Fallback handler for URLs that don't match arXiv or Semantic Scholar patterns.
+//
+// STRATEGY:
+// - progressive extraction tries multiple metadata sources in order
+// - JSON-LD structured data first (highest quality)
+// - citation meta tags (scholarly articles)
+// - Open Graph tags (social media)
+// - basic HTML tags (last resort)
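+//
+// EXAMPLE (a JSON-LD block this handler can read; values are illustrative):
+//   <script type="application/ld+json">
+//     {"headline": "Some Paper Title", "abstract": "One-paragraph summary ..."}
+//   </script>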
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "net/http"
+
+ "github.com/PuerkitoBio/goquery"
+)
+
+// fetchRawHTML attempts to fetch article content by parsing HTML metadata.
+func fetchRawHTML(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+ req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to build request for %s: %w", urlStr, err)
+ }
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("failed to fetch %s: %w", urlStr, err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d when fetching %s", resp.StatusCode, urlStr)
+ }
+
+ doc, err := goquery.NewDocumentFromReader(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse HTML from %s: %w", urlStr, err)
+ }
+
+ // extract title using various strategies
+ strategies := []func(*goquery.Document) string{
+ extractTitleFromJSONLD,
+ extractTitleFromCitationMeta,
+ extractTitleFromOpenGraph,
+ func(d *goquery.Document) string { return normalizeSpace(d.Find("title").First().Text()) },
+ }
+
+ var title string
+ for _, strategy := range strategies {
+ title = strategy(doc)
+ if title != "" {
+ break
+ }
+ }
+
+ if title == "" {
+ return nil, fmt.Errorf("no title found for %s", urlStr)
+ }
+
+ article := &Article{
+ URL: urlStr,
+ Title: title,
+ }
+
+ // only fetch content if requested
+ if config.WithContent {
+ contentStrategies := []func(*goquery.Document) string{
+ extractContentFromJSONLD,
+ extractContentFromCitationMeta,
+ extractContentFromOpenGraph,
+ extractContentFromBasicMeta,
+ }
+
+ var content string
+ for _, strategy := range contentStrategies {
+ content = strategy(doc)
+ if content != "" && len(content) > 50 {
+ break
+ }
+ }
+
+ if content != "" {
+ article.Content = content
+ }
+ }
+
+ return article, nil
+}
+
+func extractTitleFromJSONLD(doc *goquery.Document) string {
+ var title string
+ doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
+ if title != "" {
+ return
+ }
+ var data map[string]interface{}
+ if json.Unmarshal([]byte(s.Text()), &data) == nil {
+ t := getStringFromJSONLD(data, []string{"name", "headline", "title"})
+ if t != "" {
+ title = normalizeSpace(t)
+ }
+ }
+ })
+ return title
+}
+
+func extractContentFromJSONLD(doc *goquery.Document) string {
+ var content string
+ doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
+ if content != "" {
+ return
+ }
+ var data map[string]interface{}
+ if json.Unmarshal([]byte(s.Text()), &data) == nil {
+ d := getStringFromJSONLD(data, []string{"description", "abstract", "summary"})
+ if d != "" {
+ content = normalizeSpace(d)
+ if len(content) > 5000 {
+ content = content[:5000] + "..."
+ }
+ }
+ }
+ })
+ return content
+}
+
+func extractTitleFromCitationMeta(doc *goquery.Document) string {
+ title, _ := doc.Find("meta[name='citation_title']").Attr("content")
+ return normalizeSpace(title)
+}
+
+func extractContentFromCitationMeta(doc *goquery.Document) string {
+ content, _ := doc.Find("meta[name='citation_abstract']").Attr("content")
+ return normalizeSpace(content)
+}
+
+func extractTitleFromOpenGraph(doc *goquery.Document) string {
+ title, _ := doc.Find("meta[property='og:title']").Attr("content")
+ return normalizeSpace(title)
+}
+
+func extractContentFromOpenGraph(doc *goquery.Document) string {
+ content, _ := doc.Find("meta[property='og:description']").Attr("content")
+ return normalizeSpace(content)
+}
+
+func extractContentFromBasicMeta(doc *goquery.Document) string {
+ contentRaw, _ := doc.Find("meta[name='description']").Attr("content")
+ content := normalizeSpace(contentRaw)
+
+ // if meta description is too short, try to extract from the body
+ if len(content) < 100 {
+ selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"}
+ for _, selector := range selectors {
+ if contentText := extractCleanText(doc, selector); len(contentText) > len(content) {
+ content = contentText
+ break
+ }
+ }
+ }
+
+ if len(content) > 5000 {
+ content = content[:5000]
+ }
+
+ if len(content) < 50 {
+ return ""
+ }
+
+ return content
+}
+
+func extractCleanText(doc *goquery.Document, selector string) string {
+ element := doc.Find(selector).First()
+ if element.Length() == 0 {
+ return ""
+ }
+ element.Find("script, style, nav, header, footer, aside").Remove()
+ text := element.Text()
+ text = normalizeSpace(text)
+ if len(text) > 5000 {
+ text = text[:5000]
+ }
+ return text
+}
+
+func getStringFromJSONLD(data map[string]interface{}, fields []string) string {
+ for _, field := range fields {
+ if val, ok := data[field].(string); ok && val != "" {
+ return val
+ }
+ }
+ return ""
+}
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..2e0f285
--- /dev/null
+++ b/justfile
@@ -0,0 +1,20 @@
+# ScholFetch - URL to article metadata converter
+
+default:
+ @just --list
+
+# Build the binary
+build:
+ go build -o scholfetch .
+
+# Run tests
+test:
+ go test ./...
+
+# Format code
+fmt:
+ go fmt ./...
+
+# Run linter (requires golangci-lint)
+lint:
+ golangci-lint run
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..1c286ea
--- /dev/null
+++ b/main.go
@@ -0,0 +1,131 @@
+// scholfetch - URL to article converter for scholscan
+// takes URLs on stdin, outputs Article structs on stdout (JSONL)
+// logs everything to scholfetch.log
+package main
+
+import (
+ "bufio"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strings"
+)
+
+type Article struct {
+ Title string `json:"title"`
+ Content string `json:"content,omitempty"` // Optional - expensive to fetch
+ URL string `json:"url"`
+ Route string `json:"-"` // Internal: which handler this URL is routed to
+}
+
+func main() {
+ var withContent bool
+ var verbose bool
+
+ fs := flag.NewFlagSet("scholfetch", flag.ExitOnError)
+ fs.Usage = func() {
+ fmt.Fprintf(fs.Output(), `Usage: scholfetch [options] < urls.txt > articles.jsonl
+
+Converts URLs to Article JSONL format for scholscan processing.
+
+Default mode: Title-only extraction (fast)
+Optional mode: Full content extraction with --with-content
+
+Input: Text file with one URL per line (stdin)
+Output: Article JSONL (stdout)
+
+Options:
+`)
+ fs.PrintDefaults()
+ fmt.Fprint(fs.Output(), `
+Examples:
+ # Title-only mode (default)
+ cat urls.txt | scholfetch > articles.jsonl
+
+ # With full content
+ cat urls.txt | scholfetch --with-content > articles.jsonl
+
+Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits.
+`)
+ }
+
+ fs.BoolVar(&withContent, "with-content", false, "Fetch full article content (slower)")
+ fs.BoolVar(&verbose, "verbose", false, "Show progress information")
+
+ // validate args and exit early on err
+ if err := fs.Parse(os.Args[1:]); err != nil {
+ fmt.Fprintf(os.Stderr, "Error: %v\n", err)
+ os.Exit(1)
+ }
+
+ if fs.NArg() > 0 {
+ fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", fs.Args())
+ os.Exit(1)
+ }
+
+ // set up logger
+ var logger *log.Logger
+ if verbose {
+ logger = log.New(os.Stderr, "", log.LstdFlags)
+ } else {
+ logger = log.New(io.Discard, "", 0)
+ }
+
+ // config controls how URLs are handled and what data is extracted
+ config := NewConfigWithLogger(logger)
+ config.WithContent = withContent
+ config.Verbose = verbose
+
+ urls := readURLs(os.Stdin)
+
+ // notify user about S2 key found/not
+ if config.S2APIKey != "" {
+ fmt.Fprintln(os.Stderr, "Semantic Scholar API key detected: using authenticated rate limits.")
+ } else {
+ fmt.Fprintln(os.Stderr, "Semantic Scholar API key not set: using public rate limits.")
+ }
+
+ // log file for per-URL processing info, kept separate from stderr to keep terminal output clean
+ logFile, err := os.Create("scholfetch.log")
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err)
+ os.Exit(1)
+ }
+ defer logFile.Close()
+
+ fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), withContent)
+ fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log")
+
+ encoder := json.NewEncoder(os.Stdout)
+
+ // DO THE ACTUAL WORK
+ result := ProcessURLsWithConfig(urls, config, encoder, logFile)
+
+ // report final stats to stderr
+ fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors)
+ fmt.Fprintln(os.Stderr, "See scholfetch.log for details")
+}
+
+// readURLs reads one URL per line from r,
+// skipping blank lines and comments (#).
+func readURLs(r io.Reader) []string {
+ var urls []string
+ scanner := bufio.NewScanner(r)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line != "" && !strings.HasPrefix(line, "#") {
+ urls = append(urls, line)
+ }
+ }
+ return urls
+}
diff --git a/processor.go b/processor.go
new file mode 100644
index 0000000..1079e3d
--- /dev/null
+++ b/processor.go
@@ -0,0 +1,295 @@
+// PROCESSING PIPELINE
+//
+// Handles batch processing of URLs with rate limiting and fallback strategies.
+//
+// DESIGN:
+// - fixed chunk size (50) to balance API efficiency and error recovery
+// - batching for arxiv/s2 APIs, individual fallback on batch failure
+// - separate handlers for each route type (arxiv, s2, rawhtml)
+// - JSONL logging of every attempt (success/failure) with timestamps
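+//
+// EXAMPLE (one scholfetch.log entry; values are illustrative):
+//   {"time":"2025-12-15T19:35:46+10:00","url":"https://arxiv.org/abs/2109.05857","success":1,"api":"arxiv"}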
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "os"
+ "time"
+)
+
+type ProcessResult struct {
+ ArticlesWritten int
+ Errors int
+}
+
+type URLLogEntry struct {
+ Time string `json:"time"`
+ URL string `json:"url"`
+ Success int `json:"success"`
+ API string `json:"api"`
+ Error string `json:"error,omitempty"`
+}
+
+func logArticleAttempt(logEncoder *json.Encoder, url, api string, err error) error {
+ success := 0
+ errMsg := ""
+ if err == nil {
+ success = 1
+ } else {
+ errMsg = err.Error()
+ }
+ return logEncoder.Encode(URLLogEntry{
+ Time: time.Now().Format(time.RFC3339),
+ URL: url,
+ Success: success,
+ API: api,
+ Error: errMsg,
+ })
+}
+
+func logEncodingFailure(logEncoder *json.Encoder, url string, err error) error {
+ return logEncoder.Encode(URLLogEntry{
+ Time: time.Now().Format(time.RFC3339),
+ URL: url,
+ Success: 0,
+ API: "",
+ Error: fmt.Sprintf("encoding error: %v", err),
+ })
+}
+
+// ProcessURLsWithConfig orchestrates the entire processing pipeline
+// chunks URLs to balance API efficiency with error recovery
+func ProcessURLsWithConfig(urls []string, config *Config, encoder *json.Encoder, logFile io.Writer) ProcessResult {
+ result := ProcessResult{}
+ ctx := context.Background()
+ logEncoder := json.NewEncoder(logFile)
+
+ chunkSize := 50
+
+ processedCount := 0
+
+ // process URLs in chunks
+ for i := 0; i < len(urls); i += chunkSize {
+ end := i + chunkSize
+ if end > len(urls) {
+ end = len(urls)
+ }
+
+ chunk := urls[i:end]
+
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing chunk %d-%d of %d URLs", i+1, end, len(urls))
+ }
+
+ // do the work
+ chunkResult := processChunk(ctx, chunk, config, encoder, logEncoder)
+
+ result.ArticlesWritten += chunkResult.ArticlesWritten
+ result.Errors += chunkResult.Errors
+ processedCount += len(chunk)
+
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processed %d of %d URLs", processedCount, len(urls))
+ }
+ }
+
+ return result
+}
+
+// processChunk handles routing, batching, and fallback for a given chunk of URLs.
+func processChunk(ctx context.Context, urls []string, config *Config, encoder *json.Encoder, logEncoder *json.Encoder) ProcessResult {
+ result := ProcessResult{}
+
+ // create temporary articles for routing and processing
+ articles := make([]*Article, len(urls))
+ for i, url := range urls {
+ articles[i] = &Article{URL: url}
+ }
+
+ // 1. route all articles in the chunk
+ for _, article := range articles {
+ routeArticle(article)
+ }
+
+ // 2. group by type for batching
+ arxivURLs := []string{}
+ s2URLs := []string{}
+ htmlURLs := []string{}
+
+ for _, article := range articles {
+ switch article.Route {
+ case "arxiv":
+ arxivURLs = append(arxivURLs, article.URL)
+ case "s2":
+ s2URLs = append(s2URLs, article.URL)
+ default:
+ htmlURLs = append(htmlURLs, article.URL)
+ }
+ }
+
+ // 3. process each route type (groups are bounded by the chunk size)
+ if len(arxivURLs) > 0 {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing %d arXiv URLs in chunk", len(arxivURLs))
+ }
+ n, err := processArxiv(ctx, arxivURLs, encoder, config, logEncoder)
+ result.ArticlesWritten += n
+ if err != nil {
+ result.Errors++
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error processing arXiv URLs: %v", err)
+ }
+ }
+ }
+
+ if len(s2URLs) > 0 {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing %d Semantic Scholar URLs in chunk", len(s2URLs))
+ }
+ n, err := processSemanticScholar(ctx, s2URLs, encoder, config, logEncoder)
+ result.ArticlesWritten += n
+ if err != nil {
+ result.Errors++
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error processing S2 URLs: %v", err)
+ }
+ }
+ }
+
+ if len(htmlURLs) > 0 {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Processing %d raw HTML URLs in chunk", len(htmlURLs))
+ }
+ n, err := processHTML(ctx, htmlURLs, encoder, config, logEncoder)
+ result.ArticlesWritten += n
+ if err != nil {
+ result.Errors++
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error processing HTML URLs: %v", err)
+ }
+ }
+ }
+
+ return result
+}
+
+func processArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ articles, err := fetchArxivBatch(ctx, config, urls)
+ if err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("arXiv batch failed: %v, falling back to individual processing", err)
+ }
+ return processIndividualArxiv(ctx, urls, encoder, config, logEncoder)
+ }
+
+ written := 0
+ for _, article := range articles {
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, article.URL, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, article.URL, "arxiv", nil)
+ }
+ }
+ return written, nil
+}
+
+func processSemanticScholar(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ articles, err := fetchSemanticScholarBatch(ctx, config, urls)
+ if err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("S2 batch failed: %v, falling back to individual processing", err)
+ }
+ return processIndividualS2(ctx, urls, encoder, config, logEncoder)
+ }
+
+ written := 0
+ for _, article := range articles {
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, article.URL, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, article.URL, "s2", nil)
+ }
+ }
+ return written, nil
+}
+
+func processHTML(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ written := 0
+ for _, url := range urls {
+ article, err := fetchRawHTML(ctx, config, url)
+ if err != nil {
+ _ = logArticleAttempt(logEncoder, url, "", err)
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error fetching HTML %s: %v", url, err)
+ }
+ continue
+ }
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, url, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, url, "html", nil)
+ }
+ }
+ return written, nil
+}
+
+func processIndividualArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ written := 0
+ for _, url := range urls {
+ article, err := fetchArxiv(ctx, config, url)
+ if err != nil {
+ _ = logArticleAttempt(logEncoder, url, "", err)
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error fetching arXiv %s: %v", url, err)
+ }
+ continue
+ }
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, url, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, url, "arxiv", nil)
+ }
+ }
+ return written, nil
+}
+
+func processIndividualS2(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) {
+ written := 0
+ for _, url := range urls {
+ article, err := fetchSemanticScholar(ctx, config, url)
+ if err != nil {
+ _ = logArticleAttempt(logEncoder, url, "", err)
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error fetching S2 %s: %v", url, err)
+ }
+ continue
+ }
+ if err := encoder.Encode(article); err != nil {
+ if config.Verbose && config.Logger != nil {
+ config.Logger.Printf("Error encoding article: %v", err)
+ }
+ _ = logEncodingFailure(logEncoder, url, err)
+ } else {
+ written++
+ _ = logArticleAttempt(logEncoder, url, "s2", nil)
+ }
+ }
+ return written, nil
+}
diff --git a/routes.go b/routes.go
new file mode 100644
index 0000000..39bb7a9
--- /dev/null
+++ b/routes.go
@@ -0,0 +1,75 @@
+// ROUTING STRATEGY
+//
+// Routes URLs to the appropriate extraction handler. The order matters:
+// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API
+// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata
+// 3. rawhtml - fallback for direct publisher URLs, generic extraction
+package main
+
+import (
+ "fmt"
+ "net/url"
+ "regexp"
+ "strings"
+)
+
+var (
+ // regex to extract an arXiv identifier from various arXiv URLs.
+ // supports new (2109.05857) and old (math-ph/0301015) formats,
+ // each with an optional version suffix (e.g. v2).
+ arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+/[0-9]{7}(?:v[0-9]+)?|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`)
+
+ // regex to find a DOI in a string.
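+ // e.g. extracts "10.1234/abcd5678" from "https://doi.org/10.1234/abcd5678".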
+ doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`)
+)
+
+// Route determines the primary enrichment strategy for a URL.
+// It returns one of: "arxiv", "s2", or "rawhtml".
+func Route(urlStr string) string {
+ parsedURL, err := url.Parse(urlStr)
+ if err != nil {
+ return "rawhtml" // fallback if URL is unparseable
+ }
+
+ hostname := parsedURL.Hostname()
+
+ // 1. arXiv.org or arXiv ID pattern in URL
+ if hostname == "arxiv.org" || strings.Contains(urlStr, "arxiv.org/abs/") {
+ if _, err := getArxivIdentifier(urlStr); err == nil {
+ return "arxiv"
+ }
+ }
+
+ // 2. direct DOI link from doi.org
+ if hostname == "doi.org" {
+ return "s2"
+ }
+
+ // 3. DOI present in URL path (e.g. some publisher sites)
+ if doi := getDOI(urlStr); doi != "" {
+ return "s2"
+ }
+
+ // 4. fallback to rawhtml
+ return "rawhtml"
+}
+
+// routeArticle determines the route for an article and sets the Route field.
+func routeArticle(article *Article) {
+ article.Route = Route(article.URL)
+}
+
+func getArxivIdentifier(articleURL string) (string, error) {
+ matches := arxivIdentifierRegex.FindStringSubmatch(articleURL)
+ if len(matches) > 1 {
+ return matches[1], nil
+ }
+ return "", fmt.Errorf("no arXiv identifier found")
+}
+
+func getDOI(text string) string {
+ matches := doiRegex.FindStringSubmatch(text)
+ if len(matches) > 1 {
+ return matches[1]
+ }
+ return ""
+}
diff --git a/scholar.go b/scholar.go
new file mode 100644
index 0000000..ad1e5e0
--- /dev/null
+++ b/scholar.go
@@ -0,0 +1,217 @@
+// SEMANTIC SCHOLAR HANDLER
+//
+// Uses S2's Graph API to fetch paper metadata via DOI.
+//
+// STRATEGY:
+// - requires valid DOI in URL or DOI.org redirect
+// - batch API for efficiency (up to 500 papers per request)
+// - positional matching: response[i] maps to URLs[i]
+// - rate limited to one request per 100ms (an API key raises limits)
+//
+// AUTH:
+// - S2_API_KEY environment variable increases rate limits
+// - Without key: public limits apply
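+//
+// EXAMPLE (batch request body; DOIs are illustrative):
+//   {"ids": ["DOI:10.1234/abcd5678", "DOI:10.5678/test2"]}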
+package main
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "strings"
+)
+
+const (
+ semScholarPaperDOIFmtTitle = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title"
+ semScholarPaperDOIFmtFull = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title,abstract"
+ semScholarBatchURLTitle = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title"
+ semScholarBatchURLFull = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title,abstract"
+)
+
+// escapeDOI URL-encodes a DOI for safe use in API endpoint paths.
+// The first slash (separating prefix and suffix) stays literal; each side is
+// percent-encoded, so any further slashes are escaped.
+// Example: "10.1234/ab/cd" -> "10.1234/ab%2Fcd"
+func escapeDOI(doi string) string {
+ parts := strings.SplitN(doi, "/", 2)
+ if len(parts) != 2 {
+ return url.PathEscape(doi)
+ }
+ return url.PathEscape(parts[0]) + "/" + url.PathEscape(parts[1])
+}
+
+// S2BatchResponseItem represents a Semantic Scholar batch API response item
+type S2BatchResponseItem struct {
+ PaperID string `json:"paperId"`
+ Title string `json:"title"`
+ Abstract string `json:"abstract"`
+}
+
+// fetchSemanticScholar fetches content for a single DOI via Semantic Scholar.
+func fetchSemanticScholar(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+ doi := getDOI(urlStr)
+ if doi == "" {
+ return nil, fmt.Errorf("Semantic Scholar: URL doesn't contain valid DOI: %s", urlStr)
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitS2(ctx); err != nil {
+ return nil, err
+ }
+
+ escapedDOI := escapeDOI(doi)
+
+ // choose the appropriate URL based on whether we need content
+ var apiURL string
+ if config.WithContent {
+ apiURL = fmt.Sprintf(semScholarPaperDOIFmtFull, escapedDOI)
+ } else {
+ apiURL = fmt.Sprintf(semScholarPaperDOIFmtTitle, escapedDOI)
+ }
+
+ req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
+ if err != nil {
+ return nil, fmt.Errorf("failed to construct Semantic Scholar request: %w", err)
+ }
+ req.Header.Set("Accept", "application/json")
+ if config.S2APIKey != "" {
+ req.Header.Set("x-api-key", config.S2APIKey)
+ }
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("failed to make request to Semantic Scholar API: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("Semantic Scholar API returned non-200 status for DOI %s: %s", doi, resp.Status)
+ }
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("failed to read Semantic Scholar API response body: %w", err)
+ }
+
+ var s2 struct {
+ Title string `json:"title"`
+ Abstract string `json:"abstract"`
+ }
+ if err := json.Unmarshal(body, &s2); err != nil {
+ return nil, fmt.Errorf("failed to unmarshal Semantic Scholar JSON for DOI %s: %w", doi, err)
+ }
+
+ title := normalizeSpace(s2.Title)
+ content := normalizeSpace(s2.Abstract)
+
+ // drop content if it was not requested
+ if !config.WithContent {
+ content = ""
+ }
+
+ if title == "" {
+ return nil, fmt.Errorf("no title found for DOI %s", doi)
+ }
+
+ return &Article{
+ Title: title,
+ Content: content,
+ URL: urlStr,
+ }, nil
+}
+
+// fetchSemanticScholarBatch fetches a batch of papers from the S2 API.
+func fetchSemanticScholarBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) {
+ if len(urls) == 0 {
+ return nil, nil
+ }
+
+ // rate limit
+ if err := config.HTTP.RateLimitS2(ctx); err != nil {
+ return nil, err
+ }
+
+ // extract DOIs from URLs, maintaining order for pos matching
+ validURLs := make([]string, 0, len(urls))
+ s2IDs := make([]string, 0, len(urls))
+
+ for _, urlStr := range urls {
+ doi := getDOI(urlStr)
+ if doi != "" {
+ validURLs = append(validURLs, urlStr)
+ s2IDs = append(s2IDs, "DOI:"+doi)
+ }
+ }
+
+ if len(s2IDs) == 0 {
+ return nil, nil
+ }
+
+ requestBody, err := json.Marshal(map[string][]string{"ids": s2IDs})
+ if err != nil {
+ return nil, fmt.Errorf("failed to marshal S2 batch request body: %w", err)
+ }
+
+ // choose the appropriate URL based on whether we need content
+ var batchURL string
+ if config.WithContent {
+ batchURL = semScholarBatchURLFull
+ } else {
+ batchURL = semScholarBatchURLTitle
+ }
+
+ req, err := http.NewRequestWithContext(ctx, "POST", batchURL, bytes.NewReader(requestBody))
+ if err != nil {
+ return nil, fmt.Errorf("failed to create S2 batch request: %w", err)
+ }
+ req.Header.Set("Content-Type", "application/json")
+ if config.S2APIKey != "" {
+ req.Header.Set("x-api-key", config.S2APIKey)
+ }
+
+ resp, err := config.HTTP.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("S2 batch request failed: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("S2 batch API returned non-200 status: %s", resp.Status)
+ }
+
+ var responseItems []*S2BatchResponseItem
+ if err := json.NewDecoder(resp.Body).Decode(&responseItems); err != nil {
+ return nil, fmt.Errorf("failed to decode S2 batch response: %w", err)
+ }
+
+ var articles []*Article
+ // match responses positionally to input URLs
+ for i, item := range responseItems {
+ if i >= len(validURLs) {
+ break
+ }
+ if item == nil {
+ continue
+ }
+
+ title := normalizeSpace(item.Title)
+ if title != "" {
+ content := normalizeSpace(item.Abstract)
+
+ // skip content if not requested
+ if !config.WithContent {
+ content = ""
+ }
+
+ articles = append(articles, &Article{
+ Title: title,
+ Content: content,
+ URL: validURLs[i],
+ })
+ }
+ }
+
+ return articles, nil
+}
diff --git a/scholfetch_test.go b/scholfetch_test.go
new file mode 100644
index 0000000..59adae7
--- /dev/null
+++ b/scholfetch_test.go
@@ -0,0 +1,193 @@
+package main
+
+import (
+ "context"
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "testing"
+ "time"
+)
+
+type TestLogger struct {
+ messages []string
+}
+
+func (l *TestLogger) Printf(format string, v ...interface{}) {
+ l.messages = append(l.messages, fmt.Sprintf(format, v...))
+}
+
+func TestHTTPClientDefaults(t *testing.T) {
+ client := NewHTTPClient()
+
+ if client.userAgent != "scholfetch/1.0 (+https://samsci.com)" {
+ t.Errorf("Expected default user agent, got %s", client.userAgent)
+ }
+
+ if client.arxivDelay != 1*time.Second {
+ t.Errorf("Expected arxiv delay of 1s, got %v", client.arxivDelay)
+ }
+
+ if client.maxRetries != 3 {
+ t.Errorf("Expected max retries of 3, got %d", client.maxRetries)
+ }
+}
+
+func TestRateLimiting(t *testing.T) {
+ client := NewHTTPClient()
+ client.arxivDelay = 10 * time.Millisecond // Speed up test
+ client.s2Delay = 5 * time.Millisecond
+
+ // Test arxiv rate limiting
+ start := time.Now()
+ err := client.RateLimitArxiv(context.Background())
+ if err != nil {
+ t.Fatalf("RateLimitArxiv failed: %v", err)
+ }
+ duration := time.Since(start)
+ if duration < 10*time.Millisecond {
+ t.Errorf("Expected arxiv delay of ~10ms, got %v", duration)
+ }
+
+ // Test S2 rate limiting
+ start = time.Now()
+ err = client.RateLimitS2(context.Background())
+ if err != nil {
+ t.Fatalf("RateLimitS2 failed: %v", err)
+ }
+ duration = time.Since(start)
+ if duration < 5*time.Millisecond {
+ t.Errorf("Expected S2 delay of ~5ms, got %v", duration)
+ }
+}
+
+func TestHTTPRequest(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusOK)
+ _, _ = w.Write([]byte("test response"))
+ }))
+ defer server.Close()
+
+ client := NewHTTPClient()
+ req, _ := http.NewRequest("GET", server.URL, nil)
+
+ resp, err := client.Do(req)
+ if err != nil {
+ t.Fatalf("Request failed: %v", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != 200 {
+ t.Errorf("Expected status 200, got %d", resp.StatusCode)
+ }
+}
+
+func TestURLRouting(t *testing.T) {
+ tests := map[string]string{
+ "https://arxiv.org/abs/2301.00001": "arxiv",
+ "https://arxiv.org/pdf/2301.00001.pdf": "arxiv",
+ "http://arxiv.org/abs/2301.00001v2": "arxiv",
+ "https://api.semanticscholar.org/DOI:10.1234": "rawhtml",
+ "https://doi.org/10.1234/abcd5678": "s2",
+ "https://example.com/paper": "rawhtml",
+ "https://pubmed.ncbi.nlm.nih.gov/12345678/": "rawhtml",
+ }
+
+ for url, expected := range tests {
+ result := Route(url)
+ if result != expected {
+ t.Errorf("Route(%s) = %s, expected %s", url, result, expected)
+ }
+ }
+}
+
+func TestConfigDefaults(t *testing.T) {
+ config := NewConfig()
+
+ if config.WithContent != false {
+ t.Error("Expected WithContent=false by default")
+ }
+
+ if config.Verbose != false {
+ t.Error("Expected Verbose=false by default")
+ }
+
+ if config.ArxivBatch != 50 {
+ t.Errorf("Expected ArxivBatch=50, got %d", config.ArxivBatch)
+ }
+
+ if config.HTTP == nil {
+ t.Error("Expected HTTP client to be initialized")
+ }
+}
+
+func TestConfigWithLogger(t *testing.T) {
+ logger := &TestLogger{}
+ config := NewConfigWithLogger(logger)
+
+ if config.Logger != logger {
+ t.Error("Logger not set correctly")
+ }
+}
+
+func TestArxivURLParsing(t *testing.T) {
+ tests := map[string]string{
+ "https://arxiv.org/abs/2301.00001": "2301.00001",
+ "http://arxiv.org/abs/2301.00001v2": "2301.00001v2",
+ "https://arxiv.org/pdf/2301.00001.pdf": "2301.00001",
+ "https://example.com/not-arxiv": "",
+ }
+
+ for url, expected := range tests {
+ result, _ := getArxivIdentifier(url)
+ if result != expected {
+ t.Errorf("getArxivIdentifier(%s) = %s, expected %s", url, result, expected)
+ }
+ }
+}
+
+func TestDOIParsing(t *testing.T) {
+ tests := map[string]string{
+ "https://doi.org/10.1234/abcd5678": "10.1234/abcd5678",
+ "https://api.semanticscholar.org/DOI:10.1234": "",
+ "https://example.com/no-doi": "",
+ }
+
+ for url, expected := range tests {
+ result := getDOI(url)
+ if result != expected {
+ t.Errorf("getDOI(%s) = %s, expected %s", url, result, expected)
+ }
+ }
+}
+
+func TestBatchURLRouting(t *testing.T) {
+ urls := []string{
+ "https://arxiv.org/abs/2301.00001",
+ "https://doi.org/10.1234/test1",
+ "https://example.com/paper1",
+ "https://arxiv.org/pdf/2301.00002.pdf",
+ "https://doi.org/10.5678/test2",
+ }
+
+ routeCounts := make(map[string]int)
+ for _, url := range urls {
+ route := Route(url)
+ routeCounts[route]++
+ }
+
+ expected := map[string]int{
+ "arxiv": 2,
+ "s2": 2,
+ "rawhtml": 1,
+ }
+
+ for route, expectedCount := range expected {
+ if routeCounts[route] != expectedCount {
+ t.Errorf("Expected %d URLs for route %s, got %d",
+ expectedCount, route, routeCounts[route])
+ }
+ }
+}
\ No newline at end of file
diff --git a/util.go b/util.go
new file mode 100644
index 0000000..f7d94e5
--- /dev/null
+++ b/util.go
@@ -0,0 +1,14 @@
+package main
+
+import (
+ "regexp"
+ "strings"
+)
+
+// normalizeSpace collapses all whitespace runs in s to single spaces
+// and trims leading/trailing whitespace.
+func normalizeSpace(s string) string {
+ return strings.Join(strings.Fields(s), " ")
+}
+
+// arxivVersionRegex matches a trailing arXiv version suffix like "v2".
+// Compiled once at package init rather than on every call.
+var arxivVersionRegex = regexp.MustCompile(`v\d+$`)
+
+// stripArxivVersion removes the version suffix from an arXiv ID.
+func stripArxivVersion(id string) string {
+ return arxivVersionRegex.ReplaceAllString(id, "")
+}