From 3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 Mon Sep 17 00:00:00 2001
From: Sam Scholten
Date: Mon, 15 Dec 2025 19:35:46 +1000
Subject: Init v0.1.0

---
 .gitignore         |  10 ++
 README.md          |  83 ++++++++
 arxiv.go           | 196 ++++++++++++++++
 client.go          | 133 +++++++++++
 go.mod             |  18 ++
 go.sum             |  96 ++++++++
 html.go            | 198 ++++++++++++++++
 justfile           |  20 ++
 main.go            | 131 +++++++++++
 processor.go       | 295 ++++++++++++++++++++++++
 routes.go          |  75 +++++++
 scholar.go         | 217 +++++++++++++++++
 scholfetch_test.go | 193 ++++++++++++++++
 util.go            |  14 ++
 14 files changed, 1679 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 arxiv.go
 create mode 100644 client.go
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 html.go
 create mode 100644 justfile
 create mode 100644 main.go
 create mode 100644 processor.go
 create mode 100644 routes.go
 create mode 100644 scholar.go
 create mode 100644 scholfetch_test.go
 create mode 100644 util.go

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4a79922
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
# Built binaries
scholfetch

# Test files
test_urls.txt

# Environment and configuration files
.env*
config.*
secrets.*

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9190dc1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,83 @@
# ScholFetch

URL → Article metadata (JSONL) converter. Fetches titles only by default, for speed.

## Overview

ScholFetch extracts academic article metadata from URLs.
It supports arXiv, Semantic Scholar, and generic HTML sources.
The tool outputs structured JSONL suitable for downstream processing by ScholScan (see below).

## Usage

```bash
cat urls.txt | scholfetch > articles.jsonl
# or:
cat urls.txt | scholfetch --with-content > articles.jsonl
```

## Monitoring Progress

ScholFetch writes a structured log file, `scholfetch.log`, during processing. Monitor it in another terminal:

```bash
tail -f scholfetch.log
```

## Semantic Scholar API key

Get higher rate limits by setting your S2 API key (*not required*):

```bash
export S2_API_KEY="your-key-here"
cat urls.txt | scholfetch > articles.jsonl
```

Get your free key at: https://www.semanticscholar.org/product/api

ScholFetch will notify you on startup whether the key is detected.

## Integration with ScholScan

Once you have structured article data, pipe it to [ScholScan](https://git.samsci.com/scholscan) for ML-based filtering:

```bash
# Get articles from URLs
cat urls.txt | scholfetch > articles.jsonl

# Train a classification model
scholscan train articles.jsonl --rss-feeds feeds.txt > model.json

# Score articles from an RSS feed
scholscan scan --model model.json --url "https://example.com/feed.rss" > results.jsonl
```

ScholFetch extracts and enriches article metadata, while ScholScan handles classification. Together they form a complete pipeline for filtering academic literature.

## Input/Output

- Input: URLs (one per line) on stdin
- Output: JSONL with `title` and `url` fields (stdout)
- Add `--with-content` for a `content` field

## How it works

URLs are routed by pattern: arXiv IDs go to the arXiv API, DOIs to Semantic Scholar, and everything else to a generic HTML scrape.
Requests are batched in chunks of 50 for efficiency; if a batch fails, ScholFetch falls back to individual requests. Each API is rate limited separately.
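For illustration, each output record is one JSON object per line; in title-only mode a record looks like this (the title and URL below are placeholders, not real fetch results):

```json
{"title":"An Example Paper Title","url":"https://arxiv.org/abs/2301.00001"}
```

With `--with-content`, the same record also carries a `content` field holding the abstract or extracted page text.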
+ +## Code + +- `main.go` - reads stdin, sets up flags/output +- `routes.go` - determines which handler (arxiv/s2/html) for each URL +- `processor.go` - batching, fallback logic +- `arxiv.go`, `scholar.go`, `html.go` - the actual extractors +- `client.go` - HTTP client with retries and rate limiting + +## Build and Development + +```bash +just build +just test +``` + +## Roadmap + +Future work could integrate crossref, pubmed quite easily (especially for title-only approach). diff --git a/arxiv.go b/arxiv.go new file mode 100644 index 0000000..6e7fad5 --- /dev/null +++ b/arxiv.go @@ -0,0 +1,196 @@ +// ARXIV HANDLER +// +// Uses arXiv's API to fetch article metadata. +// +// STRATEGY: +// - single requests and batched requests supported +// - uses gofeed to parse Atom XML responses +// - rate limited to 1 request per second (conservative) +// - handles both old (math-ph/0301015) and new (2109.05857) ID formats +package main + +import ( + "bytes" + "context" + "fmt" + "io" + "net/http" + "strings" + + "github.com/mmcdole/gofeed" +) + +const ( + arxivQueryFmt = "http://export.arxiv.org/api/query?id_list=%s" +) + +// fetchArxiv fetches content for a single arXiv article. +func fetchArxiv(ctx context.Context, config *Config, urlStr string) (*Article, error) { + arxivID, err := getArxivIdentifier(urlStr) + if err != nil || arxivID == "" { + return nil, fmt.Errorf("arXiv: invalid URL format, expected arxiv.org/abs/ID: %s", urlStr) + } + + // rate limit + if err := config.HTTP.RateLimitArxiv(ctx); err != nil { + return nil, err + } + + apiURL := fmt.Sprintf(arxivQueryFmt, arxivID) + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to construct arXiv request: %w", err) + } + req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch arXiv feed for ID %s: %w", arxivID, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("arXiv API returned non-200 status for ID %s: %s", arxivID, resp.Status) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read arXiv response body for ID %s: %w", arxivID, err) + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(bytes.NewReader(body)) + if err != nil { + return nil, fmt.Errorf("failed to parse arXiv feed for ID %s: %w", arxivID, err) + } + + if len(feed.Items) == 0 { + return nil, fmt.Errorf("no items found in arXiv feed for ID %s", arxivID) + } + + item := feed.Items[0] + title := normalizeSpace(item.Title) + content := normalizeSpace(item.Description) + + if config.Verbose { + config.Logger.Printf("arXiv single fetch result: ID=%s, Title=%s", arxivID, title) + } + + // del content if not requested + if !config.WithContent { + content = "" + } + + if title == "" { + return nil, fmt.Errorf("no title found for arXiv ID %s", arxivID) + } + + return &Article{ + Title: title, + Content: content, + URL: urlStr, + }, nil +} + +// fetchArxivBatch fetches metadata for a list of arXiv URLs in batches. 
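//
// IDs are queried in chunks of config.ArxivBatch per request; version suffixes
// (e.g. "v2") are stripped so entries in the Atom response can be matched back
// to their original input URLs. A chunk that fails to fetch or parse is skipped
// rather than aborting the whole batch.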
+func fetchArxivBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { + if len(urls) == 0 { + return nil, nil + } + + idToURL := make(map[string]string) + batchIDs := make([]string, 0, len(urls)) + + for _, urlStr := range urls { + id, err := getArxivIdentifier(urlStr) + if err != nil { + continue + } + batchIDs = append(batchIDs, id) + stripped := stripArxivVersion(id) + idToURL[stripped] = urlStr + } + + if len(batchIDs) == 0 { + return nil, nil + } + + var articles []*Article + + for i := 0; i < len(batchIDs); i += config.ArxivBatch { + end := i + config.ArxivBatch + if end > len(batchIDs) { + end = len(batchIDs) + } + + // rate limit + if err := config.HTTP.RateLimitArxiv(ctx); err != nil { + return nil, err + } + + chunk := batchIDs[i:end] + apiURL := fmt.Sprintf(arxivQueryFmt, strings.Join(chunk, ",")) + + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + continue + } + req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") + + resp, err := config.HTTP.Do(req) + if err != nil { + continue + } + + if resp.StatusCode != http.StatusOK { + resp.Body.Close() + continue + } + + body, err := io.ReadAll(resp.Body) + resp.Body.Close() + if err != nil { + continue + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(bytes.NewReader(body)) + if err != nil { + continue + } + + for _, item := range feed.Items { + id, err := getArxivIdentifier(item.GUID) + if err != nil || id == "" { + id, err = getArxivIdentifier(item.Link) + if err != nil || id == "" { + continue + } + } + + title := normalizeSpace(item.Title) + if title == "" { + continue + } + + baseID := stripArxivVersion(id) + originalURL, exists := idToURL[baseID] + if !exists { + continue + } + + content := "" + if config.WithContent { + content = normalizeSpace(item.Description) + } + + articles = append(articles, &Article{ + Title: title, + Content: content, + URL: originalURL, + }) + } + } + + return articles, nil +} diff --git a/client.go b/client.go new file mode 100644 index 0000000..39a3e34 --- /dev/null +++ b/client.go @@ -0,0 +1,133 @@ +// CLIENT LAYER - HTTP AND RATE LIMITING +// +// manages HTTP requests with retry logic and API-specific rate limits. +// +// RATE LIMITS: +// - arXiv: 1 second between requests (enforced to be safe) +// - Semantic Scholar: 100ms between requests (configurable via API key) +// +// STRATEGY: +// - retries on network failures and HTTP 429 +// - exponential backoff: 1s, 2s, 4s +// - all delays respect context cancellation +package main + +import ( + "context" + "net/http" + "os" + "time" +) + +// HTTPClient wraps an HTTP client with common behavior like user agent, +// rate limiting, and retry logic. +type HTTPClient struct { + client *http.Client + userAgent string + arxivDelay time.Duration + s2Delay time.Duration + maxRetries int +} + +// NewHTTPClient creates a new HTTP client wrapper with defaults. +func NewHTTPClient() *HTTPClient { + return &HTTPClient{ + client: &http.Client{ + Timeout: 30 * time.Second, + }, + userAgent: "scholfetch/1.0 (+https://samsci.com)", + arxivDelay: 1 * time.Second, + s2Delay: 100 * time.Millisecond, + maxRetries: 3, + } +} + +// Do performs an HTTP request with retry logic. +// retries on network errors and 429 (rate limit) responses. 
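//
// Attempts are spaced with exponential backoff (1s, 2s, 4s), and the default
// User-Agent is applied when the request does not already set one.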
+func (c *HTTPClient) Do(req *http.Request) (*http.Response, error) { + // Set user agent if not already set + if req.Header.Get("User-Agent") == "" { + req.Header.Set("User-Agent", c.userAgent) + } + + var lastErr error + for attempt := 0; attempt < c.maxRetries; attempt++ { + if attempt > 0 { + // Exponential backoff: 1s, 2s, 4s + backoff := time.Duration(1< 50 { + break + } + } + + if content != "" { + article.Content = content + } + } + + return article, nil +} + +func extractTitleFromJSONLD(doc *goquery.Document) string { + var title string + doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { + if title != "" { + return + } + var data map[string]interface{} + if json.Unmarshal([]byte(s.Text()), &data) == nil { + t := getStringFromJSONLD(data, []string{"name", "headline", "title"}) + if t != "" { + title = normalizeSpace(t) + } + } + }) + return title +} + +func extractContentFromJSONLD(doc *goquery.Document) string { + var content string + doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { + if content != "" { + return + } + var data map[string]interface{} + if json.Unmarshal([]byte(s.Text()), &data) == nil { + d := getStringFromJSONLD(data, []string{"description", "abstract", "summary"}) + if d != "" { + content = normalizeSpace(d) + if len(content) > 5000 { + content = content[:5000] + "..." + } + } + } + }) + return content +} + +func extractTitleFromCitationMeta(doc *goquery.Document) string { + title, _ := doc.Find("meta[name='citation_title']").Attr("content") + return normalizeSpace(title) +} + +func extractContentFromCitationMeta(doc *goquery.Document) string { + content, _ := doc.Find("meta[name='citation_abstract']").Attr("content") + return normalizeSpace(content) +} + +func extractTitleFromOpenGraph(doc *goquery.Document) string { + title, _ := doc.Find("meta[property='og:title']").Attr("content") + return normalizeSpace(title) +} + +func extractContentFromOpenGraph(doc *goquery.Document) string { + content, _ := doc.Find("meta[property='og:description']").Attr("content") + return normalizeSpace(content) +} + +func extractContentFromBasicMeta(doc *goquery.Document) string { + contentRaw, _ := doc.Find("meta[name='description']").Attr("content") + content := normalizeSpace(contentRaw) + + // if meta description is too short, try to extract from the body + if len(content) < 100 { + selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"} + for _, selector := range selectors { + if contentText := extractCleanText(doc, selector); len(contentText) > len(content) { + content = contentText + break + } + } + } + + if len(content) > 5000 { + content = content[:5000] + } + + if len(content) < 50 { + return "" + } + + return content +} + +func extractCleanText(doc *goquery.Document, selector string) string { + element := doc.Find(selector).First() + if element.Length() == 0 { + return "" + } + element.Find("script, style, nav, header, footer, aside").Remove() + text := element.Text() + text = normalizeSpace(text) + if len(text) > 5000 { + text = text[:5000] + } + return text +} + +func getStringFromJSONLD(data map[string]interface{}, fields []string) string { + for _, field := range fields { + if val, ok := data[field].(string); ok && val != "" { + return val + } + } + return "" +} diff --git a/justfile b/justfile new file mode 100644 index 0000000..2e0f285 --- /dev/null +++ b/justfile @@ -0,0 +1,20 @@ +# ScholFetch - URL to article metadata converter + 
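# Typical workflow (see README): `just build`, then `cat urls.txt | ./scholfetch > articles.jsonl`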
+default: + @just --list + +# Build the binary +build: + go build -o scholfetch . + +# Run tests +test: + go test ./... + +# Format code +fmt: + go fmt ./... + +# Run linter (requires golangci-lint) +lint: + golangci-lint run diff --git a/main.go b/main.go new file mode 100644 index 0000000..1c286ea --- /dev/null +++ b/main.go @@ -0,0 +1,131 @@ +// scholfetch - URL to article converter for scholscan +// takes URLs on stdin, outputs Article structs on stdout (JSONL) +// logs everything to scholfetch.log +package main + +import ( + "bufio" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "os" + "strings" +) + +type Article struct { + Title string `json:"title"` + Content string `json:"content,omitempty"` // Optional - expensive to fetch + URL string `json:"url"` + Route string `json:"-"` // Internal: tracks which handler succeeded +} + +type Result struct { + Urls []string + FailureIndices []int + ArticlesWritten int + Errors int +} + +func main() { + var withContent bool + var verbose bool + + fs := flag.NewFlagSet("scholfetch", flag.ExitOnError) + fs.Usage = func() { + fmt.Fprintf(fs.Output(), `Usage: scholfetch [options] < urls.txt > articles.jsonl + +Converts URLs to Article JSONL format for scholscan processing. + +Default mode: Title-only extraction (fast) +Optional mode: Full content extraction with --with-content + +Input: Text file with one URL per line (stdin) +Output: Article JSONL (stdout) + +Options: +`) + fs.PrintDefaults() + fmt.Fprint(fs.Output(), ` +Examples: + # Title-only mode (default) + cat urls.txt | scholfetch > articles.jsonl + + # With full content + cat urls.txt | scholfetch --with-content > articles.jsonl + +Note: Set S2_API_KEY environment variable for higher Semantic Scholar rate limits. +`) + } + + fs.BoolVar(&withContent, "with-content", false, "Fetch full article content (slower)") + fs.BoolVar(&verbose, "verbose", false, "Show progress information") + + // validate args and exit early on err + if err := fs.Parse(os.Args[1:]); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if fs.NArg() > 0 { + fmt.Fprintf(os.Stderr, "Error: Unexpected arguments: %v\n", fs.Args()) + os.Exit(1) + } + + // set up logger + var logger *log.Logger + if verbose { + logger = log.New(os.Stderr, "", log.LstdFlags) + } else { + logger = log.New(io.Discard, "", 0) + } + + // config controls how URLs are handled and what data is extracted + config := NewConfigWithLogger(logger) + config.WithContent = withContent + config.Verbose = verbose + + urls := readURLs(os.Stdin) + + // notify user about S2 key found/not + if config.S2APIKey != "" { + fmt.Fprintln(os.Stderr, "Semantic Scholar API key detected: using authenticated rate limits.") + } else { + fmt.Fprintln(os.Stderr, "Semantic Scholar API key not set: using public rate limits.") + } + + // log file for processing info, sep from stderr to keep output clean + logFile, err := os.Create("scholfetch.log") + if err != nil { + fmt.Fprintf(os.Stderr, "Error: could not create log file: %v\n", err) + os.Exit(1) + } + defer logFile.Close() + + fmt.Fprintf(os.Stderr, "Processing %d URLs (content=%t)...\n", len(urls), withContent) + fmt.Fprintln(os.Stderr, "Monitor progress: tail -f scholfetch.log") + + encoder := json.NewEncoder(os.Stdout) + + // DO THE ACTUAL WORK + result := ProcessURLsWithConfig(urls, config, encoder, logFile) + + // report final stats to stderr + fmt.Fprintf(os.Stderr, "Finished: %d articles written, %d errors\n", result.ArticlesWritten, result.Errors) + fmt.Fprintln(os.Stderr, "See 
scholfetch.log for details") +} + +// readURLs parses stdin into a URL slice +// filters out empty lines and comments (#) +func readURLs(r io.Reader) []string { + var urls []string + scanner := bufio.NewScanner(r) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line != "" && !strings.HasPrefix(line, "#") { + urls = append(urls, line) + } + } + return urls +} diff --git a/processor.go b/processor.go new file mode 100644 index 0000000..1079e3d --- /dev/null +++ b/processor.go @@ -0,0 +1,295 @@ +// PROCESSING PIPELINE +// +// Handles batch processing of URLs with rate limiting and fallback strategies. +// +// DESIGN: +// - fixed chunk size (50) to balance API efficiency and error recovery +// - batching for arxiv/s2 APIs, individual fallback on batch failure +// - sep handlers for each route type (arxiv, s2, rawhtml) +// - JSONL logging of every attempt (success/failure) with timestamps +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "os" + "time" +) + +type ProcessResult struct { + ArticlesWritten int + Errors int +} + +type URLLogEntry struct { + Time string `json:"time"` + URL string `json:"url"` + Success int `json:"success"` + API string `json:"api"` + Error string `json:"error,omitempty"` +} + +func logArticleAttempt(logEncoder *json.Encoder, url, api string, err error) error { + success := 0 + errMsg := "" + if err == nil { + success = 1 + } else { + errMsg = err.Error() + } + return logEncoder.Encode(URLLogEntry{ + Time: time.Now().Format(time.RFC3339), + URL: url, + Success: success, + API: api, + Error: errMsg, + }) +} + +func logEncodingFailure(logEncoder *json.Encoder, url string, err error) error { + return logEncoder.Encode(URLLogEntry{ + Time: time.Now().Format(time.RFC3339), + URL: url, + Success: 0, + API: "", + Error: fmt.Sprintf("encoding error: %v", err), + }) +} + +// ProcessURLsWithConfig orchestrates the entire processing pipeline +// chunks URLs to balance API efficiency with error recovery +func ProcessURLsWithConfig(urls []string, config *Config, encoder *json.Encoder, logFile io.Writer) ProcessResult { + result := ProcessResult{} + ctx := context.Background() + logEncoder := json.NewEncoder(logFile) + + chunkSize := 50 + + processedCount := 0 + + // process URLs in chunks + for i := 0; i < len(urls); i += chunkSize { + end := i + chunkSize + if end > len(urls) { + end = len(urls) + } + + chunk := urls[i:end] + + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing chunk %d-%d of %d URLs", i+1, end, len(urls)) + } + + // do the work + chunkResult := processChunk(ctx, chunk, config, encoder, logEncoder) + + result.ArticlesWritten += chunkResult.ArticlesWritten + result.Errors += chunkResult.Errors + processedCount += len(chunk) + + if config.Verbose && config.Logger != nil { + fmt.Fprintf(os.Stderr, "Processed %d articles...\n", processedCount) + } + } + + return result +} + +// processChunk handles routing, batching, and fallback for a given chunk of URLs. +func processChunk(ctx context.Context, urls []string, config *Config, encoder *json.Encoder, logEncoder *json.Encoder) ProcessResult { + result := ProcessResult{} + + // create temporary articles for routing and processing + articles := make([]*Article, len(urls)) + for i, url := range urls { + articles[i] = &Article{URL: url} + } + + // 1. toute all articles in the chunk + for _, article := range articles { + routeArticle(article) + } + + // 2. 
group by type for batching + arxivURLs := []string{} + s2URLs := []string{} + htmlURLs := []string{} + + for _, article := range articles { + switch article.Route { + case "arxiv": + arxivURLs = append(arxivURLs, article.URL) + case "s2": + s2URLs = append(s2URLs, article.URL) + default: + htmlURLs = append(htmlURLs, article.URL) + } + } + + // 3. process each type (lim to chunk size) + if len(arxivURLs) > 0 { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing %d arXiv URLs in chunk", len(arxivURLs)) + } + n, err := processArxiv(ctx, arxivURLs, encoder, config, logEncoder) + result.ArticlesWritten += n + if err != nil { + result.Errors++ + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error processing arXiv URLs: %v", err) + } + } + } + + if len(s2URLs) > 0 { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing %d Semantic Scholar URLs in chunk", len(s2URLs)) + } + n, err := processSemanticScholar(ctx, s2URLs, encoder, config, logEncoder) + result.ArticlesWritten += n + if err != nil { + result.Errors++ + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error processing S2 URLs: %v", err) + } + } + } + + if len(htmlURLs) > 0 { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Processing %d raw HTML URLs in chunk", len(htmlURLs)) + } + n, err := processHTML(ctx, htmlURLs, encoder, config, logEncoder) + result.ArticlesWritten += n + if err != nil { + result.Errors++ + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error processing HTML URLs: %v", err) + } + } + } + + return result +} + +func processArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + articles, err := fetchArxivBatch(ctx, config, urls) + if err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("arXiv batch failed: %v, falling back to individual processing", err) + } + return processIndividualArxiv(ctx, urls, encoder, config, logEncoder) + } + + written := 0 + for _, article := range articles { + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, article.URL, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, article.URL, "arxiv", nil) + } + } + return written, nil +} + +func processSemanticScholar(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + articles, err := fetchSemanticScholarBatch(ctx, config, urls) + if err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("S2 batch failed: %v, falling back to individual processing", err) + } + return processIndividualS2(ctx, urls, encoder, config, logEncoder) + } + + written := 0 + for _, article := range articles { + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, article.URL, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, article.URL, "s2", nil) + } + } + return written, nil +} + +func processHTML(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + written := 0 + for _, url := range urls { + article, err := fetchRawHTML(ctx, config, url) + if err != nil { + _ = 
logArticleAttempt(logEncoder, url, "", err) + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error fetching HTML %s: %v", url, err) + } + continue + } + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, url, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, url, "html", nil) + } + } + return written, nil +} + +func processIndividualArxiv(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + written := 0 + for _, url := range urls { + article, err := fetchArxiv(ctx, config, url) + if err != nil { + _ = logArticleAttempt(logEncoder, url, "", err) + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error fetching arXiv %s: %v", url, err) + } + continue + } + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, url, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, url, "arxiv", nil) + } + } + return written, nil +} + +func processIndividualS2(ctx context.Context, urls []string, encoder *json.Encoder, config *Config, logEncoder *json.Encoder) (int, error) { + written := 0 + for _, url := range urls { + article, err := fetchSemanticScholar(ctx, config, url) + if err != nil { + _ = logArticleAttempt(logEncoder, url, "", err) + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error fetching S2 %s: %v", url, err) + } + continue + } + if err := encoder.Encode(article); err != nil { + if config.Verbose && config.Logger != nil { + config.Logger.Printf("Error encoding article: %v", err) + } + _ = logEncodingFailure(logEncoder, url, err) + } else { + written++ + _ = logArticleAttempt(logEncoder, url, "s2", nil) + } + } + return written, nil +} diff --git a/routes.go b/routes.go new file mode 100644 index 0000000..39bb7a9 --- /dev/null +++ b/routes.go @@ -0,0 +1,75 @@ +// ROUTING STRATEGY +// +// Routes URLs to the appropriate extraction handler. The order matters: +// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API +// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata +// 3. rawhtml - fallback for direct publisher URLs, generic extraction +package main + +import ( + "fmt" + "net/url" + "regexp" + "strings" +) + +var ( + // regex to extract arXiv identifier from various arXiv URLs. + // supports new (2109.05857) and old (math-ph/0301015) formats. + arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+/[0-9]{7}|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`) + + // regex to find a DOI in a string. + doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`) +) + +// route determines the primary enrichment strategy for a URL. +// returns the route str: "arxiv", "s2", or "rawhtml". +func Route(urlStr string) string { + parsedURL, err := url.Parse(urlStr) + if err != nil { + return "rawhtml" // fallback if URL is unparseable + } + + hostname := parsedURL.Hostname() + + // 1. arXiv.org or arXiv ID pattern in URL + if hostname == "arxiv.org" || strings.Contains(urlStr, "arxiv.org/abs/") { + if _, err := getArxivIdentifier(urlStr); err == nil { + return "arxiv" + } + } + + // 2. direct DOI link from doi.org + if hostname == "doi.org" { + return "s2" + } + + // 3. DOI present in URL path (e.g. 
some publisher sites) + if doi := getDOI(urlStr); doi != "" { + return "s2" + } + + // 4. fallback to rawhtml + return "rawhtml" +} + +// routeArticle determines the route for an article and sets the Route field. +func routeArticle(article *Article) { + article.Route = Route(article.URL) +} + +func getArxivIdentifier(articleURL string) (string, error) { + matches := arxivIdentifierRegex.FindStringSubmatch(articleURL) + if len(matches) > 1 { + return matches[1], nil + } + return "", fmt.Errorf("no arXiv identifier found") +} + +func getDOI(text string) string { + matches := doiRegex.FindStringSubmatch(text) + if len(matches) > 1 { + return matches[1] + } + return "" +} diff --git a/scholar.go b/scholar.go new file mode 100644 index 0000000..ad1e5e0 --- /dev/null +++ b/scholar.go @@ -0,0 +1,217 @@ +// SEMANTIC SCHOLAR HANDLER +// +// Uses S2's Graph API to fetch paper metadata via DOI. +// +// STRATEGY: +// - requires valid DOI in URL or DOI.org redirect +// - batch API for efficiency (up to 500 papers per request) +// - positional matching: response[i] maps to URLs[i] +// - rate limited to 100ms per request (configurable with API key) +// +// AUTH: +// - S2_API_KEY environment variable increases rate limits +// - Without key: public limits apply +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" +) + +const ( + semScholarPaperDOIFmtTitle = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title" + semScholarPaperDOIFmtFull = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title,abstract" + semScholarBatchURLTitle = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title" + semScholarBatchURLFull = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title,abstract" +) + +// escapeDOI URL-encodes a DOI for safe use in API endpoints. +// DOIs contain forward slashes which must be escaped for the URL path. +// Example: "10.1234/abcd5678" -> "10.1234/abcd5678" (already safe in this case) +func escapeDOI(doi string) string { + parts := strings.SplitN(doi, "/", 2) + if len(parts) != 2 { + return url.PathEscape(doi) + } + return url.PathEscape(parts[0]) + "/" + url.PathEscape(parts[1]) +} + +// S2BatchResponseItem represents a Semantic Scholar batch API response item +type S2BatchResponseItem struct { + PaperID string `json:"paperId"` + Title string `json:"title"` + Abstract string `json:"abstract"` +} + +// fetchSemanticScholar fetches content for a single DOI via Semantic Scholar. 
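//
// The DOI is extracted from the URL and path-escaped (keeping its slash), then
// queried with either the title-only or the title+abstract field set depending
// on config.WithContent; an x-api-key header is attached when an API key is
// configured.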
+func fetchSemanticScholar(ctx context.Context, config *Config, urlStr string) (*Article, error) { + doi := getDOI(urlStr) + if doi == "" { + return nil, fmt.Errorf("Semantic Scholar: URL doesn't contain valid DOI: %s", urlStr) + } + + // rate limit + if err := config.HTTP.RateLimitS2(ctx); err != nil { + return nil, err + } + + escapedDOI := escapeDOI(doi) + + // choose the appropriate URL based on whether we need content + var apiURL string + if config.WithContent { + apiURL = fmt.Sprintf(semScholarPaperDOIFmtFull, escapedDOI) + } else { + apiURL = fmt.Sprintf(semScholarPaperDOIFmtTitle, escapedDOI) + } + + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to construct Semantic Scholar request: %w", err) + } + req.Header.Set("Accept", "application/json") + if config.S2APIKey != "" { + req.Header.Set("x-api-key", config.S2APIKey) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to make request to Semantic Scholar API: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("Semantic Scholar API returned non-200 status for DOI %s: %s", doi, resp.Status) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read Semantic Scholar API response body: %w", err) + } + + var s2 struct { + Title string `json:"title"` + Abstract string `json:"abstract"` + } + if err := json.Unmarshal(body, &s2); err != nil { + return nil, fmt.Errorf("failed to unmarshal Semantic Scholar JSON for DOI %s: %w", doi, err) + } + + title := normalizeSpace(s2.Title) + content := normalizeSpace(s2.Abstract) + + // del content if not requested + if !config.WithContent { + content = "" + } + + if title == "" { + return nil, fmt.Errorf("no title found for DOI %s", doi) + } + + return &Article{ + Title: title, + Content: content, + URL: urlStr, + }, nil +} + +// fetchSemanticScholarBatch fetches a batch of papers from the S2 API. 
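//
// DOIs are collected from the input URLs and POSTed as {"ids": ["DOI:..."]};
// the response array is matched positionally to the URLs that yielded a valid
// DOI, and nil entries (papers S2 could not resolve) are skipped.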
+func fetchSemanticScholarBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { + if len(urls) == 0 { + return nil, nil + } + + // rate limit + if err := config.HTTP.RateLimitS2(ctx); err != nil { + return nil, err + } + + // extract DOIs from URLs, maintaining order for pos matching + validURLs := make([]string, 0, len(urls)) + s2IDs := make([]string, 0, len(urls)) + + for _, urlStr := range urls { + doi := getDOI(urlStr) + if doi != "" { + validURLs = append(validURLs, urlStr) + s2IDs = append(s2IDs, "DOI:"+doi) + } + } + + if len(s2IDs) == 0 { + return nil, nil + } + + requestBody, err := json.Marshal(map[string][]string{"ids": s2IDs}) + if err != nil { + return nil, fmt.Errorf("failed to marshal S2 batch request body: %w", err) + } + + // choose the appropriate URL based on whether we need content + var batchURL string + if config.WithContent { + batchURL = semScholarBatchURLFull + } else { + batchURL = semScholarBatchURLTitle + } + + req, err := http.NewRequestWithContext(ctx, "POST", batchURL, bytes.NewReader(requestBody)) + if err != nil { + return nil, fmt.Errorf("failed to create S2 batch request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + if config.S2APIKey != "" { + req.Header.Set("x-api-key", config.S2APIKey) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("S2 batch request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("S2 batch API returned non-200 status: %s", resp.Status) + } + + var responseItems []*S2BatchResponseItem + if err := json.NewDecoder(resp.Body).Decode(&responseItems); err != nil { + return nil, fmt.Errorf("failed to decode S2 batch response: %w", err) + } + + var articles []*Article + // match responses positionally to input URLs + for i, item := range responseItems { + if i >= len(validURLs) { + break + } + if item == nil { + continue + } + + title := normalizeSpace(item.Title) + if title != "" { + content := normalizeSpace(item.Abstract) + + // skip content if not requested + if !config.WithContent { + content = "" + } + + articles = append(articles, &Article{ + Title: title, + Content: content, + URL: validURLs[i], + }) + } + } + + return articles, nil +} diff --git a/scholfetch_test.go b/scholfetch_test.go new file mode 100644 index 0000000..59adae7 --- /dev/null +++ b/scholfetch_test.go @@ -0,0 +1,193 @@ +package main + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +type TestLogger struct { + messages []string +} + +func (l *TestLogger) Printf(format string, v ...interface{}) { + l.messages = append(l.messages, fmt.Sprintf(format, v...)) +} + +func TestHTTPClientDefaults(t *testing.T) { + client := NewHTTPClient() + + if client.userAgent != "scholfetch/1.0 (+https://samsci.com)" { + t.Errorf("Expected default user agent, got %s", client.userAgent) + } + + if client.arxivDelay != 1*time.Second { + t.Errorf("Expected arxiv delay of 1s, got %v", client.arxivDelay) + } + + if client.maxRetries != 3 { + t.Errorf("Expected max retries of 3, got %d", client.maxRetries) + } +} + +func TestRateLimiting(t *testing.T) { + client := NewHTTPClient() + client.arxivDelay = 10 * time.Millisecond // Speed up test + client.s2Delay = 5 * time.Millisecond + + // Test arxiv rate limiting + start := time.Now() + err := client.RateLimitArxiv(context.Background()) + if err != nil { + t.Fatalf("RateLimitArxiv failed: %v", err) + } + duration := time.Since(start) + if duration < 
10*time.Millisecond { + t.Errorf("Expected arxiv delay of ~10ms, got %v", duration) + } + + // Test S2 rate limiting + start = time.Now() + err = client.RateLimitS2(context.Background()) + if err != nil { + t.Fatalf("RateLimitS2 failed: %v", err) + } + duration = time.Since(start) + if duration < 5*time.Millisecond { + t.Errorf("Expected S2 delay of ~5ms, got %v", duration) + } +} + +func TestHTTPIPRequest(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("test response")) + })) + defer server.Close() + + client := NewHTTPClient() + req, _ := http.NewRequest("GET", server.URL, nil) + + resp, err := client.Do(req) + if err != nil { + t.Fatalf("Request failed: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + t.Errorf("Expected status 200, got %d", resp.StatusCode) + } +} + +func TestURLRouting(t *testing.T) { + tests := map[string]string{ + "https://arxiv.org/abs/2301.00001": "arxiv", + "https://arxiv.org/pdf/2301.00001.pdf": "arxiv", + "http://arxiv.org/abs/2301.00001v2": "arxiv", + "https://api.semanticscholar.org/DOI:10.1234": "rawhtml", + "https://doi.org/10.1234/abcd5678": "s2", + "https://example.com/paper": "rawhtml", + "https://pubmed.ncbi.nlm.nih.gov/12345678/": "rawhtml", + } + + for url, expected := range tests { + result := Route(url) + if result != expected { + t.Errorf("Route(%s) = %s, expected %s", url, result, expected) + } + } +} + +func TestConfigDefaults(t *testing.T) { + config := NewConfig() + + if config.WithContent != false { + t.Error("Expected WithContent=false by default") + } + + if config.Verbose != false { + t.Error("Expected Verbose=false by default") + } + + if config.ArxivBatch != 50 { + t.Errorf("Expected ArxivBatch=50, got %d", config.ArxivBatch) + } + + if config.HTTP == nil { + t.Error("Expected HTTP client to be initialized") + } +} + +func TestConfigWithLogger(t *testing.T) { + logger := &TestLogger{} + config := NewConfigWithLogger(logger) + + if config.Logger != logger { + t.Error("Logger not set correctly") + } +} + +func TestArxivURLParsing(t *testing.T) { + tests := map[string]string{ + "https://arxiv.org/abs/2301.00001": "2301.00001", + "http://arxiv.org/abs/2301.00001v2": "2301.00001v2", + "https://arxiv.org/pdf/2301.00001.pdf": "2301.00001", + "https://example.com/not-arxiv": "", + } + + for url, expected := range tests { + result, _ := getArxivIdentifier(url) + if result != expected { + t.Errorf("getArxivIdentifier(%s) = %s, expected %s", url, result, expected) + } + } +} + +func TestDOIParsing(t *testing.T) { + tests := map[string]string{ + "https://doi.org/10.1234/abcd5678": "10.1234/abcd5678", + "https://api.semanticscholar.org/DOI:10.1234": "", + "https://example.com/no-doi": "", + } + + for url, expected := range tests { + result := getDOI(url) + if result == expected { + t.Logf("✓ getDOI(%s) = %s", url, result) + } else { + t.Errorf("getDOI(%s) = %s, expected %s", url, result, expected) + } + } +} + +func TestBatchURLRouting(t *testing.T) { + urls := []string{ + "https://arxiv.org/abs/2301.00001", + "https://doi.org/10.1234/test1", + "https://example.com/paper1", + "https://arxiv.org/pdf/2301.00002.pdf", + "https://doi.org/10.5678/test2", + } + + routeCounts := make(map[string]int) + for _, url := range urls { + route := Route(url) + routeCounts[route]++ + } + + expected := map[string]int{ + "arxiv": 2, + "s2": 2, + "rawhtml": 1, + } + + for route, expectedCount := range expected { + if 
routeCounts[route] != expectedCount {
			t.Errorf("Expected %d URLs for route %s, got %d",
				expectedCount, route, routeCounts[route])
		}
	}
}
\ No newline at end of file
diff --git a/util.go b/util.go
new file mode 100644
index 0000000..f7d94e5
--- /dev/null
+++ b/util.go
@@ -0,0 +1,14 @@
package main

import (
	"regexp"
	"strings"
)

// normalizeSpace collapses runs of whitespace (including newlines) into single
// spaces and trims leading/trailing whitespace.
func normalizeSpace(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

// stripArxivVersion removes a trailing version suffix (e.g. "v2") from an arXiv
// identifier so it can be matched against version-less IDs.
func stripArxivVersion(id string) string {
	return regexp.MustCompile(`v\d+$`).ReplaceAllString(id, "")
}
--
cgit v1.2.3