From 3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 Mon Sep 17 00:00:00 2001 From: Sam Scholten Date: Mon, 15 Dec 2025 19:35:46 +1000 Subject: Init v0.1.0 --- scholar.go | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 scholar.go (limited to 'scholar.go') diff --git a/scholar.go b/scholar.go new file mode 100644 index 0000000..ad1e5e0 --- /dev/null +++ b/scholar.go @@ -0,0 +1,217 @@ +// SEMANTIC SCHOLAR HANDLER +// +// Uses S2's Graph API to fetch paper metadata via DOI. +// +// STRATEGY: +// - requires valid DOI in URL or DOI.org redirect +// - batch API for efficiency (up to 500 papers per request) +// - positional matching: response[i] maps to URLs[i] +// - rate limited to 100ms per request (configurable with API key) +// +// AUTH: +// - S2_API_KEY environment variable increases rate limits +// - Without key: public limits apply +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strings" +) + +const ( + semScholarPaperDOIFmtTitle = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title" + semScholarPaperDOIFmtFull = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title,abstract" + semScholarBatchURLTitle = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title" + semScholarBatchURLFull = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title,abstract" +) + +// escapeDOI URL-encodes a DOI for safe use in API endpoints. +// DOIs contain forward slashes which must be escaped for the URL path. +// Example: "10.1234/abcd5678" -> "10.1234/abcd5678" (already safe in this case) +func escapeDOI(doi string) string { + parts := strings.SplitN(doi, "/", 2) + if len(parts) != 2 { + return url.PathEscape(doi) + } + return url.PathEscape(parts[0]) + "/" + url.PathEscape(parts[1]) +} + +// S2BatchResponseItem represents a Semantic Scholar batch API response item +type S2BatchResponseItem struct { + PaperID string `json:"paperId"` + Title string `json:"title"` + Abstract string `json:"abstract"` +} + +// fetchSemanticScholar fetches content for a single DOI via Semantic Scholar. +func fetchSemanticScholar(ctx context.Context, config *Config, urlStr string) (*Article, error) { + doi := getDOI(urlStr) + if doi == "" { + return nil, fmt.Errorf("Semantic Scholar: URL doesn't contain valid DOI: %s", urlStr) + } + + // rate limit + if err := config.HTTP.RateLimitS2(ctx); err != nil { + return nil, err + } + + escapedDOI := escapeDOI(doi) + + // choose the appropriate URL based on whether we need content + var apiURL string + if config.WithContent { + apiURL = fmt.Sprintf(semScholarPaperDOIFmtFull, escapedDOI) + } else { + apiURL = fmt.Sprintf(semScholarPaperDOIFmtTitle, escapedDOI) + } + + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to construct Semantic Scholar request: %w", err) + } + req.Header.Set("Accept", "application/json") + if config.S2APIKey != "" { + req.Header.Set("x-api-key", config.S2APIKey) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to make request to Semantic Scholar API: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("Semantic Scholar API returned non-200 status for DOI %s: %s", doi, resp.Status) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read Semantic Scholar API response body: %w", err) + } + + var s2 struct { + Title string `json:"title"` + Abstract string `json:"abstract"` + } + if err := json.Unmarshal(body, &s2); err != nil { + return nil, fmt.Errorf("failed to unmarshal Semantic Scholar JSON for DOI %s: %w", doi, err) + } + + title := normalizeSpace(s2.Title) + content := normalizeSpace(s2.Abstract) + + // del content if not requested + if !config.WithContent { + content = "" + } + + if title == "" { + return nil, fmt.Errorf("no title found for DOI %s", doi) + } + + return &Article{ + Title: title, + Content: content, + URL: urlStr, + }, nil +} + +// fetchSemanticScholarBatch fetches a batch of papers from the S2 API. +func fetchSemanticScholarBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { + if len(urls) == 0 { + return nil, nil + } + + // rate limit + if err := config.HTTP.RateLimitS2(ctx); err != nil { + return nil, err + } + + // extract DOIs from URLs, maintaining order for pos matching + validURLs := make([]string, 0, len(urls)) + s2IDs := make([]string, 0, len(urls)) + + for _, urlStr := range urls { + doi := getDOI(urlStr) + if doi != "" { + validURLs = append(validURLs, urlStr) + s2IDs = append(s2IDs, "DOI:"+doi) + } + } + + if len(s2IDs) == 0 { + return nil, nil + } + + requestBody, err := json.Marshal(map[string][]string{"ids": s2IDs}) + if err != nil { + return nil, fmt.Errorf("failed to marshal S2 batch request body: %w", err) + } + + // choose the appropriate URL based on whether we need content + var batchURL string + if config.WithContent { + batchURL = semScholarBatchURLFull + } else { + batchURL = semScholarBatchURLTitle + } + + req, err := http.NewRequestWithContext(ctx, "POST", batchURL, bytes.NewReader(requestBody)) + if err != nil { + return nil, fmt.Errorf("failed to create S2 batch request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + if config.S2APIKey != "" { + req.Header.Set("x-api-key", config.S2APIKey) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("S2 batch request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("S2 batch API returned non-200 status: %s", resp.Status) + } + + var responseItems []*S2BatchResponseItem + if err := json.NewDecoder(resp.Body).Decode(&responseItems); err != nil { + return nil, fmt.Errorf("failed to decode S2 batch response: %w", err) + } + + var articles []*Article + // match responses positionally to input URLs + for i, item := range responseItems { + if i >= len(validURLs) { + break + } + if item == nil { + continue + } + + title := normalizeSpace(item.Title) + if title != "" { + content := normalizeSpace(item.Abstract) + + // skip content if not requested + if !config.WithContent { + content = "" + } + + articles = append(articles, &Article{ + Title: title, + Content: content, + URL: validURLs[i], + }) + } + } + + return articles, nil +} -- cgit v1.2.3