about | summary | refs | log | tree | commit | diff
path: root/scholar.go
diff options
context:
space:
mode:
Diffstat (limited to 'scholar.go')
-rw-r--r-- scholar.go 217
1 files changed, 217 insertions, 0 deletions
diff --git a/scholar.go b/scholar.go
new file mode 100644
index 0000000..ad1e5e0
--- /dev/null
+++ b/scholar.go
@@ -0,0 +1,217 @@
+// SEMANTIC SCHOLAR HANDLER
+//
+// Uses S2's Graph API to fetch paper metadata via DOI.
+//
+// STRATEGY:
+// - requires valid DOI in URL or DOI.org redirect
+// - batch API for efficiency (up to 500 papers per request)
+// - positional matching: response[i] maps to the i-th URL that yielded a DOI (URLs without a DOI are dropped before the request)
+// - rate limited to 100ms per request (configurable with API key)
+//
+// AUTH:
+// - S2_API_KEY environment variable increases rate limits
+// - Without key: public limits apply
+package main
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "strings"
+)
+
+// Semantic Scholar Graph API endpoints.
+//
+// The two "Fmt" values are fmt.Sprintf templates whose single %s verb takes an
+// already path-escaped DOI. The two "BatchURL" values are complete URLs for
+// the batch endpoint. "Title" variants request only the title field; "Full"
+// variants request title and abstract.
+const (
+	semScholarPaperBase = "https://api.semanticscholar.org/graph/v1/paper"
+
+	semScholarPaperDOIFmtTitle = semScholarPaperBase + "/DOI:%s?fields=title"
+	semScholarPaperDOIFmtFull  = semScholarPaperBase + "/DOI:%s?fields=title,abstract"
+	semScholarBatchURLTitle    = semScholarPaperBase + "/batch?fields=title"
+	semScholarBatchURLFull     = semScholarPaperBase + "/batch?fields=title,abstract"
+)
+
+// escapeDOI percent-encodes a DOI for use inside a URL path while keeping the
+// separator between the DOI prefix and suffix as a literal "/".
+//
+// The DOI is split at its first "/" only. Each half goes through
+// url.PathEscape, so any further "/" inside the suffix becomes "%2F", and the
+// halves are rejoined with a plain "/". A string containing no "/" is escaped
+// as a whole.
+//
+// Examples:
+//
+//	"10.1234/abcd5678" -> "10.1234/abcd5678" (nothing needs escaping)
+//	"10.1000/a/b c"    -> "10.1000/a%2Fb%20c"
+func escapeDOI(doi string) string {
+	sep := strings.IndexByte(doi, '/')
+	if sep < 0 {
+		return url.PathEscape(doi)
+	}
+	prefix, suffix := doi[:sep], doi[sep+1:]
+	return url.PathEscape(prefix) + "/" + url.PathEscape(suffix)
+}
+
+// S2BatchResponseItem is one element of the JSON array returned by the
+// Semantic Scholar batch endpoint.
+//
+// fetchSemanticScholarBatch decodes the response body into a slice of
+// pointers to this type; a nil element in that slice is skipped there.
+type S2BatchResponseItem struct {
+ // PaperID is filled from the "paperId" key. The fetch functions in this
+ // file decode it but never read it.
+ PaperID string `json:"paperId"`
+ // Title is filled from the "title" key; an item whose title is empty
+ // after normalizeSpace produces no Article.
+ Title string `json:"title"`
+ // Abstract is filled from the "abstract" key and becomes Article.Content
+ // when config.WithContent is set.
+ Abstract string `json:"abstract"`
+}
+
+// fetchSemanticScholar looks up one paper on Semantic Scholar using the DOI
+// found in urlStr and returns it as an Article.
+//
+// Steps, in order: extract the DOI with getDOI (an empty result is an error),
+// wait on config.HTTP.RateLimitS2, then GET the single-paper endpoint. The
+// request asks for title and abstract when config.WithContent is set and for
+// the title only otherwise; the x-api-key header is sent when
+// config.S2APIKey is non-empty.
+//
+// An error is returned when the URL has no DOI, the rate limiter fails, the
+// request cannot be built or sent, the status is not 200, the body cannot be
+// read or parsed, or the normalized title is empty. On success Article.URL is
+// the original urlStr and Article.Content is the normalized abstract, or ""
+// when config.WithContent is false.
+func fetchSemanticScholar(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+	doi := getDOI(urlStr)
+	if doi == "" {
+		return nil, fmt.Errorf("Semantic Scholar: URL doesn't contain valid DOI: %s", urlStr)
+	}
+
+	// The limiter is consulted only after the DOI check, so URLs that can
+	// never produce a request do not wait on it.
+	if err := config.HTTP.RateLimitS2(ctx); err != nil {
+		return nil, err
+	}
+
+	// Start from the title-only endpoint and widen it when content is wanted.
+	endpointFmt := semScholarPaperDOIFmtTitle
+	if config.WithContent {
+		endpointFmt = semScholarPaperDOIFmtFull
+	}
+	endpoint := fmt.Sprintf(endpointFmt, escapeDOI(doi))
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct Semantic Scholar request: %w", err)
+	}
+	req.Header.Set("Accept", "application/json")
+	if key := config.S2APIKey; key != "" {
+		req.Header.Set("x-api-key", key)
+	}
+
+	resp, err := config.HTTP.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to make request to Semantic Scholar API: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("Semantic Scholar API returned non-200 status for DOI %s: %s", doi, resp.Status)
+	}
+
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read Semantic Scholar API response body: %w", err)
+	}
+
+	// Only the two fields this handler uses are decoded.
+	var paper struct {
+		Title    string `json:"title"`
+		Abstract string `json:"abstract"`
+	}
+	if err := json.Unmarshal(raw, &paper); err != nil {
+		return nil, fmt.Errorf("failed to unmarshal Semantic Scholar JSON for DOI %s: %w", doi, err)
+	}
+
+	title, abstract := normalizeSpace(paper.Title), normalizeSpace(paper.Abstract)
+	if !config.WithContent {
+		// Content was not requested, so never hand any back.
+		abstract = ""
+	}
+
+	if title == "" {
+		return nil, fmt.Errorf("no title found for DOI %s", doi)
+	}
+
+	article := &Article{
+		Title:   title,
+		Content: abstract,
+		URL:     urlStr,
+	}
+	return article, nil
+}
+
+// fetchSemanticScholarBatch fetches several papers from the Semantic Scholar
+// batch endpoint in a single POST request.
+//
+// URLs for which getDOI returns "" are dropped before the request is built;
+// the remaining DOIs are sent as "DOI:<doi>" ids in their original order.
+// Response element i is paired with the i-th URL that had a DOI.
+//
+// Returns nil, nil when urls is empty or none of them carries a DOI; in that
+// case no request is sent and the rate limiter is not consulted. Otherwise it
+// returns one Article per response element that is non-nil and has a
+// non-empty normalized title. The returned slice can therefore be shorter
+// than urls and is NOT index-aligned with it; use Article.URL to relate
+// results to inputs. Article.Content is the normalized abstract, or "" when
+// config.WithContent is false.
+//
+// An error is returned when the rate limiter fails, the request cannot be
+// built or sent, the status is not 200, or the body is not valid JSON.
+//
+// NOTE(review): the file header mentions a cap of 500 papers per request;
+// this function does not split urls, so the caller presumably chunks them.
+// Confirm.
+func fetchSemanticScholarBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) {
+	if len(urls) == 0 {
+		return nil, nil
+	}
+
+	// extract DOIs from URLs, maintaining order for positional matching
+	validURLs := make([]string, 0, len(urls))
+	s2IDs := make([]string, 0, len(urls))
+
+	for _, urlStr := range urls {
+		doi := getDOI(urlStr)
+		if doi != "" {
+			validURLs = append(validURLs, urlStr)
+			s2IDs = append(s2IDs, "DOI:"+doi)
+		}
+	}
+
+	if len(s2IDs) == 0 {
+		return nil, nil
+	}
+
+	// Wait on the limiter only once a request is certain, as
+	// fetchSemanticScholar does; a batch with no DOIs must not wait on it.
+	if err := config.HTTP.RateLimitS2(ctx); err != nil {
+		return nil, err
+	}
+
+	requestBody, err := json.Marshal(map[string][]string{"ids": s2IDs})
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal S2 batch request body: %w", err)
+	}
+
+	// choose the appropriate URL based on whether we need content
+	batchURL := semScholarBatchURLTitle
+	if config.WithContent {
+		batchURL = semScholarBatchURLFull
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, batchURL, bytes.NewReader(requestBody))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create S2 batch request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	if config.S2APIKey != "" {
+		req.Header.Set("x-api-key", config.S2APIKey)
+	}
+
+	resp, err := config.HTTP.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("S2 batch request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("S2 batch API returned non-200 status: %s", resp.Status)
+	}
+
+	var responseItems []*S2BatchResponseItem
+	if err := json.NewDecoder(resp.Body).Decode(&responseItems); err != nil {
+		return nil, fmt.Errorf("failed to decode S2 batch response: %w", err)
+	}
+
+	var articles []*Article
+	// Match responses positionally to the DOI-bearing URLs.
+	// NOTE(review): this assumes the API answers with one element per
+	// submitted id, in request order. Confirm against the API docs.
+	for i, item := range responseItems {
+		if i >= len(validURLs) {
+			// More elements than ids were sent: nothing left to pair with.
+			break
+		}
+		if item == nil {
+			// A JSON null element decodes to a nil pointer; it still
+			// occupies its position.
+			continue
+		}
+
+		title := normalizeSpace(item.Title)
+		if title == "" {
+			continue
+		}
+
+		content := normalizeSpace(item.Abstract)
+		// skip content if not requested
+		if !config.WithContent {
+			content = ""
+		}
+
+		articles = append(articles, &Article{
+			Title:   title,
+			Content: content,
+			URL:     validURLs[i],
+		})
+	}
+
+	return articles, nil
+}