// SEMANTIC SCHOLAR HANDLER // // Uses S2's Graph API to fetch paper metadata via DOI. // // STRATEGY: // - requires valid DOI in URL or DOI.org redirect // - batch API for efficiency (up to 500 papers per request) // - positional matching: response[i] maps to URLs[i] // - rate limited to 100ms per request (configurable with API key) // // AUTH: // - S2_API_KEY environment variable increases rate limits // - Without key: public limits apply package main import ( "bytes" "context" "encoding/json" "fmt" "io" "net/http" "net/url" "strings" ) const ( semScholarPaperDOIFmtTitle = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title" semScholarPaperDOIFmtFull = "https://api.semanticscholar.org/graph/v1/paper/DOI:%s?fields=title,abstract" semScholarBatchURLTitle = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title" semScholarBatchURLFull = "https://api.semanticscholar.org/graph/v1/paper/batch?fields=title,abstract" ) // escapeDOI URL-encodes a DOI for safe use in API endpoints. // DOIs contain forward slashes which must be escaped for the URL path. // Example: "10.1234/abcd5678" -> "10.1234/abcd5678" (already safe in this case) func escapeDOI(doi string) string { parts := strings.SplitN(doi, "/", 2) if len(parts) != 2 { return url.PathEscape(doi) } return url.PathEscape(parts[0]) + "/" + url.PathEscape(parts[1]) } // S2BatchResponseItem represents a Semantic Scholar batch API response item type S2BatchResponseItem struct { PaperID string `json:"paperId"` Title string `json:"title"` Abstract string `json:"abstract"` } // fetchSemanticScholar fetches content for a single DOI via Semantic Scholar. func fetchSemanticScholar(ctx context.Context, config *Config, urlStr string) (*Article, error) { doi := getDOI(urlStr) if doi == "" { return nil, fmt.Errorf("Semantic Scholar: URL doesn't contain valid DOI: %s", urlStr) } // rate limit if err := config.HTTP.RateLimitS2(ctx); err != nil { return nil, err } escapedDOI := escapeDOI(doi) // choose the appropriate URL based on whether we need content var apiURL string if config.WithContent { apiURL = fmt.Sprintf(semScholarPaperDOIFmtFull, escapedDOI) } else { apiURL = fmt.Sprintf(semScholarPaperDOIFmtTitle, escapedDOI) } req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) if err != nil { return nil, fmt.Errorf("failed to construct Semantic Scholar request: %w", err) } req.Header.Set("Accept", "application/json") if config.S2APIKey != "" { req.Header.Set("x-api-key", config.S2APIKey) } resp, err := config.HTTP.Do(req) if err != nil { return nil, fmt.Errorf("failed to make request to Semantic Scholar API: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("Semantic Scholar API returned non-200 status for DOI %s: %s", doi, resp.Status) } body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read Semantic Scholar API response body: %w", err) } var s2 struct { Title string `json:"title"` Abstract string `json:"abstract"` } if err := json.Unmarshal(body, &s2); err != nil { return nil, fmt.Errorf("failed to unmarshal Semantic Scholar JSON for DOI %s: %w", doi, err) } title := normalizeSpace(s2.Title) content := normalizeSpace(s2.Abstract) // del content if not requested if !config.WithContent { content = "" } if title == "" { return nil, fmt.Errorf("no title found for DOI %s", doi) } return &Article{ Title: title, Content: content, URL: urlStr, }, nil } // fetchSemanticScholarBatch fetches a batch of papers from the S2 API. func fetchSemanticScholarBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { if len(urls) == 0 { return nil, nil } // rate limit if err := config.HTTP.RateLimitS2(ctx); err != nil { return nil, err } // extract DOIs from URLs, maintaining order for pos matching validURLs := make([]string, 0, len(urls)) s2IDs := make([]string, 0, len(urls)) for _, urlStr := range urls { doi := getDOI(urlStr) if doi != "" { validURLs = append(validURLs, urlStr) s2IDs = append(s2IDs, "DOI:"+doi) } } if len(s2IDs) == 0 { return nil, nil } requestBody, err := json.Marshal(map[string][]string{"ids": s2IDs}) if err != nil { return nil, fmt.Errorf("failed to marshal S2 batch request body: %w", err) } // choose the appropriate URL based on whether we need content var batchURL string if config.WithContent { batchURL = semScholarBatchURLFull } else { batchURL = semScholarBatchURLTitle } req, err := http.NewRequestWithContext(ctx, "POST", batchURL, bytes.NewReader(requestBody)) if err != nil { return nil, fmt.Errorf("failed to create S2 batch request: %w", err) } req.Header.Set("Content-Type", "application/json") if config.S2APIKey != "" { req.Header.Set("x-api-key", config.S2APIKey) } resp, err := config.HTTP.Do(req) if err != nil { return nil, fmt.Errorf("S2 batch request failed: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("S2 batch API returned non-200 status: %s", resp.Status) } var responseItems []*S2BatchResponseItem if err := json.NewDecoder(resp.Body).Decode(&responseItems); err != nil { return nil, fmt.Errorf("failed to decode S2 batch response: %w", err) } var articles []*Article // match responses positionally to input URLs for i, item := range responseItems { if i >= len(validURLs) { break } if item == nil { continue } title := normalizeSpace(item.Title) if title != "" { content := normalizeSpace(item.Abstract) // skip content if not requested if !config.WithContent { content = "" } articles = append(articles, &Article{ Title: title, Content: content, URL: validURLs[i], }) } } return articles, nil }