// ARXIV HANDLER // // Uses arXiv's API to fetch article metadata. // // STRATEGY: // - single requests and batched requests supported // - uses gofeed to parse Atom XML responses // - rate limited to 1 request per second (conservative) // - handles both old (math-ph/0301015) and new (2109.05857) ID formats package main import ( "bytes" "context" "fmt" "io" "net/http" "strings" "github.com/mmcdole/gofeed" ) const ( arxivQueryFmt = "http://export.arxiv.org/api/query?id_list=%s" ) // fetchArxiv fetches content for a single arXiv article. func fetchArxiv(ctx context.Context, config *Config, urlStr string) (*Article, error) { arxivID, err := getArxivIdentifier(urlStr) if err != nil || arxivID == "" { return nil, fmt.Errorf("arXiv: invalid URL format, expected arxiv.org/abs/ID: %s", urlStr) } // rate limit if err := config.HTTP.RateLimitArxiv(ctx); err != nil { return nil, err } apiURL := fmt.Sprintf(arxivQueryFmt, arxivID) req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) if err != nil { return nil, fmt.Errorf("failed to construct arXiv request: %w", err) } req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") resp, err := config.HTTP.Do(req) if err != nil { return nil, fmt.Errorf("failed to fetch arXiv feed for ID %s: %w", arxivID, err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("arXiv API returned non-200 status for ID %s: %s", arxivID, resp.Status) } body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read arXiv response body for ID %s: %w", arxivID, err) } fp := gofeed.NewParser() feed, err := fp.Parse(bytes.NewReader(body)) if err != nil { return nil, fmt.Errorf("failed to parse arXiv feed for ID %s: %w", arxivID, err) } if len(feed.Items) == 0 { return nil, fmt.Errorf("no items found in arXiv feed for ID %s", arxivID) } item := feed.Items[0] title := normalizeSpace(item.Title) content := normalizeSpace(item.Description) if config.Verbose { config.Logger.Printf("arXiv single fetch result: ID=%s, Title=%s", arxivID, title) } // del content if not requested if !config.WithContent { content = "" } if title == "" { return nil, fmt.Errorf("no title found for arXiv ID %s", arxivID) } return &Article{ Title: title, Content: content, URL: urlStr, }, nil } // fetchArxivBatch fetches metadata for a list of arXiv URLs in batches. func fetchArxivBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { if len(urls) == 0 { return nil, nil } idToURL := make(map[string]string) batchIDs := make([]string, 0, len(urls)) for _, urlStr := range urls { id, err := getArxivIdentifier(urlStr) if err != nil { continue } batchIDs = append(batchIDs, id) stripped := stripArxivVersion(id) idToURL[stripped] = urlStr } if len(batchIDs) == 0 { return nil, nil } var articles []*Article for i := 0; i < len(batchIDs); i += config.ArxivBatch { end := i + config.ArxivBatch if end > len(batchIDs) { end = len(batchIDs) } // rate limit if err := config.HTTP.RateLimitArxiv(ctx); err != nil { return nil, err } chunk := batchIDs[i:end] apiURL := fmt.Sprintf(arxivQueryFmt, strings.Join(chunk, ",")) req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) if err != nil { continue } req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") resp, err := config.HTTP.Do(req) if err != nil { continue } if resp.StatusCode != http.StatusOK { resp.Body.Close() continue } body, err := io.ReadAll(resp.Body) resp.Body.Close() if err != nil { continue } fp := gofeed.NewParser() feed, err := fp.Parse(bytes.NewReader(body)) if err != nil { continue } for _, item := range feed.Items { id, err := getArxivIdentifier(item.GUID) if err != nil || id == "" { id, err = getArxivIdentifier(item.Link) if err != nil || id == "" { continue } } title := normalizeSpace(item.Title) if title == "" { continue } baseID := stripArxivVersion(id) originalURL, exists := idToURL[baseID] if !exists { continue } content := "" if config.WithContent { content = normalizeSpace(item.Description) } articles = append(articles, &Article{ Title: title, Content: content, URL: originalURL, }) } } return articles, nil }