From 3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 Mon Sep 17 00:00:00 2001 From: Sam Scholten Date: Mon, 15 Dec 2025 19:35:46 +1000 Subject: Init v0.1.0 --- arxiv.go | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 arxiv.go (limited to 'arxiv.go') diff --git a/arxiv.go b/arxiv.go new file mode 100644 index 0000000..6e7fad5 --- /dev/null +++ b/arxiv.go @@ -0,0 +1,196 @@ +// ARXIV HANDLER +// +// Uses arXiv's API to fetch article metadata. +// +// STRATEGY: +// - single requests and batched requests supported +// - uses gofeed to parse Atom XML responses +// - rate limited to 1 request per second (conservative) +// - handles both old (math-ph/0301015) and new (2109.05857) ID formats +package main + +import ( + "bytes" + "context" + "fmt" + "io" + "net/http" + "strings" + + "github.com/mmcdole/gofeed" +) + +const ( + arxivQueryFmt = "http://export.arxiv.org/api/query?id_list=%s" +) + +// fetchArxiv fetches content for a single arXiv article. +func fetchArxiv(ctx context.Context, config *Config, urlStr string) (*Article, error) { + arxivID, err := getArxivIdentifier(urlStr) + if err != nil || arxivID == "" { + return nil, fmt.Errorf("arXiv: invalid URL format, expected arxiv.org/abs/ID: %s", urlStr) + } + + // rate limit + if err := config.HTTP.RateLimitArxiv(ctx); err != nil { + return nil, err + } + + apiURL := fmt.Sprintf(arxivQueryFmt, arxivID) + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to construct arXiv request: %w", err) + } + req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch arXiv feed for ID %s: %w", arxivID, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("arXiv API returned non-200 status for ID %s: %s", arxivID, resp.Status) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read arXiv response body for ID %s: %w", arxivID, err) + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(bytes.NewReader(body)) + if err != nil { + return nil, fmt.Errorf("failed to parse arXiv feed for ID %s: %w", arxivID, err) + } + + if len(feed.Items) == 0 { + return nil, fmt.Errorf("no items found in arXiv feed for ID %s", arxivID) + } + + item := feed.Items[0] + title := normalizeSpace(item.Title) + content := normalizeSpace(item.Description) + + if config.Verbose { + config.Logger.Printf("arXiv single fetch result: ID=%s, Title=%s", arxivID, title) + } + + // del content if not requested + if !config.WithContent { + content = "" + } + + if title == "" { + return nil, fmt.Errorf("no title found for arXiv ID %s", arxivID) + } + + return &Article{ + Title: title, + Content: content, + URL: urlStr, + }, nil +} + +// fetchArxivBatch fetches metadata for a list of arXiv URLs in batches. +func fetchArxivBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) { + if len(urls) == 0 { + return nil, nil + } + + idToURL := make(map[string]string) + batchIDs := make([]string, 0, len(urls)) + + for _, urlStr := range urls { + id, err := getArxivIdentifier(urlStr) + if err != nil { + continue + } + batchIDs = append(batchIDs, id) + stripped := stripArxivVersion(id) + idToURL[stripped] = urlStr + } + + if len(batchIDs) == 0 { + return nil, nil + } + + var articles []*Article + + for i := 0; i < len(batchIDs); i += config.ArxivBatch { + end := i + config.ArxivBatch + if end > len(batchIDs) { + end = len(batchIDs) + } + + // rate limit + if err := config.HTTP.RateLimitArxiv(ctx); err != nil { + return nil, err + } + + chunk := batchIDs[i:end] + apiURL := fmt.Sprintf(arxivQueryFmt, strings.Join(chunk, ",")) + + req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil) + if err != nil { + continue + } + req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8") + + resp, err := config.HTTP.Do(req) + if err != nil { + continue + } + + if resp.StatusCode != http.StatusOK { + resp.Body.Close() + continue + } + + body, err := io.ReadAll(resp.Body) + resp.Body.Close() + if err != nil { + continue + } + + fp := gofeed.NewParser() + feed, err := fp.Parse(bytes.NewReader(body)) + if err != nil { + continue + } + + for _, item := range feed.Items { + id, err := getArxivIdentifier(item.GUID) + if err != nil || id == "" { + id, err = getArxivIdentifier(item.Link) + if err != nil || id == "" { + continue + } + } + + title := normalizeSpace(item.Title) + if title == "" { + continue + } + + baseID := stripArxivVersion(id) + originalURL, exists := idToURL[baseID] + if !exists { + continue + } + + content := "" + if config.WithContent { + content = normalizeSpace(item.Description) + } + + articles = append(articles, &Article{ + Title: title, + Content: content, + URL: originalURL, + }) + } + } + + return articles, nil +} -- cgit v1.2.3