about summary refs log tree commit diff
path: root/arxiv.go
diff options
context:
space:
mode:
author Sam Scholten 2025-12-15 19:35:46 +1000
committer Sam Scholten 2025-12-15 19:35:57 +1000
commit 3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 (patch)
tree 42b1f0e0a346a1cf087df90e29a100edbd66b3eb /arxiv.go
download scholfetch-3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2.tar.gz
scholfetch-3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2.zip
Init v0.1.0 HEAD main
Diffstat (limited to 'arxiv.go')
-rw-r--r-- arxiv.go 196
1 files changed, 196 insertions, 0 deletions
diff --git a/arxiv.go b/arxiv.go
new file mode 100644
index 0000000..6e7fad5
--- /dev/null
+++ b/arxiv.go
@@ -0,0 +1,196 @@
+// ARXIV HANDLER
+//
+// Uses arXiv's API to fetch article metadata.
+//
+// STRATEGY:
+// - single requests and batched requests supported
+// - uses gofeed to parse Atom XML responses
+// - rate limited to 1 request per second (conservative)
+// - handles both old (math-ph/0301015) and new (2109.05857) ID formats
+package main
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+
+ "github.com/mmcdole/gofeed"
+)
+
+const (
+ arxivQueryFmt = "http://export.arxiv.org/api/query?id_list=%s"
+)
+
+// fetchArxiv retrieves the metadata of one arXiv article through the arXiv API.
+//
+// urlStr is handed to getArxivIdentifier to obtain the article ID, the ID is
+// queried via arxivQueryFmt, and the first item of the returned feed is used.
+// The item description is kept as Content only when config.WithContent is
+// set. The returned Article carries urlStr unchanged.
+//
+// An error is returned when no ID can be extracted, when the rate limiter or
+// the HTTP round trip fails, when the API answers with a non-200 status, when
+// the feed cannot be read or parsed, when the feed has no items, or when the
+// item's normalized title is empty.
+func fetchArxiv(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+	id, idErr := getArxivIdentifier(urlStr)
+	if id == "" || idErr != nil {
+		return nil, fmt.Errorf("arXiv: invalid URL format, expected arxiv.org/abs/ID: %s", urlStr)
+	}
+
+	// Honor the arXiv rate limit before sending the request.
+	if limitErr := config.HTTP.RateLimitArxiv(ctx); limitErr != nil {
+		return nil, limitErr
+	}
+
+	request, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf(arxivQueryFmt, id), nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct arXiv request: %w", err)
+	}
+	request.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8")
+
+	response, err := config.HTTP.Do(request)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch arXiv feed for ID %s: %w", id, err)
+	}
+	defer response.Body.Close()
+
+	if response.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("arXiv API returned non-200 status for ID %s: %s", id, response.Status)
+	}
+
+	raw, err := io.ReadAll(response.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read arXiv response body for ID %s: %w", id, err)
+	}
+
+	feed, err := gofeed.NewParser().Parse(bytes.NewReader(raw))
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse arXiv feed for ID %s: %w", id, err)
+	}
+	if len(feed.Items) < 1 {
+		return nil, fmt.Errorf("no items found in arXiv feed for ID %s", id)
+	}
+
+	// Only the first feed item is considered for a single-ID lookup.
+	entry := feed.Items[0]
+	article := &Article{
+		Title:   normalizeSpace(entry.Title),
+		Content: normalizeSpace(entry.Description),
+		URL:     urlStr,
+	}
+
+	if config.Verbose {
+		config.Logger.Printf("arXiv single fetch result: ID=%s, Title=%s", id, article.Title)
+	}
+
+	// Drop the content again when the caller did not ask for it.
+	if !config.WithContent {
+		article.Content = ""
+	}
+
+	if article.Title == "" {
+		return nil, fmt.Errorf("no title found for arXiv ID %s", id)
+	}
+
+	return article, nil
+}
+
+// fetchArxivBatch fetches metadata for a list of arXiv URLs in batches.
+//
+// Each URL is reduced to an arXiv ID with getArxivIdentifier; URLs that yield
+// an error or an empty ID are skipped. The IDs are queried in chunks of
+// config.ArxivBatch, with config.HTTP.RateLimitArxiv called before every
+// request. Feed items are matched back to the caller's URL through their
+// version-stripped ID, so the result may be shorter than urls and follows the
+// order of the API responses rather than the order of the input.
+//
+// Fetching is best-effort: a chunk whose request, response or feed fails is
+// skipped (and reported through config.Logger when config.Verbose is set) and
+// processing continues with the next chunk. Only a non-positive
+// config.ArxivBatch or a rate limiter error aborts the call; in both cases the
+// returned slice is nil. It returns (nil, nil) when urls holds no usable
+// arXiv ID.
+func fetchArxivBatch(ctx context.Context, config *Config, urls []string) ([]*Article, error) {
+	if len(urls) == 0 {
+		return nil, nil
+	}
+
+	idToURL := make(map[string]string, len(urls))
+	batchIDs := make([]string, 0, len(urls))
+	for _, urlStr := range urls {
+		id, err := getArxivIdentifier(urlStr)
+		// Treat an empty ID like an error, as fetchArxiv does.
+		if err != nil || id == "" {
+			continue
+		}
+		batchIDs = append(batchIDs, id)
+		// Key by the version-less ID so feed items can be matched back to
+		// the original URL after stripping their version as well.
+		idToURL[stripArxivVersion(id)] = urlStr
+	}
+	if len(batchIDs) == 0 {
+		return nil, nil
+	}
+
+	// A step of zero or less would never advance the loop below and would
+	// keep issuing requests forever.
+	batchSize := config.ArxivBatch
+	if batchSize <= 0 {
+		return nil, fmt.Errorf("arXiv: invalid batch size %d, must be positive", batchSize)
+	}
+
+	var articles []*Article
+	for start := 0; start < len(batchIDs); start += batchSize {
+		end := start + batchSize
+		if end > len(batchIDs) {
+			end = len(batchIDs)
+		}
+
+		// rate limit
+		if err := config.HTTP.RateLimitArxiv(ctx); err != nil {
+			return nil, err
+		}
+
+		chunk := batchIDs[start:end]
+		feed, err := fetchArxivBatchChunk(ctx, config, chunk)
+		if err != nil {
+			// Best-effort: one failed chunk must not discard the others.
+			if config.Verbose {
+				config.Logger.Printf("arXiv batch fetch: skipping chunk of %d ID(s): %v", len(chunk), err)
+			}
+			continue
+		}
+
+		for _, item := range feed.Items {
+			id, err := getArxivIdentifier(item.GUID)
+			if err != nil || id == "" {
+				// Fall back to the item link when the GUID holds no usable ID.
+				id, err = getArxivIdentifier(item.Link)
+				if err != nil || id == "" {
+					continue
+				}
+			}
+
+			title := normalizeSpace(item.Title)
+			if title == "" {
+				continue
+			}
+
+			originalURL, ok := idToURL[stripArxivVersion(id)]
+			if !ok {
+				continue
+			}
+
+			content := ""
+			if config.WithContent {
+				content = normalizeSpace(item.Description)
+			}
+
+			articles = append(articles, &Article{
+				Title:   title,
+				Content: content,
+				URL:     originalURL,
+			})
+		}
+	}
+
+	return articles, nil
+}
+
+// fetchArxivBatchChunk requests the feed for one chunk of arXiv IDs and
+// parses the response. It returns an error when the request cannot be built
+// or sent, the status is not 200, or the body cannot be read or parsed.
+func fetchArxivBatchChunk(ctx context.Context, config *Config, ids []string) (*gofeed.Feed, error) {
+	// NOTE(review): max_results is set to the chunk size because the arXiv
+	// API pages its results with a documented default of 10 entries; without
+	// it, chunks larger than that risk being truncated. Confirm against the
+	// API manual.
+	apiURL := fmt.Sprintf(arxivQueryFmt+"&max_results=%d", strings.Join(ids, ","), len(ids))
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct arXiv request: %w", err)
+	}
+	req.Header.Set("Accept", "application/atom+xml, application/xml;q=0.9, */*;q=0.8")
+
+	resp, err := config.HTTP.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch arXiv feed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("arXiv API returned non-200 status: %s", resp.Status)
+	}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read arXiv response body: %w", err)
+	}
+
+	feed, err := gofeed.NewParser().Parse(bytes.NewReader(body))
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse arXiv feed: %w", err)
+	}
+	return feed, nil
+}