aboutsummaryrefslogtreecommitdiff
path: root/routes.go
blob: 39bb7a9196b435421afb843e217c3a0a4d0412fc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
// ROUTING STRATEGY
//
// Routes URLs to the appropriate extraction handler. The order matters:
// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API
// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata
// 3. rawhtml - fallback for direct publisher URLs, generic extraction
package main

import (
	"fmt"
	"net/url"
	"regexp"
	"strings"
)

var (
	// arxivIdentifierRegex extracts an arXiv identifier that follows one of
	// the prefixes "arxiv.org/abs/", "arxiv.org/pdf/" or "arXiv:".
	// Capture group 1 is the identifier, in either the old format
	// (math-ph/0301015) or the new format (2109.05857, optionally with a
	// version suffix such as v2). A trailing ".pdf" is consumed but not
	// captured. The pattern is case-sensitive.
	arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+/[0-9]{7}|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`)

	// doiRegex finds a DOI ("10.<4-9 digits>/<suffix>") anywhere in a string.
	// Capture group 1 is the whole DOI.
	doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`)
)

// Route determines the primary enrichment strategy for a URL.
//
// It returns one of the route strings "arxiv", "s2", or "rawhtml". Checks are
// applied in priority order:
//
//  1. "arxiv"   - the input points at arxiv.org (any subdomain, /abs/ or /pdf/
//     path, with or without a scheme) or is a bare "arXiv:<id>" string, and
//     arxivIdentifierRegex finds an identifier in it.
//  2. "s2"      - the host is doi.org or one of its subdomains (e.g. dx.doi.org).
//  3. "s2"      - doiRegex finds a DOI anywhere in the input.
//  4. "rawhtml" - everything else, including input url.Parse rejects.
//
// Surrounding whitespace is ignored and host names are compared
// case-insensitively.
func Route(urlStr string) string {
	// Pasted URLs often carry stray spaces or a trailing newline, which would
	// otherwise make url.Parse fail and force the rawhtml fallback.
	urlStr = strings.TrimSpace(urlStr)

	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return "rawhtml" // fallback if URL is unparseable
	}

	// URL.Hostname preserves the caller's casing; normalise before comparing.
	hostname := strings.ToLower(parsedURL.Hostname())

	// Signals that the input is meant to reference arXiv. The regex below is
	// still what decides whether an identifier is actually present.
	arxivHost := hostname == "arxiv.org" || strings.HasSuffix(hostname, ".arxiv.org")
	arxivPath := strings.Contains(urlStr, "arxiv.org/abs/") || strings.Contains(urlStr, "arxiv.org/pdf/")
	// A bare "arXiv:2109.05857" parses as an opaque URL whose scheme is
	// "arxiv" (url.Parse lower-cases the scheme).
	arxivBareID := parsedURL.Scheme == "arxiv"

	switch {
	case (arxivHost || arxivPath || arxivBareID) && arxivIdentifierRegex.MatchString(urlStr):
		// 1. arXiv URL or arXiv ID
		return "arxiv"
	case hostname == "doi.org" || strings.HasSuffix(hostname, ".doi.org"):
		// 2. direct DOI link; the host check also covers short DOIs, whose
		// path does not match doiRegex
		return "s2"
	case doiRegex.MatchString(urlStr):
		// 3. DOI present in URL path (e.g. some publisher sites)
		return "s2"
	}

	// 4. fallback to rawhtml
	return "rawhtml"
}

// routeArticle classifies article.URL with Route and stores the resulting
// route string in article.Route.
func routeArticle(article *Article) {
	selected := Route(article.URL)
	article.Route = selected
}

// getArxivIdentifier returns the first arXiv identifier that
// arxivIdentifierRegex captures in articleURL. It returns a non-nil error
// when the pattern does not match.
func getArxivIdentifier(articleURL string) (string, error) {
	groups := arxivIdentifierRegex.FindStringSubmatch(articleURL)
	// groups[0] is the whole match and groups[1] the captured identifier;
	// fewer than two entries means the pattern did not match.
	if len(groups) < 2 {
		return "", fmt.Errorf("no arXiv identifier found")
	}
	return groups[1], nil
}

// getDOI returns the first DOI that doiRegex captures in text, or the empty
// string when the pattern does not match.
func getDOI(text string) string {
	groups := doiRegex.FindStringSubmatch(text)
	// groups[0] is the whole match and groups[1] the captured DOI; fewer
	// than two entries means the pattern did not match.
	if len(groups) < 2 {
		return ""
	}
	return groups[1]
}