1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
// ROUTING STRATEGY
//
// Routes URLs to the appropriate extraction handler. The order matters:
// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API
// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata
// 3. rawhtml - fallback for direct publisher URLs, generic extraction
package main
import (
"fmt"
"net/url"
"regexp"
"strings"
)
var (
	// arxivIdentifierRegex extracts an arXiv identifier that follows an
	// "arxiv.org/abs/", "arxiv.org/pdf/", or "arXiv:" prefix. The identifier
	// is capture group 1; a trailing ".pdf" is matched but left out of it.
	// Accepted identifier forms:
	//   - new style with an optional version suffix: 2109.05857, 2109.05857v2
	//   - old style: math-ph/0301015, optionally with a two-letter subject
	//     class as in math.GT/0309136
	//
	// For old-style identifiers a version suffix is not part of the capture.
	arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+(?:\.[A-Z]{2})?/[0-9]{7}|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`)
	// doiRegex finds the first DOI-shaped substring ("10.", a 4-9 digit
	// registrant code, "/", then a suffix) anywhere in a string. The whole
	// DOI is capture group 1.
	doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`)
)
// Route determines the primary enrichment strategy for a URL.
//
// It returns one of "arxiv", "s2", or "rawhtml". Checks run in priority
// order and the first match wins:
//  1. "arxiv" when the input looks like an arXiv reference (host arxiv.org
//     or one of its subdomains such as www.arxiv.org or export.arxiv.org, an
//     "arxiv:" scheme as in "arXiv:2109.05857", or the substring
//     "arxiv.org/abs/") and getArxivIdentifier can extract an identifier.
//  2. "s2" when the host is doi.org or one of its subdomains.
//  3. "s2" when getDOI finds a DOI anywhere in the input.
//  4. "rawhtml" otherwise, including when the input cannot be parsed.
func Route(urlStr string) string {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return "rawhtml" // fallback if URL is unparseable
	}
	// Lowercase the host so the comparisons below do not depend on how the
	// caller happened to capitalize it.
	hostname := strings.ToLower(parsedURL.Hostname())

	// 1. arXiv host (including subdomains), bare "arXiv:<id>" identifier
	// (url.Parse reports its scheme lowercased as "arxiv"), or an arXiv abs
	// path embedded in the URL. The identifier must still be extractable
	// from the original string, otherwise fall through to the DOI checks.
	isArxivHost := hostname == "arxiv.org" || strings.HasSuffix(hostname, ".arxiv.org")
	isArxivID := parsedURL.Scheme == "arxiv"
	if isArxivHost || isArxivID || strings.Contains(urlStr, "arxiv.org/abs/") {
		if _, err := getArxivIdentifier(urlStr); err == nil {
			return "arxiv"
		}
	}

	// 2. direct DOI link from doi.org (or a subdomain such as dx.doi.org)
	if hostname == "doi.org" || strings.HasSuffix(hostname, ".doi.org") {
		return "s2"
	}

	// 3. DOI present in URL path (e.g. some publisher sites)
	if doi := getDOI(urlStr); doi != "" {
		return "s2"
	}

	// 4. fallback to rawhtml
	return "rawhtml"
}
// routeArticle computes the route for article.URL with Route and stores the
// result in article.Route, modifying the article in place.
func routeArticle(article *Article) {
	route := Route(article.URL)
	article.Route = route
}
// getArxivIdentifier returns the arXiv identifier found in articleURL, taken
// from the first capture group of arxivIdentifierRegex. It returns an empty
// string and a non-nil error when the regex yields no capture group.
func getArxivIdentifier(articleURL string) (string, error) {
	groups := arxivIdentifierRegex.FindStringSubmatch(articleURL)
	if len(groups) <= 1 {
		return "", fmt.Errorf("no arXiv identifier found")
	}
	// groups[0] is the full match including the prefix; groups[1] is the
	// identifier itself.
	return groups[1], nil
}
// getDOI returns the first DOI that doiRegex finds in text, or the empty
// string when text contains none.
func getDOI(text string) string {
	if groups := doiRegex.FindStringSubmatch(text); len(groups) > 1 {
		return groups[1]
	}
	return ""
}
|