diff options
Diffstat (limited to 'routes.go')
| -rw-r--r-- | routes.go | 75 |
1 files changed, 75 insertions, 0 deletions
diff --git a/routes.go b/routes.go new file mode 100644 index 0000000..39bb7a9 --- /dev/null +++ b/routes.go @@ -0,0 +1,75 @@ +// ROUTING STRATEGY +// +// Routes URLs to the appropriate extraction handler. The order matters: +// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API +// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata +// 3. rawhtml - fallback for direct publisher URLs, generic extraction +package main + +import ( + "fmt" + "net/url" + "regexp" + "strings" +) + +var ( + // regex to extract arXiv identifier from various arXiv URLs. + // supports new (2109.05857) and old (math-ph/0301015) formats. + arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+/[0-9]{7}|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`) + + // regex to find a DOI in a string. + doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`) +) + +// route determines the primary enrichment strategy for a URL. +// returns the route str: "arxiv", "s2", or "rawhtml". +func Route(urlStr string) string { + parsedURL, err := url.Parse(urlStr) + if err != nil { + return "rawhtml" // fallback if URL is unparseable + } + + hostname := parsedURL.Hostname() + + // 1. arXiv.org or arXiv ID pattern in URL + if hostname == "arxiv.org" || strings.Contains(urlStr, "arxiv.org/abs/") { + if _, err := getArxivIdentifier(urlStr); err == nil { + return "arxiv" + } + } + + // 2. direct DOI link from doi.org + if hostname == "doi.org" { + return "s2" + } + + // 3. DOI present in URL path (e.g. some publisher sites) + if doi := getDOI(urlStr); doi != "" { + return "s2" + } + + // 4. fallback to rawhtml + return "rawhtml" +} + +// routeArticle determines the route for an article and sets the Route field. +func routeArticle(article *Article) { + article.Route = Route(article.URL) +} + +func getArxivIdentifier(articleURL string) (string, error) { + matches := arxivIdentifierRegex.FindStringSubmatch(articleURL) + if len(matches) > 1 { + return matches[1], nil + } + return "", fmt.Errorf("no arXiv identifier found") +} + +func getDOI(text string) string { + matches := doiRegex.FindStringSubmatch(text) + if len(matches) > 1 { + return matches[1] + } + return "" +} |
