aboutsummaryrefslogtreecommitdiff
path: root/routes.go
diff options
context:
space:
mode:
authorSam Scholten2025-12-15 19:35:46 +1000
committerSam Scholten2025-12-15 19:35:57 +1000
commit3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 (patch)
tree42b1f0e0a346a1cf087df90e29a100edbd66b3eb /routes.go
downloadscholfetch-main.tar.gz
scholfetch-main.zip
Init v0.1.0HEADmain
Diffstat (limited to 'routes.go')
-rw-r--r--routes.go75
1 files changed, 75 insertions, 0 deletions
diff --git a/routes.go b/routes.go
new file mode 100644
index 0000000..39bb7a9
--- /dev/null
+++ b/routes.go
@@ -0,0 +1,75 @@
+// ROUTING STRATEGY
+//
+// Routes URLs to the appropriate extraction handler. The order matters:
+// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API
+// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata
+// 3. rawhtml - fallback for direct publisher URLs, generic extraction
+package main
+
+import (
+ "fmt"
+ "net/url"
+ "regexp"
+ "strings"
+)
+
var (
	// arxivIdentifierRegex extracts an arXiv identifier from an arXiv URL
	// ("arxiv.org/abs/<id>", "arxiv.org/pdf/<id>[.pdf]") or from an
	// "arXiv:<id>" reference. Capture group 1 holds the identifier, including
	// its version suffix when one is present. Both identifier schemes are
	// supported:
	//   - new: 2109.05857, 2109.05857v2
	//   - old: math-ph/0301015, math.GT/0309136, hep-th/9901001v3
	arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+(?:\.[A-Z]{2})?/[0-9]{7}(?:v[0-9]+)?|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`)

	// doiRegex finds a DOI ("10.<registrant>/<suffix>") anywhere in a string.
	// Capture group 1 holds the DOI.
	doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`)
)

// Route determines the primary enrichment strategy for a URL or identifier
// string. It returns one of "arxiv", "s2", or "rawhtml".
//
// Checks run in priority order:
//  1. the input contains an arXiv identifier (see arxivIdentifierRegex): "arxiv"
//  2. the host is doi.org or one of its subdomains: "s2"
//  3. the input contains a DOI (see doiRegex): "s2"
//  4. anything else: "rawhtml"
//
// Input that url.Parse rejects is routed to "rawhtml".
func Route(urlStr string) string {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return "rawhtml" // fallback if URL is unparseable
	}

	// 1. arXiv. The regex already requires an "arxiv.org/abs/",
	// "arxiv.org/pdf/" or "arXiv:" prefix, so no separate host check is
	// needed; gating on an exact host would reject www./export. mirrors,
	// PDF links on those hosts, and bare "arXiv:<id>" references.
	if arxivIdentifierRegex.MatchString(urlStr) {
		return "arxiv"
	}

	// 2. direct DOI link from doi.org (host names are case-insensitive).
	host := strings.ToLower(parsedURL.Hostname())
	if host == "doi.org" || strings.HasSuffix(host, ".doi.org") {
		return "s2"
	}

	// 3. DOI present anywhere in the input (e.g. some publisher sites).
	if doiRegex.MatchString(urlStr) {
		return "s2"
	}

	// 4. fallback to rawhtml
	return "rawhtml"
}
+
+// routeArticle determines the route for an article and sets the Route field.
+func routeArticle(article *Article) {
+ article.Route = Route(article.URL)
+}
+
+func getArxivIdentifier(articleURL string) (string, error) {
+ matches := arxivIdentifierRegex.FindStringSubmatch(articleURL)
+ if len(matches) > 1 {
+ return matches[1], nil
+ }
+ return "", fmt.Errorf("no arXiv identifier found")
+}
+
+func getDOI(text string) string {
+ matches := doiRegex.FindStringSubmatch(text)
+ if len(matches) > 1 {
+ return matches[1]
+ }
+ return ""
+}