1 files changed, 75 insertions, 0 deletions
diff --git a/routes.go b/routes.go
new file mode 100644
index 0000000..39bb7a9
--- /dev/null
+++ b/routes.go
@@ -0,0 +1,75 @@
+// ROUTING STRATEGY
+//
+// Routes URLs to the appropriate extraction handler. The order matters:
+// 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API
+// 2. s2 - Semantic Scholar for DOI-based sources, richer metadata
+// 3. rawhtml - fallback for direct publisher URLs, generic extraction
+package main
+
+import (
+	"fmt"
+	"net/url"
+	"regexp"
+	"strings"
+)
+
+var (
+	// regex to extract an arXiv identifier from arXiv abs/pdf URLs or "arXiv:<id>" text (prefix is case-insensitive).
+	// supports new (2109.05857) and old (math-ph/0301015, math.GT/0309136) formats, each with an optional version (v2).
+	arxivIdentifierRegex = regexp.MustCompile(`(?:(?i:arxiv\.org)/(?:abs|pdf)/|(?i:arxiv):)((?:[a-z-]+(?:\.[A-Z]{2})?/[0-9]{7}|[0-9]{4}\.[0-9]{4,5})(?:v[0-9]+)?)(?:\.pdf)?`)
+
+	// regex to find a DOI in a string.
+	doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`)
+)
+
+// Route determines the primary enrichment strategy for a URL and returns
+// the route name: "arxiv", "s2", or "rawhtml". Checks run in priority order.
+// The identifier checks are regex matches on the raw string, so they still
+// run when urlStr fails to parse; only the hostname check needs a parsed URL.
+func Route(urlStr string) string {
+	// 1. arXiv identifier. getArxivIdentifier only matches after an
+	// "arxiv.org/abs/", "arxiv.org/pdf/" or "arXiv:" prefix, so no extra
+	// hostname gate is needed; gating on an exact "arxiv.org" host would
+	// reject www.arxiv.org PDF links and bare "arXiv:<id>" inputs.
+	if _, err := getArxivIdentifier(urlStr); err == nil {
+		return "arxiv"
+	}
+
+	// 2. direct DOI resolver link. Hostnames are case-insensitive, and
+	// dx.doi.org is the legacy name of the doi.org resolver.
+	if parsedURL, err := url.Parse(urlStr); err == nil {
+		host := strings.TrimPrefix(strings.ToLower(parsedURL.Hostname()), "www.")
+		if host == "doi.org" || host == "dx.doi.org" {
+			return "s2"
+		}
+	}
+
+	// 3. DOI present in the URL (e.g. some publisher sites)
+	if getDOI(urlStr) != "" {
+		return "s2"
+	}
+
+	// 4. fallback to rawhtml
+	return "rawhtml"
+}
+
+// routeArticle computes Route(article.URL) and stores the result in article.Route.
+func routeArticle(article *Article) {
+	article.Route = Route(article.URL)
+}
+
+// getArxivIdentifier returns the arXiv ID found in articleURL, or an error if none.
+func getArxivIdentifier(articleURL string) (string, error) {
+	if m := arxivIdentifierRegex.FindStringSubmatch(articleURL); m != nil {
+		return m[1], nil // group 1 is the identifier without the URL prefix
+	}
+	return "", fmt.Errorf("no arXiv identifier found")
+}
+
+// getDOI returns the first DOI found in text, or "" when there is none.
+func getDOI(text string) string {
+	if m := doiRegex.FindStringSubmatch(text); m != nil {
+		return m[1] // capture group 1 spans the whole DOI
+	}
+	return ""
+}