// ROUTING STRATEGY // // Routes URLs to the appropriate extraction handler. The order matters: // 1. arxiv - direct arXiv URLs and IDs, use specialized arXiv API // 2. s2 - Semantic Scholar for DOI-based sources, richer metadata // 3. rawhtml - fallback for direct publisher URLs, generic extraction package main import ( "fmt" "net/url" "regexp" "strings" ) var ( // regex to extract arXiv identifier from various arXiv URLs. // supports new (2109.05857) and old (math-ph/0301015) formats. arxivIdentifierRegex = regexp.MustCompile(`(?:arxiv\.org/(?:abs|pdf)/|arXiv:)([a-z-]+/[0-9]{7}|[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?)(?:\.pdf)?`) // regex to find a DOI in a string. doiRegex = regexp.MustCompile(`(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)`) ) // route determines the primary enrichment strategy for a URL. // returns the route str: "arxiv", "s2", or "rawhtml". func Route(urlStr string) string { parsedURL, err := url.Parse(urlStr) if err != nil { return "rawhtml" // fallback if URL is unparseable } hostname := parsedURL.Hostname() // 1. arXiv.org or arXiv ID pattern in URL if hostname == "arxiv.org" || strings.Contains(urlStr, "arxiv.org/abs/") { if _, err := getArxivIdentifier(urlStr); err == nil { return "arxiv" } } // 2. direct DOI link from doi.org if hostname == "doi.org" { return "s2" } // 3. DOI present in URL path (e.g. some publisher sites) if doi := getDOI(urlStr); doi != "" { return "s2" } // 4. fallback to rawhtml return "rawhtml" } // routeArticle determines the route for an article and sets the Route field. func routeArticle(article *Article) { article.Route = Route(article.URL) } func getArxivIdentifier(articleURL string) (string, error) { matches := arxivIdentifierRegex.FindStringSubmatch(articleURL) if len(matches) > 1 { return matches[1], nil } return "", fmt.Errorf("no arXiv identifier found") } func getDOI(text string) string { matches := doiRegex.FindStringSubmatch(text) if len(matches) > 1 { return matches[1] } return "" }