// RAW HTML HANDLER // // Fallback handler for URLs that don't match arXiv or Semantic Scholar patterns. // // STRATEGY: // - progressive extraction tries multiple metadata sources in order // - JSON-LD structured data first (highest quality) // - citation meta tags (scholarly articles) // - open Graph (social media) // - basic HTML tags (last resort) package main import ( "context" "encoding/json" "fmt" "net/http" "github.com/PuerkitoBio/goquery" ) // fetchRawHTML attempts to fetch article content by parsing HTML metadata. func fetchRawHTML(ctx context.Context, config *Config, urlStr string) (*Article, error) { req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) if err != nil { return nil, fmt.Errorf("failed to build request for %s: %w", urlStr, err) } resp, err := config.HTTP.Do(req) if err != nil { return nil, fmt.Errorf("failed to fetch %s: %w", urlStr, err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("HTTP %d when fetching %s", resp.StatusCode, urlStr) } doc, err := goquery.NewDocumentFromReader(resp.Body) if err != nil { return nil, fmt.Errorf("failed to parse HTML from %s: %w", urlStr, err) } // extract title using various strategies strategies := []func(*goquery.Document) string{ extractTitleFromJSONLD, extractTitleFromCitationMeta, extractTitleFromOpenGraph, func(d *goquery.Document) string { return normalizeSpace(d.Find("title").First().Text()) }, } var title string for _, strategy := range strategies { title = strategy(doc) if title != "" { break } } if title == "" { return nil, fmt.Errorf("no title found for %s", urlStr) } article := &Article{ URL: urlStr, Title: title, } // only fetch content if requested if config.WithContent { contentStrategies := []func(*goquery.Document) string{ extractContentFromJSONLD, extractContentFromCitationMeta, extractContentFromOpenGraph, extractContentFromBasicMeta, } var content string for _, strategy := range contentStrategies { content = strategy(doc) if content != "" && len(content) > 50 { break } } if content != "" { article.Content = content } } return article, nil } func extractTitleFromJSONLD(doc *goquery.Document) string { var title string doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { if title != "" { return } var data map[string]interface{} if json.Unmarshal([]byte(s.Text()), &data) == nil { t := getStringFromJSONLD(data, []string{"name", "headline", "title"}) if t != "" { title = normalizeSpace(t) } } }) return title } func extractContentFromJSONLD(doc *goquery.Document) string { var content string doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { if content != "" { return } var data map[string]interface{} if json.Unmarshal([]byte(s.Text()), &data) == nil { d := getStringFromJSONLD(data, []string{"description", "abstract", "summary"}) if d != "" { content = normalizeSpace(d) if len(content) > 5000 { content = content[:5000] + "..." } } } }) return content } func extractTitleFromCitationMeta(doc *goquery.Document) string { title, _ := doc.Find("meta[name='citation_title']").Attr("content") return normalizeSpace(title) } func extractContentFromCitationMeta(doc *goquery.Document) string { content, _ := doc.Find("meta[name='citation_abstract']").Attr("content") return normalizeSpace(content) } func extractTitleFromOpenGraph(doc *goquery.Document) string { title, _ := doc.Find("meta[property='og:title']").Attr("content") return normalizeSpace(title) } func extractContentFromOpenGraph(doc *goquery.Document) string { content, _ := doc.Find("meta[property='og:description']").Attr("content") return normalizeSpace(content) } func extractContentFromBasicMeta(doc *goquery.Document) string { contentRaw, _ := doc.Find("meta[name='description']").Attr("content") content := normalizeSpace(contentRaw) // if meta description is too short, try to extract from the body if len(content) < 100 { selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"} for _, selector := range selectors { if contentText := extractCleanText(doc, selector); len(contentText) > len(content) { content = contentText break } } } if len(content) > 5000 { content = content[:5000] } if len(content) < 50 { return "" } return content } func extractCleanText(doc *goquery.Document, selector string) string { element := doc.Find(selector).First() if element.Length() == 0 { return "" } element.Find("script, style, nav, header, footer, aside").Remove() text := element.Text() text = normalizeSpace(text) if len(text) > 5000 { text = text[:5000] } return text } func getStringFromJSONLD(data map[string]interface{}, fields []string) string { for _, field := range fields { if val, ok := data[field].(string); ok && val != "" { return val } } return "" }