diff options
| author | Sam Scholten | 2025-12-15 19:35:46 +1000 |
|---|---|---|
| committer | Sam Scholten | 2025-12-15 19:35:57 +1000 |
| commit | 3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 (patch) | |
| tree | 42b1f0e0a346a1cf087df90e29a100edbd66b3eb /html.go | |
| download | scholfetch-3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2.tar.gz scholfetch-3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2.zip | |
Diffstat (limited to 'html.go')
| -rw-r--r-- | html.go | 198 |
1 files changed, 198 insertions, 0 deletions
@@ -0,0 +1,198 @@ +// RAW HTML HANDLER +// +// Fallback handler for URLs that don't match arXiv or Semantic Scholar patterns. +// +// STRATEGY: +// - progressive extraction tries multiple metadata sources in order +// - JSON-LD structured data first (highest quality) +// - citation meta tags (scholarly articles) +// - open Graph (social media) +// - basic HTML tags (last resort) +package main + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + + "github.com/PuerkitoBio/goquery" +) + +// fetchRawHTML attempts to fetch article content by parsing HTML metadata. +func fetchRawHTML(ctx context.Context, config *Config, urlStr string) (*Article, error) { + req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) + if err != nil { + return nil, fmt.Errorf("failed to build request for %s: %w", urlStr, err) + } + + resp, err := config.HTTP.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch %s: %w", urlStr, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d when fetching %s", resp.StatusCode, urlStr) + } + + doc, err := goquery.NewDocumentFromReader(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML from %s: %w", urlStr, err) + } + + // extract title using various strategies + strategies := []func(*goquery.Document) string{ + extractTitleFromJSONLD, + extractTitleFromCitationMeta, + extractTitleFromOpenGraph, + func(d *goquery.Document) string { return normalizeSpace(d.Find("title").First().Text()) }, + } + + var title string + for _, strategy := range strategies { + title = strategy(doc) + if title != "" { + break + } + } + + if title == "" { + return nil, fmt.Errorf("no title found for %s", urlStr) + } + + article := &Article{ + URL: urlStr, + Title: title, + } + + // only fetch content if requested + if config.WithContent { + contentStrategies := []func(*goquery.Document) string{ + extractContentFromJSONLD, + extractContentFromCitationMeta, + extractContentFromOpenGraph, + extractContentFromBasicMeta, + } + + var content string + for _, strategy := range contentStrategies { + content = strategy(doc) + if content != "" && len(content) > 50 { + break + } + } + + if content != "" { + article.Content = content + } + } + + return article, nil +} + +func extractTitleFromJSONLD(doc *goquery.Document) string { + var title string + doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { + if title != "" { + return + } + var data map[string]interface{} + if json.Unmarshal([]byte(s.Text()), &data) == nil { + t := getStringFromJSONLD(data, []string{"name", "headline", "title"}) + if t != "" { + title = normalizeSpace(t) + } + } + }) + return title +} + +func extractContentFromJSONLD(doc *goquery.Document) string { + var content string + doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) { + if content != "" { + return + } + var data map[string]interface{} + if json.Unmarshal([]byte(s.Text()), &data) == nil { + d := getStringFromJSONLD(data, []string{"description", "abstract", "summary"}) + if d != "" { + content = normalizeSpace(d) + if len(content) > 5000 { + content = content[:5000] + "..." + } + } + } + }) + return content +} + +func extractTitleFromCitationMeta(doc *goquery.Document) string { + title, _ := doc.Find("meta[name='citation_title']").Attr("content") + return normalizeSpace(title) +} + +func extractContentFromCitationMeta(doc *goquery.Document) string { + content, _ := doc.Find("meta[name='citation_abstract']").Attr("content") + return normalizeSpace(content) +} + +func extractTitleFromOpenGraph(doc *goquery.Document) string { + title, _ := doc.Find("meta[property='og:title']").Attr("content") + return normalizeSpace(title) +} + +func extractContentFromOpenGraph(doc *goquery.Document) string { + content, _ := doc.Find("meta[property='og:description']").Attr("content") + return normalizeSpace(content) +} + +func extractContentFromBasicMeta(doc *goquery.Document) string { + contentRaw, _ := doc.Find("meta[name='description']").Attr("content") + content := normalizeSpace(contentRaw) + + // if meta description is too short, try to extract from the body + if len(content) < 100 { + selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"} + for _, selector := range selectors { + if contentText := extractCleanText(doc, selector); len(contentText) > len(content) { + content = contentText + break + } + } + } + + if len(content) > 5000 { + content = content[:5000] + } + + if len(content) < 50 { + return "" + } + + return content +} + +func extractCleanText(doc *goquery.Document, selector string) string { + element := doc.Find(selector).First() + if element.Length() == 0 { + return "" + } + element.Find("script, style, nav, header, footer, aside").Remove() + text := element.Text() + text = normalizeSpace(text) + if len(text) > 5000 { + text = text[:5000] + } + return text +} + +func getStringFromJSONLD(data map[string]interface{}, fields []string) string { + for _, field := range fields { + if val, ok := data[field].(string); ok && val != "" { + return val + } + } + return "" +} |
