// RAW HTML HANDLER
//
// Fallback handler for URLs that don't match arXiv or Semantic Scholar patterns.
//
// STRATEGY:
// - progressive extraction tries multiple metadata sources in order
// - JSON-LD structured data first (highest quality)
// - citation meta tags (scholarly articles)
// - Open Graph (social media)
// - basic HTML tags (last resort)
package main
import (
"context"
"encoding/json"
"fmt"
"net/http"
"github.com/PuerkitoBio/goquery"
)
// fetchRawHTML issues a GET for urlStr through config.HTTP, parses the
// response body as HTML and builds an Article from metadata found in the page.
//
// Title sources are consulted in order (JSON-LD, citation_title meta tag,
// og:title meta tag, then the <title> element); the first non-empty result is
// used. An error is returned when the request cannot be built, the fetch
// fails, the status is not 200 OK, the HTML cannot be parsed, or no title is
// found.
//
// Content is only looked up when config.WithContent is set. Content sources
// are consulted in order and the first result longer than 50 bytes wins; if
// none qualifies, whatever the last source returned is kept. Article.Content
// is left untouched when that final value is empty.
func fetchRawHTML(ctx context.Context, config *Config, urlStr string) (*Article, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to build request for %s: %w", urlStr, err)
	}

	response, err := config.HTTP.Do(request)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch %s: %w", urlStr, err)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d when fetching %s", code, urlStr)
	}

	page, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML from %s: %w", urlStr, err)
	}

	// Title sources, best quality first; the plain <title> tag is the last resort.
	titleSources := []func(*goquery.Document) string{
		extractTitleFromJSONLD,
		extractTitleFromCitationMeta,
		extractTitleFromOpenGraph,
		func(d *goquery.Document) string { return normalizeSpace(d.Find("title").First().Text()) },
	}
	title := ""
	for i := 0; i < len(titleSources) && title == ""; i++ {
		title = titleSources[i](page)
	}
	if title == "" {
		return nil, fmt.Errorf("no title found for %s", urlStr)
	}

	result := &Article{
		URL:   urlStr,
		Title: title,
	}
	if !config.WithContent {
		// Caller did not ask for content; the title alone is enough.
		return result, nil
	}

	bodySources := []func(*goquery.Document) string{
		extractContentFromJSONLD,
		extractContentFromCitationMeta,
		extractContentFromOpenGraph,
		extractContentFromBasicMeta,
	}
	body := ""
	for _, source := range bodySources {
		body = source(page)
		// Anything of 50 bytes or fewer is treated as too thin; keep looking.
		if len(body) > 50 {
			break
		}
	}
	if body != "" {
		result.Content = body
	}
	return result, nil
}
// extractTitleFromJSONLD scans the document's <script type="application/ld+json">
// elements in order and returns the first title it can find.
//
// Each script body is decoded as a single JSON object; bodies that fail to
// decode are skipped. Within an object the "name", "headline" and "title"
// keys are checked in that order via getStringFromJSONLD, and the value is
// passed through normalizeSpace. Scanning stops once a non-empty title has
// been produced. Returns "" when no script yields one.
func extractTitleFromJSONLD(doc *goquery.Document) string {
	found := ""
	doc.Find("script[type='application/ld+json']").EachWithBreak(func(_ int, script *goquery.Selection) bool {
		var payload map[string]interface{}
		if err := json.Unmarshal([]byte(script.Text()), &payload); err != nil {
			// Not a JSON object we can read; move on to the next script.
			return true
		}
		if candidate := getStringFromJSONLD(payload, []string{"name", "headline", "title"}); candidate != "" {
			found = normalizeSpace(candidate)
		}
		// Keep iterating only while nothing usable has been found.
		return found == ""
	})
	return found
}
// extractContentFromJSONLD scans the document's <script type="application/ld+json">
// elements in order and returns the first description-like text it can find.
//
// Each script body is decoded as a single JSON object; bodies that fail to
// decode are skipped. Within an object the "description", "abstract" and
// "summary" keys are checked in that order via getStringFromJSONLD, and the
// value is passed through normalizeSpace. Text longer than 5000 bytes is cut
// to at most 5000 bytes and suffixed with "...". The cut is moved back to a
// UTF-8 rune boundary so a multi-byte character is never split in half.
// Returns "" when no script yields content.
func extractContentFromJSONLD(doc *goquery.Document) string {
	const maxBytes = 5000

	content := ""
	doc.Find("script[type='application/ld+json']").EachWithBreak(func(_ int, script *goquery.Selection) bool {
		var payload map[string]interface{}
		if err := json.Unmarshal([]byte(script.Text()), &payload); err != nil {
			// Not a JSON object we can read; move on to the next script.
			return true
		}
		if d := getStringFromJSONLD(payload, []string{"description", "abstract", "summary"}); d != "" {
			content = normalizeSpace(d)
			if len(content) > maxBytes {
				cut := maxBytes
				// UTF-8 continuation bytes look like 10xxxxxx. Step back over
				// them (a rune has at most 3) so the slice ends on a whole rune.
				for back := 0; back < 3 && cut > 0 && content[cut]&0xC0 == 0x80; back++ {
					cut--
				}
				content = content[:cut] + "..."
			}
		}
		// Keep iterating only while nothing usable has been found.
		return content == ""
	})
	return content
}
// extractTitleFromCitationMeta returns the content attribute of the first
// <meta name="citation_title"> element, passed through normalizeSpace. A
// missing element or attribute is treated as the empty string.
func extractTitleFromCitationMeta(doc *goquery.Document) string {
	meta := doc.Find("meta[name='citation_title']")
	return normalizeSpace(meta.AttrOr("content", ""))
}
// extractContentFromCitationMeta returns the content attribute of the first
// <meta name="citation_abstract"> element, passed through normalizeSpace. A
// missing element or attribute is treated as the empty string.
func extractContentFromCitationMeta(doc *goquery.Document) string {
	meta := doc.Find("meta[name='citation_abstract']")
	return normalizeSpace(meta.AttrOr("content", ""))
}
// extractTitleFromOpenGraph returns the content attribute of the first
// <meta property="og:title"> element, passed through normalizeSpace. A
// missing element or attribute is treated as the empty string.
func extractTitleFromOpenGraph(doc *goquery.Document) string {
	meta := doc.Find("meta[property='og:title']")
	return normalizeSpace(meta.AttrOr("content", ""))
}
// extractContentFromOpenGraph returns the content attribute of the first
// <meta property="og:description"> element, passed through normalizeSpace. A
// missing element or attribute is treated as the empty string.
func extractContentFromOpenGraph(doc *goquery.Document) string {
	meta := doc.Find("meta[property='og:description']")
	return normalizeSpace(meta.AttrOr("content", ""))
}
// extractContentFromBasicMeta is the last-resort content source.
//
// It starts from the <meta name="description"> content attribute (passed
// through normalizeSpace). If that is shorter than 100 bytes, it tries a fixed
// list of body selectors in order via extractCleanText and switches to the
// first one whose text is longer than the current content.
//
// The result is capped at 5000 bytes, with the cut moved back to a UTF-8 rune
// boundary so a multi-byte character is never split. Results shorter than 50
// bytes are rejected and "" is returned instead.
func extractContentFromBasicMeta(doc *goquery.Document) string {
	const maxBytes = 5000

	// A missing meta tag or attribute simply yields "", so the ok flag is not needed.
	contentRaw, _ := doc.Find("meta[name='description']").Attr("content")
	content := normalizeSpace(contentRaw)

	// If the meta description is too short, try to extract text from the body.
	if len(content) < 100 {
		selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"}
		for _, selector := range selectors {
			if contentText := extractCleanText(doc, selector); len(contentText) > len(content) {
				content = contentText
				break
			}
		}
	}

	if len(content) > maxBytes {
		cut := maxBytes
		// UTF-8 continuation bytes look like 10xxxxxx. Step back over them
		// (a rune has at most 3) so the slice ends on a whole rune.
		for back := 0; back < 3 && cut > 0 && content[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		content = content[:cut]
	}

	if len(content) < 50 {
		return ""
	}
	return content
}
// extractCleanText returns the text of the first element matching selector,
// with script, style, nav, header, footer and aside descendants excluded.
//
// The text is passed through normalizeSpace and capped at 5000 bytes, with the
// cut moved back to a UTF-8 rune boundary so a multi-byte character is never
// split. Returns "" when nothing matches selector.
//
// The stripping is done on a deep copy of the matched element, so doc itself
// is left unmodified.
func extractCleanText(doc *goquery.Document, selector string) string {
	const maxBytes = 5000

	element := doc.Find(selector).First()
	if element.Length() == 0 {
		return ""
	}

	// Remove() detaches nodes for good. Work on a clone so later lookups on
	// the caller's document still see the original markup.
	scratch := element.Clone()
	scratch.Find("script, style, nav, header, footer, aside").Remove()

	text := normalizeSpace(scratch.Text())
	if len(text) > maxBytes {
		cut := maxBytes
		// UTF-8 continuation bytes look like 10xxxxxx. Step back over them
		// (a rune has at most 3) so the slice ends on a whole rune.
		for back := 0; back < 3 && cut > 0 && text[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		text = text[:cut]
	}
	return text
}
// getStringFromJSONLD returns the first non-empty string value stored in data
// under any of the given fields, checked in the order listed. Fields that are
// absent, hold a non-string value, or hold "" are skipped. It returns "" when
// no field qualifies.
func getStringFromJSONLD(data map[string]interface{}, fields []string) string {
	for i := range fields {
		raw, present := data[fields[i]]
		if !present {
			continue
		}
		text, isString := raw.(string)
		if !isString || text == "" {
			continue
		}
		return text
	}
	return ""
}