aboutsummaryrefslogtreecommitdiff
path: root/html.go
diff options
context:
space:
mode:
authorSam Scholten2025-12-15 19:35:46 +1000
committerSam Scholten2025-12-15 19:35:57 +1000
commit3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2 (patch)
tree42b1f0e0a346a1cf087df90e29a100edbd66b3eb /html.go
downloadscholfetch-3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2.tar.gz
scholfetch-3562d2fd34bb98d29c7cf6e4d4130129a7bb24f2.zip
Init v0.1.0HEADmain
Diffstat (limited to 'html.go')
-rw-r--r--html.go198
1 files changed, 198 insertions, 0 deletions
diff --git a/html.go b/html.go
new file mode 100644
index 0000000..0995865
--- /dev/null
+++ b/html.go
@@ -0,0 +1,198 @@
+// RAW HTML HANDLER
+//
+// Fallback handler for URLs that don't match arXiv or Semantic Scholar patterns.
+//
+// STRATEGY:
+// - progressive extraction tries multiple metadata sources in order
+// - JSON-LD structured data first (highest quality)
+// - citation meta tags (scholarly articles)
+// - Open Graph (social media)
+// - basic HTML tags (last resort)
+package main
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "net/http"
+
+ "github.com/PuerkitoBio/goquery"
+)
+
+// fetchRawHTML is the fallback fetcher: it requests urlStr through
+// config.HTTP, parses the response body as HTML and builds an Article from
+// metadata embedded in the page.
+//
+// The title comes from the first source that yields a non-empty value, tried
+// in the order JSON-LD, citation meta tags, Open Graph, <title> element. An
+// error is returned when the request cannot be built or sent, the status is
+// not 200 OK, the body cannot be parsed, or no title is found.
+//
+// Content is only looked up when config.WithContent is set. Sources are tried
+// in the order JSON-LD, citation meta, Open Graph, basic meta/body text, and
+// the search stops at the first result longer than 50 bytes; if none
+// qualifies, whatever the last source returned is used.
+func fetchRawHTML(ctx context.Context, config *Config, urlStr string) (*Article, error) {
+	request, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to build request for %s: %w", urlStr, err)
+	}
+
+	response, err := config.HTTP.Do(request)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch %s: %w", urlStr, err)
+	}
+	defer response.Body.Close()
+
+	if code := response.StatusCode; code != http.StatusOK {
+		return nil, fmt.Errorf("HTTP %d when fetching %s", code, urlStr)
+	}
+
+	doc, err := goquery.NewDocumentFromReader(response.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse HTML from %s: %w", urlStr, err)
+	}
+
+	// Last-resort title source: the plain <title> element.
+	titleTag := func(d *goquery.Document) string {
+		return normalizeSpace(d.Find("title").First().Text())
+	}
+
+	var title string
+	for _, extract := range []func(*goquery.Document) string{
+		extractTitleFromJSONLD,
+		extractTitleFromCitationMeta,
+		extractTitleFromOpenGraph,
+		titleTag,
+	} {
+		if title = extract(doc); title != "" {
+			break
+		}
+	}
+	if title == "" {
+		return nil, fmt.Errorf("no title found for %s", urlStr)
+	}
+
+	article := &Article{URL: urlStr, Title: title}
+
+	// Content extraction is opt-in.
+	if !config.WithContent {
+		return article, nil
+	}
+
+	var content string
+	for _, extract := range []func(*goquery.Document) string{
+		extractContentFromJSONLD,
+		extractContentFromCitationMeta,
+		extractContentFromOpenGraph,
+		extractContentFromBasicMeta,
+	} {
+		// Anything of 50 bytes or fewer is treated as too thin to stop on.
+		if content = extract(doc); len(content) > 50 {
+			break
+		}
+	}
+	if content != "" {
+		article.Content = content
+	}
+
+	return article, nil
+}
+
+// extractTitleFromJSONLD returns the first usable title found in the
+// document's <script type="application/ld+json"> blocks, or "" if there is
+// none.
+//
+// A block may hold a single JSON object or a top-level array of objects (both
+// are valid JSON-LD documents). Objects are visited in document order and the
+// string fields "name", "headline" and "title" are tried in that order. Blocks
+// that fail to parse and array entries that are not objects are skipped.
+func extractTitleFromJSONLD(doc *goquery.Document) string {
+	fields := []string{"name", "headline", "title"}
+
+	var title string
+	doc.Find("script[type='application/ld+json']").Each(func(_ int, s *goquery.Selection) {
+		if title != "" {
+			return // already found in an earlier block
+		}
+
+		// Decode generically: unmarshalling straight into a map would reject
+		// a top-level array and silently drop the whole block.
+		var payload interface{}
+		if err := json.Unmarshal([]byte(s.Text()), &payload); err != nil {
+			return // malformed block, best effort: move on to the next one
+		}
+
+		var nodes []interface{}
+		switch v := payload.(type) {
+		case map[string]interface{}:
+			nodes = []interface{}{v}
+		case []interface{}:
+			nodes = v
+		}
+
+		for _, n := range nodes {
+			node, ok := n.(map[string]interface{})
+			if !ok {
+				continue
+			}
+			t := getStringFromJSONLD(node, fields)
+			if t == "" {
+				continue
+			}
+			// A whitespace-only value normalizes to "", so keep looking.
+			if title = normalizeSpace(t); title != "" {
+				return
+			}
+		}
+	})
+	return title
+}
+
+// extractContentFromJSONLD returns the first usable description found in the
+// document's <script type="application/ld+json"> blocks, or "" if there is
+// none.
+//
+// A block may hold a single JSON object or a top-level array of objects (both
+// are valid JSON-LD documents). Objects are visited in document order and the
+// string fields "description", "abstract" and "summary" are tried in that
+// order. Blocks that fail to parse and array entries that are not objects are
+// skipped.
+//
+// Text longer than 5000 bytes is cut to at most 5000 bytes, on a UTF-8
+// character boundary, and suffixed with "...".
+func extractContentFromJSONLD(doc *goquery.Document) string {
+	const maxLen = 5000
+	fields := []string{"description", "abstract", "summary"}
+
+	var content string
+	doc.Find("script[type='application/ld+json']").Each(func(_ int, s *goquery.Selection) {
+		if content != "" {
+			return // already found in an earlier block
+		}
+
+		// Decode generically: unmarshalling straight into a map would reject
+		// a top-level array and silently drop the whole block.
+		var payload interface{}
+		if err := json.Unmarshal([]byte(s.Text()), &payload); err != nil {
+			return // malformed block, best effort: move on to the next one
+		}
+
+		var nodes []interface{}
+		switch v := payload.(type) {
+		case map[string]interface{}:
+			nodes = []interface{}{v}
+		case []interface{}:
+			nodes = v
+		}
+
+		for _, n := range nodes {
+			node, ok := n.(map[string]interface{})
+			if !ok {
+				continue
+			}
+			d := getStringFromJSONLD(node, fields)
+			if d == "" {
+				continue
+			}
+			text := normalizeSpace(d)
+			if text == "" {
+				continue
+			}
+			if len(text) > maxLen {
+				// Slicing at a fixed byte offset could split a multi-byte
+				// character; step back while the byte at the cut point is a
+				// UTF-8 continuation byte (bit pattern 10xxxxxx).
+				cut := maxLen
+				for cut > 0 && text[cut]&0xC0 == 0x80 {
+					cut--
+				}
+				text = text[:cut] + "..."
+			}
+			content = text
+			return
+		}
+	})
+	return content
+}
+
+// extractTitleFromCitationMeta reads the content attribute of the
+// <meta name="citation_title"> tag and returns it whitespace-normalized.
+// The result is "" when the tag or attribute is absent.
+func extractTitleFromCitationMeta(doc *goquery.Document) string {
+	tag := doc.Find("meta[name='citation_title']")
+	return normalizeSpace(tag.AttrOr("content", ""))
+}
+
+// extractContentFromCitationMeta reads the content attribute of the
+// <meta name="citation_abstract"> tag and returns it whitespace-normalized.
+// The result is "" when the tag or attribute is absent.
+func extractContentFromCitationMeta(doc *goquery.Document) string {
+	tag := doc.Find("meta[name='citation_abstract']")
+	return normalizeSpace(tag.AttrOr("content", ""))
+}
+
+// extractTitleFromOpenGraph reads the content attribute of the
+// <meta property="og:title"> tag and returns it whitespace-normalized.
+// The result is "" when the tag or attribute is absent.
+func extractTitleFromOpenGraph(doc *goquery.Document) string {
+	tag := doc.Find("meta[property='og:title']")
+	return normalizeSpace(tag.AttrOr("content", ""))
+}
+
+// extractContentFromOpenGraph reads the content attribute of the
+// <meta property="og:description"> tag and returns it whitespace-normalized.
+// The result is "" when the tag or attribute is absent.
+func extractContentFromOpenGraph(doc *goquery.Document) string {
+	tag := doc.Find("meta[property='og:description']")
+	return normalizeSpace(tag.AttrOr("content", ""))
+}
+
+func extractContentFromBasicMeta(doc *goquery.Document) string {
+ contentRaw, _ := doc.Find("meta[name='description']").Attr("content")
+ content := normalizeSpace(contentRaw)
+
+ // if meta description is too short, try to extract from the body
+ if len(content) < 100 {
+ selectors := []string{"article", "main", ".abstract", ".summary", "[role='main']", ".content", ".entry-content"}
+ for _, selector := range selectors {
+ if contentText := extractCleanText(doc, selector); len(contentText) > len(content) {
+ content = contentText
+ break
+ }
+ }
+ }
+
+ if len(content) > 5000 {
+ content = content[:5000]
+ }
+
+ if len(content) < 50 {
+ return ""
+ }
+
+ return content
+}
+
+func extractCleanText(doc *goquery.Document, selector string) string {
+ element := doc.Find(selector).First()
+ if element.Length() == 0 {
+ return ""
+ }
+ element.Find("script, style, nav, header, footer, aside").Remove()
+ text := element.Text()
+ text = normalizeSpace(text)
+ if len(text) > 5000 {
+ text = text[:5000]
+ }
+ return text
+}
+
+// getStringFromJSONLD scans fields in order and returns the first value in
+// data that is a non-empty string. Missing keys, non-string values and empty
+// strings are passed over; "" is returned when no field qualifies.
+func getStringFromJSONLD(data map[string]interface{}, fields []string) string {
+	for i := range fields {
+		raw, present := data[fields[i]]
+		if !present {
+			continue
+		}
+		if text, isString := raw.(string); isString && text != "" {
+			return text
+		}
+	}
+	return ""
+}