aboutsummaryrefslogtreecommitdiff
path: root/scrape.go
diff options
context:
space:
mode:
author Sam Scholten 2026-06-14 20:00:15 +1000
committer Sam Scholten 2026-06-14 20:00:15 +1000
commit decc46c876e7b5552f5f5ecac4ee4f1a64ad1d62 (patch)
tree 46875e236a062189115c0cd8ed8f1d82980c16b7 /scrape.go
downloadabvjt-main.tar.gz
abvjt-main.zip
Initial implementation: scrape, serve, UI, container, deployment (HEAD, main)
Diffstat (limited to 'scrape.go')
-rw-r--r-- scrape.go 175
1 files changed, 175 insertions, 0 deletions
diff --git a/scrape.go b/scrape.go
new file mode 100644
index 0000000..7b61667
--- /dev/null
+++ b/scrape.go
@@ -0,0 +1,175 @@
+package main
+
+import (
+ "errors"
+ "flag"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+
+ "golang.org/x/net/html"
+)
+
// wosBaseURL is the URL prefix under which the per-page journal abbreviation
// documents are requested; Run appends "/<page>_abrvjt.html" to it.
const wosBaseURL = "https://wos-help.webofscience.com/WOKRS535R111/help/WOS"

// pageList holds the page identifiers that Run walks, in order: "0-9"
// followed by the single upper-case letters "A" through "Z".
var pageList = func() []string {
	pages := []string{"0-9"}
	for letter := 'A'; letter <= 'Z'; letter++ {
		pages = append(pages, string(letter))
	}
	return pages
}()
+
// ScrapeCommand implements the "scrape" subcommand. Its only configuration is
// the database path, which Init fills in from the command line.
type ScrapeCommand struct {
	// DBPath is the path to the SQLite database, set by the -db flag.
	DBPath string
}

// Name returns the subcommand name, "scrape". It is also used as the name of
// the flag set created by Init, so it appears in usage and error output.
func (c *ScrapeCommand) Name() string { return "scrape" }

// Init parses args into the command's fields.
//
// The only flag is -db (default "data/abvjt.db"). A help request (-h or
// -help) is not treated as a failure: Init returns nil for it. Any other
// parse error is returned unchanged.
func (c *ScrapeCommand) Init(args []string) error {
	fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	fs.StringVar(&c.DBPath, "db", "data/abvjt.db", "Path to SQLite database")

	err := fs.Parse(args)
	if errors.Is(err, flag.ErrHelp) {
		// Parse has already written the usage text before returning ErrHelp,
		// so calling fs.Usage() here would print it a second time.
		//
		// NOTE(review): returning nil means the caller cannot tell a help
		// request from a normal parse — confirm it does not go on to call Run.
		return nil
	}
	return err
}
+
+func (c *ScrapeCommand) Run(stdin io.Reader, stdout io.Writer) error {
+ db, err := OpenDB(c.DBPath)
+ if err != nil {
+ return err
+ }
+ defer db.Close()
+
+ client := &http.Client{
+ Timeout: 30 * time.Second,
+ }
+
+ var totalInserted int
+ for _, page := range pageList {
+ url := fmt.Sprintf("%s/%s_abrvjt.html", wosBaseURL, page)
+ fmt.Fprintf(stdout, "Fetching %s...\n", url)
+
+ journals, err := fetchPage(client, url)
+ if err != nil {
+ return fmt.Errorf("failed to fetch page %s: %w", page, err)
+ }
+
+ if len(journals) == 0 {
+ fmt.Fprintf(stdout, " Warning: no journals found on page %s\n", page)
+ continue
+ }
+
+ if err := db.InsertJournals(journals); err != nil {
+ return fmt.Errorf("failed to insert page %s: %w", page, err)
+ }
+
+ totalInserted += len(journals)
+ fmt.Fprintf(stdout, " Inserted %d journals\n", len(journals))
+
+ time.Sleep(200 * time.Millisecond)
+ }
+
+ count, err := db.Count()
+ if err != nil {
+ return fmt.Errorf("failed to get final count: %w", err)
+ }
+
+ fmt.Fprintf(stdout, "\nDone. Total journals inserted: %d (DB count: %d)\n", totalInserted, count)
+ return nil
+}
+
+func fetchPage(client *http.Client, url string) ([]Journal, error) {
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ return nil, err
+ }
+ req.Header.Set("User-Agent", "abvjt/1.0")
+
+ resp, err := client.Do(req)
+ if err != nil {
+ return nil, err
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusOK {
+ return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
+ }
+
+ return parseWOSPage(resp.Body)
+}
+
+func parseWOSPage(r io.Reader) ([]Journal, error) {
+ tokenizer := html.NewTokenizer(r)
+ var journals []Journal
+ var nameBuf strings.Builder
+ var abbrevBuf strings.Builder
+
+ const (
+ stateDefault = iota
+ stateInName
+ stateExpectDD
+ stateInAbbrev
+ )
+ state := stateDefault
+
+ for {
+ tokenType := tokenizer.Next()
+ if tokenType == html.ErrorToken {
+ err := tokenizer.Err()
+ if err == io.EOF {
+ break
+ }
+ return nil, err
+ }
+
+ switch tokenType {
+ case html.StartTagToken:
+ token := tokenizer.Token()
+ switch token.Data {
+ case "dt":
+ state = stateInName
+ nameBuf.Reset()
+ case "b":
+ if state == stateInName {
+ state = stateExpectDD
+ }
+ case "dd":
+ if state == stateExpectDD {
+ state = stateInAbbrev
+ abbrevBuf.Reset()
+ }
+ }
+
+ case html.EndTagToken:
+ token := tokenizer.Token()
+ if token.Data == "b" && state == stateInAbbrev {
+ name := strings.TrimSpace(nameBuf.String())
+ abbrev := strings.TrimSpace(abbrevBuf.String())
+ if name != "" {
+ journals = append(journals, Journal{
+ FullName: name,
+ Abbreviation: abbrev,
+ })
+ }
+ state = stateDefault
+ }
+
+ case html.TextToken:
+ text := tokenizer.Token().Data
+ switch state {
+ case stateInName:
+ nameBuf.WriteString(text)
+ case stateInAbbrev:
+ abbrevBuf.WriteString(text)
+ }
+ }
+ }
+
+ return journals, nil
+}