diff options
| author | Sam Scholten | 2026-06-14 20:00:15 +1000 |
|---|---|---|
| committer | Sam Scholten | 2026-06-14 20:00:15 +1000 |
| commit | decc46c876e7b5552f5f5ecac4ee4f1a64ad1d62 (patch) | |
| tree | 46875e236a062189115c0cd8ed8f1d82980c16b7 /scrape.go | |
| download | abvjt-decc46c876e7b5552f5f5ecac4ee4f1a64ad1d62.tar.gz abvjt-decc46c876e7b5552f5f5ecac4ee4f1a64ad1d62.zip | |
Diffstat (limited to 'scrape.go')
| -rw-r--r-- | scrape.go | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/scrape.go b/scrape.go new file mode 100644 index 0000000..7b61667 --- /dev/null +++ b/scrape.go @@ -0,0 +1,175 @@ +package main + +import ( + "errors" + "flag" + "fmt" + "io" + "net/http" + "strings" + "time" + + "golang.org/x/net/html" +) + +const wosBaseURL = "https://wos-help.webofscience.com/WOKRS535R111/help/WOS" + +var pageList = []string{ + "0-9", "A", "B", "C", "D", "E", "F", "G", "H", "I", + "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", + "T", "U", "V", "W", "X", "Y", "Z", +} + +type ScrapeCommand struct { + DBPath string +} + +func (c *ScrapeCommand) Name() string { return "scrape" } + +func (c *ScrapeCommand) Init(args []string) error { + fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError) + fs.StringVar(&c.DBPath, "db", "data/abvjt.db", "Path to SQLite database") + if err := fs.Parse(args); err != nil { + if errors.Is(err, flag.ErrHelp) { + fs.Usage() + return nil + } + return err + } + return nil +} + +func (c *ScrapeCommand) Run(stdin io.Reader, stdout io.Writer) error { + db, err := OpenDB(c.DBPath) + if err != nil { + return err + } + defer db.Close() + + client := &http.Client{ + Timeout: 30 * time.Second, + } + + var totalInserted int + for _, page := range pageList { + url := fmt.Sprintf("%s/%s_abrvjt.html", wosBaseURL, page) + fmt.Fprintf(stdout, "Fetching %s...\n", url) + + journals, err := fetchPage(client, url) + if err != nil { + return fmt.Errorf("failed to fetch page %s: %w", page, err) + } + + if len(journals) == 0 { + fmt.Fprintf(stdout, " Warning: no journals found on page %s\n", page) + continue + } + + if err := db.InsertJournals(journals); err != nil { + return fmt.Errorf("failed to insert page %s: %w", page, err) + } + + totalInserted += len(journals) + fmt.Fprintf(stdout, " Inserted %d journals\n", len(journals)) + + time.Sleep(200 * time.Millisecond) + } + + count, err := db.Count() + if err != nil { + return fmt.Errorf("failed to get final count: %w", err) + } + + fmt.Fprintf(stdout, "\nDone. Total journals inserted: %d (DB count: %d)\n", totalInserted, count) + return nil +} + +func fetchPage(client *http.Client, url string) ([]Journal, error) { + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", "abvjt/1.0") + + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP %d", resp.StatusCode) + } + + return parseWOSPage(resp.Body) +} + +func parseWOSPage(r io.Reader) ([]Journal, error) { + tokenizer := html.NewTokenizer(r) + var journals []Journal + var nameBuf strings.Builder + var abbrevBuf strings.Builder + + const ( + stateDefault = iota + stateInName + stateExpectDD + stateInAbbrev + ) + state := stateDefault + + for { + tokenType := tokenizer.Next() + if tokenType == html.ErrorToken { + err := tokenizer.Err() + if err == io.EOF { + break + } + return nil, err + } + + switch tokenType { + case html.StartTagToken: + token := tokenizer.Token() + switch token.Data { + case "dt": + state = stateInName + nameBuf.Reset() + case "b": + if state == stateInName { + state = stateExpectDD + } + case "dd": + if state == stateExpectDD { + state = stateInAbbrev + abbrevBuf.Reset() + } + } + + case html.EndTagToken: + token := tokenizer.Token() + if token.Data == "b" && state == stateInAbbrev { + name := strings.TrimSpace(nameBuf.String()) + abbrev := strings.TrimSpace(abbrevBuf.String()) + if name != "" { + journals = append(journals, Journal{ + FullName: name, + Abbreviation: abbrev, + }) + } + state = stateDefault + } + + case html.TextToken: + text := tokenizer.Token().Data + switch state { + case stateInName: + nameBuf.WriteString(text) + case stateInAbbrev: + abbrevBuf.WriteString(text) + } + } + } + + return journals, nil +} |
