package main import ( "errors" "flag" "fmt" "io" "net/http" "strings" "time" "golang.org/x/net/html" ) const wosBaseURL = "https://wos-help.webofscience.com/WOKRS535R111/help/WOS" var pageList = []string{ "0-9", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", } type ScrapeCommand struct { DBPath string } func (c *ScrapeCommand) Name() string { return "scrape" } func (c *ScrapeCommand) Init(args []string) error { fs := flag.NewFlagSet(c.Name(), flag.ContinueOnError) fs.StringVar(&c.DBPath, "db", "data/abvjt.db", "Path to SQLite database") if err := fs.Parse(args); err != nil { if errors.Is(err, flag.ErrHelp) { fs.Usage() return nil } return err } return nil } func (c *ScrapeCommand) Run(stdin io.Reader, stdout io.Writer) error { db, err := OpenDB(c.DBPath) if err != nil { return err } defer db.Close() client := &http.Client{ Timeout: 30 * time.Second, } var totalInserted int for _, page := range pageList { url := fmt.Sprintf("%s/%s_abrvjt.html", wosBaseURL, page) fmt.Fprintf(stdout, "Fetching %s...\n", url) journals, err := fetchPage(client, url) if err != nil { return fmt.Errorf("failed to fetch page %s: %w", page, err) } if len(journals) == 0 { fmt.Fprintf(stdout, " Warning: no journals found on page %s\n", page) continue } if err := db.InsertJournals(journals); err != nil { return fmt.Errorf("failed to insert page %s: %w", page, err) } totalInserted += len(journals) fmt.Fprintf(stdout, " Inserted %d journals\n", len(journals)) time.Sleep(200 * time.Millisecond) } count, err := db.Count() if err != nil { return fmt.Errorf("failed to get final count: %w", err) } fmt.Fprintf(stdout, "\nDone. Total journals inserted: %d (DB count: %d)\n", totalInserted, count) return nil } func fetchPage(client *http.Client, url string) ([]Journal, error) { req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", "abvjt/1.0") resp, err := client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("HTTP %d", resp.StatusCode) } return parseWOSPage(resp.Body) } func parseWOSPage(r io.Reader) ([]Journal, error) { tokenizer := html.NewTokenizer(r) var journals []Journal var nameBuf strings.Builder var abbrevBuf strings.Builder const ( stateDefault = iota stateInName stateExpectDD stateInAbbrev ) state := stateDefault for { tokenType := tokenizer.Next() if tokenType == html.ErrorToken { err := tokenizer.Err() if err == io.EOF { break } return nil, err } switch tokenType { case html.StartTagToken: token := tokenizer.Token() switch token.Data { case "dt": state = stateInName nameBuf.Reset() case "b": if state == stateInName { state = stateExpectDD } case "dd": if state == stateExpectDD { state = stateInAbbrev abbrevBuf.Reset() } } case html.EndTagToken: token := tokenizer.Token() if token.Data == "b" && state == stateInAbbrev { name := strings.TrimSpace(nameBuf.String()) abbrev := strings.TrimSpace(abbrevBuf.String()) if name != "" { journals = append(journals, Journal{ FullName: name, Abbreviation: abbrev, }) } state = stateDefault } case html.TextToken: text := tokenizer.Token().Data switch state { case stateInName: nameBuf.WriteString(text) case stateInAbbrev: abbrevBuf.WriteString(text) } } } return journals, nil }