// Command autoscrape prints the trimmed text of every element matching a CSS
// selector on a web page. When a page parameter is supplied it walks numbered
// pages (1, 2, 3, ...) until the content stops changing or the optional page
// limit is reached.
package main

import (
	"fmt"
	"log"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// PROGRAM_NAME is the command name shown in the usage message.
const PROGRAM_NAME = "autoscrape"

// fetchDelay is the pause inserted between consecutive page fetches.
const fetchDelay = 1 * time.Second

// main parses the command line and runs the scraper.
//
// Arguments:
//
//	os.Args[1]  url            page to fetch (required)
//	os.Args[2]  selector       selector passed to goquery's Find (required)
//	os.Args[3]  page-parameter text appended to url to address a page (optional)
//	os.Args[4]  max-pages      maximum number of pages to fetch (optional)
func main() {
	if len(os.Args) <= 2 {
		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <selector> [page-parameter] [max-pages]\n")
		os.Exit(1)
	}
	url := os.Args[1]
	selector := os.Args[2]

	pageParam := ""
	if len(os.Args) > 3 {
		pageParam = os.Args[3]
	}

	// A non-positive limit means "no limit"; that is also the fallback when
	// the argument is missing or not a number.
	maxPages := -1
	if len(os.Args) > 4 {
		n, err := strconv.Atoi(os.Args[4])
		if err != nil {
			log.Printf("ignoring invalid max-pages %q: %v", os.Args[4], err)
		} else {
			maxPages = n
		}
	}

	if pageParam == "" {
		fmt.Print(pullOutText(fetchPage(url), selector))
		return
	}

	fmt.Println(">> Page parameter detected! ADVANCED MODE!")
	scrapePages(url, pageParam, selector, maxPages)
}

// scrapePages fetches page 1, 2, 3, ... of url and prints the text matched by
// selector on each page. It stops when a page yields the same text as the
// previous one (an empty first page counts as a repeat) or after maxPages
// pages have been fetched. A maxPages value <= 0 disables the limit.
func scrapePages(url, pageParam, selector string, maxPages int) {
	lastContent := ""
	for page := 1; maxPages <= 0 || page <= maxPages; page++ {
		// Throttle only between requests, never before the first one or
		// after the last one.
		if page > 1 {
			time.Sleep(fetchDelay)
		}

		pageURL := buildPageURL(url, pageParam, page)
		fmt.Println(">>>> Fetching " + pageURL)

		content := pullOutText(fetchPage(pageURL), selector)
		if content == lastContent {
			// Same content as the previous page: we ran past the last page.
			return
		}
		fmt.Print(content)
		lastContent = content
	}
}

// buildPageURL returns the address of page n. Every "%d" in base+pageParam is
// replaced by n; if there is no "%d", n is appended to the end. The string is
// deliberately not used as a fmt format so that other percent sequences, such
// as URL escapes like "%20", pass through untouched.
func buildPageURL(base, pageParam string, n int) string {
	tmpl := base + pageParam
	num := strconv.Itoa(n)
	if !strings.Contains(tmpl, "%d") {
		return tmpl + num
	}
	return strings.Replace(tmpl, "%d", num, -1)
}

// fetchPage downloads url and parses it into a goquery document. Any error
// from goquery.NewDocument terminates the program via log.Fatal.
func fetchPage(url string) *goquery.Document {
	doc, err := goquery.NewDocument(url)
	if err != nil {
		log.Fatal(err)
	}
	return doc
}

// pullElementsAsText returns the same result as pullOutText; it is kept as a
// separate name for existing callers.
func pullElementsAsText(d *goquery.Document, sel string) string {
	return pullOutText(d, sel)
}

// pullOutText returns the whitespace-trimmed text of every element in d that
// matches sel, one element per line, each line terminated by "\n". It returns
// the empty string when nothing matches.
func pullOutText(d *goquery.Document, sel string) string {
	var b strings.Builder
	d.Find(sel).Each(func(_ int, s *goquery.Selection) {
		b.WriteString(strings.TrimSpace(s.Text()))
		b.WriteByte('\n')
	})
	return b.String()
}