diff --git a/autoscrape.go b/autoscrape.go new file mode 100644 index 0000000..1c932b9 --- /dev/null +++ b/autoscrape.go @@ -0,0 +1,86 @@ +package main + +import ( + "fmt" + "github.com/PuerkitoBio/goquery" + "log" + "os" + "strconv" + "strings" + "time" +) + +const PROGRAM_NAME = "autoscrape" + +func main() { + if len(os.Args) <= 2 { + fmt.Print("Usage: " + PROGRAM_NAME + " [page-parameter] [max-pages]\n") + os.Exit(1) + } + url := os.Args[1] + selector := os.Args[2] + + page := "" + num_pages := -1 + if len(os.Args) > 3 { + page = os.Args[3] + } + if len(os.Args) > 4 { + var err error + num_pages, err = strconv.Atoi(os.Args[4]) + if err != nil { + num_pages = -1 + } + } + + last_content := "" + curr_page := 1 + ret := "" + if page != "" { + fmt.Println(">> Page parameter detected! ADVANCED MODE!") + for curr_page != num_pages { + r_url := url + page + if strings.Index(r_url, "%d") == -1 { + r_url = r_url + "%d" + } + r_url = fmt.Sprintf(r_url, curr_page) + fmt.Println(">>>> Fetching " + r_url) + new_content := pullOutText(fetchPage(r_url), selector) + if new_content == last_content { + num_pages = curr_page + } else { + fmt.Print(new_content) + last_content = new_content + } + curr_page += 1 + time.Sleep(1 * time.Second) + } + } else { + fmt.Print(pullOutText(fetchPage(url), selector)) + } + fmt.Print(ret) +} + +func fetchPage(url string) *goquery.Document { + doc, err := goquery.NewDocument(url) + if err != nil { + log.Fatal(err) + } + return doc +} + +func pullElementsAsText(d *goquery.Document, sel string) string { + var ret string + d.Find(sel).Each(func(i int, s *goquery.Selection) { + ret = ret + strings.TrimSpace(s.Text()) + "\n" + }) + return ret +} + +func pullOutText(d *goquery.Document, sel string) string { + var ret string + d.Find(sel).Each(func(i int, s *goquery.Selection) { + ret = ret + strings.TrimSpace(s.Text()) + "\n" + }) + return ret +} \ No newline at end of file