autoscrape/autoscrape.go

86 lines
1.7 KiB
Go

package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"log"
"os"
"strconv"
"strings"
"time"
)
// PROGRAM_NAME is the command name printed in the usage message.
const PROGRAM_NAME = "autoscrape"
// main parses the command line and prints the trimmed text of every element
// matching the selector, either from a single page or from a numbered
// sequence of pages.
//
// Arguments:
//
//	<url>             page to fetch, or the URL prefix in paged mode
//	<selector>        selector handed to pullOutText
//	[page-parameter]  suffix appended to <url>; every "%d" in the result is
//	                  replaced by the page number (a "%d" is appended when
//	                  none is present). Enables paged mode.
//	[max-pages]       maximum number of pages to fetch; zero, negative,
//	                  missing or unparsable means "no limit".
//
// In paged mode, pages are fetched starting at 1, one second apart, until
// max-pages pages have been fetched or a page yields exactly the same text as
// the previous one (including an empty first page).
func main() {
	if len(os.Args) <= 2 {
		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <selector> [page-parameter] [max-pages]\n")
		os.Exit(1)
	}
	url := os.Args[1]
	selector := os.Args[2]

	pageParam := ""
	if len(os.Args) > 3 {
		pageParam = os.Args[3]
	}

	maxPages := -1
	if len(os.Args) > 4 {
		n, err := strconv.Atoi(os.Args[4])
		if err != nil {
			// Stay best-effort (no limit), but tell the user the value was dropped.
			log.Printf("ignoring invalid max-pages %q: %v", os.Args[4], err)
		} else {
			maxPages = n
		}
	}

	if pageParam == "" {
		fmt.Print(pullOutText(fetchPage(url), selector))
		return
	}

	fmt.Println(">> Page parameter detected! ADVANCED MODE!")

	// The template does not depend on the page number, so build it once.
	template := url + pageParam
	if !strings.Contains(template, "%d") {
		template += "%d"
	}

	lastContent := ""
	for page := 1; maxPages <= 0 || page <= maxPages; page++ {
		if page > 1 {
			// Be polite to the server: pause between consecutive requests.
			time.Sleep(1 * time.Second)
		}

		// Substitute the page number textually instead of using fmt.Sprintf:
		// URLs routinely contain other '%' sequences (e.g. "%20") that
		// Sprintf would misinterpret as formatting verbs.
		pageURL := strings.Replace(template, "%d", strconv.Itoa(page), -1)
		fmt.Println(">>>> Fetching " + pageURL)

		content := pullOutText(fetchPage(pageURL), selector)
		if content == lastContent {
			// Same text as the previous page: we ran past the last real page.
			break
		}
		fmt.Print(content)
		lastContent = content
	}
}
// fetchPage loads url through goquery.NewDocument and returns the resulting
// document. Any error from goquery is treated as fatal: it is logged with
// log.Fatal, which terminates the process.
func fetchPage(url string) *goquery.Document {
	var (
		page     *goquery.Document
		fetchErr error
	)
	if page, fetchErr = goquery.NewDocument(url); fetchErr != nil {
		log.Fatal(fetchErr)
	}
	return page
}
// pullElementsAsText returns the whitespace-trimmed text of every element in
// d matching sel, one element per line (each followed by "\n"). It returns
// the empty string when nothing matches.
//
// It is an alias for pullOutText; the two previously carried identical
// copies of the same loop, so this one now delegates to keep them in sync.
func pullElementsAsText(d *goquery.Document, sel string) string {
	return pullOutText(d, sel)
}
// pullOutText gathers the text of every element in d selected by sel.
// Each element's text is stripped of surrounding whitespace and terminated
// with a newline; the pieces are concatenated in the order Each visits them.
// With no matches the result is the empty string.
func pullOutText(d *goquery.Document, sel string) string {
	var lines []string
	matches := d.Find(sel)
	matches.Each(func(_ int, node *goquery.Selection) {
		lines = append(lines, strings.TrimSpace(node.Text())+"\n")
	})
	return strings.Join(lines, "")
}