86 lines
1.7 KiB
Go
86 lines
1.7 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"github.com/PuerkitoBio/goquery"
|
|
"log"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// PROGRAM_NAME is the program name printed in the command-line usage message.
const PROGRAM_NAME = "autoscrape"
|
|
|
|
func main() {
|
|
if len(os.Args) <= 2 {
|
|
fmt.Print("Usage: " + PROGRAM_NAME + " <url> <selector> [page-parameter] [max-pages]\n")
|
|
os.Exit(1)
|
|
}
|
|
url := os.Args[1]
|
|
selector := os.Args[2]
|
|
|
|
page := ""
|
|
num_pages := -1
|
|
if len(os.Args) > 3 {
|
|
page = os.Args[3]
|
|
}
|
|
if len(os.Args) > 4 {
|
|
var err error
|
|
num_pages, err = strconv.Atoi(os.Args[4])
|
|
if err != nil {
|
|
num_pages = -1
|
|
}
|
|
}
|
|
|
|
last_content := ""
|
|
curr_page := 1
|
|
ret := ""
|
|
if page != "" {
|
|
fmt.Println(">> Page parameter detected! ADVANCED MODE!")
|
|
for curr_page != num_pages {
|
|
r_url := url + page
|
|
if strings.Index(r_url, "%d") == -1 {
|
|
r_url = r_url + "%d"
|
|
}
|
|
r_url = fmt.Sprintf(r_url, curr_page)
|
|
fmt.Println(">>>> Fetching " + r_url)
|
|
new_content := pullOutText(fetchPage(r_url), selector)
|
|
if new_content == last_content {
|
|
num_pages = curr_page
|
|
} else {
|
|
fmt.Print(new_content)
|
|
last_content = new_content
|
|
}
|
|
curr_page += 1
|
|
time.Sleep(1 * time.Second)
|
|
}
|
|
} else {
|
|
fmt.Print(pullOutText(fetchPage(url), selector))
|
|
}
|
|
fmt.Print(ret)
|
|
}
|
|
|
|
func fetchPage(url string) *goquery.Document {
|
|
doc, err := goquery.NewDocument(url)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
return doc
|
|
}
|
|
|
|
func pullElementsAsText(d *goquery.Document, sel string) string {
|
|
var ret string
|
|
d.Find(sel).Each(func(i int, s *goquery.Selection) {
|
|
ret = ret + strings.TrimSpace(s.Text()) + "\n"
|
|
})
|
|
return ret
|
|
}
|
|
|
|
func pullOutText(d *goquery.Document, sel string) string {
|
|
var ret string
|
|
d.Find(sel).Each(func(i int, s *goquery.Selection) {
|
|
ret = ret + strings.TrimSpace(s.Text()) + "\n"
|
|
})
|
|
return ret
|
|
} |