Adding the Source
This commit is contained in:
parent
0130ac92a5
commit
c47f704231
86
autoscrape.go
Normal file
86
autoscrape.go
Normal file
@ -0,0 +1,86 @@
|
||||
package main
|
||||
|
||||
import (
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)
|
||||
|
||||
// PROGRAM_NAME is the executable name shown in the usage message printed by main.
const PROGRAM_NAME = "autoscrape"
|
||||
|
||||
func main() {
|
||||
if len(os.Args) <= 2 {
|
||||
fmt.Print("Usage: " + PROGRAM_NAME + " <url|url-json> <selector|selector-json> [page-parameter] [max-pages]\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
url := os.Args[1]
|
||||
selector := os.Args[2]
|
||||
|
||||
page := ""
|
||||
num_pages := -1
|
||||
if len(os.Args) > 3 {
|
||||
page = os.Args[3]
|
||||
}
|
||||
if len(os.Args) > 4 {
|
||||
var err error
|
||||
num_pages, err = strconv.Atoi(os.Args[4])
|
||||
if err != nil {
|
||||
num_pages = -1
|
||||
}
|
||||
}
|
||||
|
||||
last_content := ""
|
||||
curr_page := 1
|
||||
ret := ""
|
||||
if page != "" {
|
||||
fmt.Println(">> Page parameter detected! ADVANCED MODE!")
|
||||
for curr_page != num_pages {
|
||||
r_url := url + page
|
||||
if strings.Index(r_url, "%d") == -1 {
|
||||
r_url = r_url + "%d"
|
||||
}
|
||||
r_url = fmt.Sprintf(r_url, curr_page)
|
||||
fmt.Println(">>>> Fetching " + r_url)
|
||||
new_content := pullOutText(fetchPage(r_url), selector)
|
||||
if new_content == last_content {
|
||||
num_pages = curr_page
|
||||
} else {
|
||||
fmt.Print(new_content)
|
||||
last_content = new_content
|
||||
}
|
||||
curr_page += 1
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
} else {
|
||||
fmt.Print(pullOutText(fetchPage(url), selector))
|
||||
}
|
||||
fmt.Print(ret)
|
||||
}
|
||||
|
||||
func fetchPage(url string) *goquery.Document {
|
||||
doc, err := goquery.NewDocument(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
return doc
|
||||
}
|
||||
|
||||
func pullElementsAsText(d *goquery.Document, sel string) string {
|
||||
var ret string
|
||||
d.Find(sel).Each(func(i int, s *goquery.Selection) {
|
||||
ret = ret + strings.TrimSpace(s.Text()) + "\n"
|
||||
})
|
||||
return ret
|
||||
}
|
||||
|
||||
func pullOutText(d *goquery.Document, sel string) string {
|
||||
var ret string
|
||||
d.Find(sel).Each(func(i int, s *goquery.Selection) {
|
||||
ret = ret + strings.TrimSpace(s.Text()) + "\n"
|
||||
})
|
||||
return ret
|
||||
}
|
Loading…
Reference in New Issue
Block a user