Adding the Source
This commit is contained in:
		
							
								
								
									
										86
									
								
								autoscrape.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										86
									
								
								autoscrape.go
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,86 @@
 | 
			
		||||
package main
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"github.com/PuerkitoBio/goquery"
 | 
			
		||||
	"log"
 | 
			
		||||
	"os"
 | 
			
		||||
	"strconv"
 | 
			
		||||
	"strings"
 | 
			
		||||
	"time"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// PROGRAM_NAME is the executable name shown in the usage message printed by main.
const PROGRAM_NAME = "autoscrape"
 | 
			
		||||
 | 
			
		||||
func main() {
 | 
			
		||||
	if len(os.Args) <= 2 {
 | 
			
		||||
		fmt.Print("Usage: " + PROGRAM_NAME + " <url|url-json> <selector|selector-json> [page-parameter] [max-pages]\n")
 | 
			
		||||
		os.Exit(1)
 | 
			
		||||
	}
 | 
			
		||||
	url := os.Args[1]
 | 
			
		||||
	selector := os.Args[2]
 | 
			
		||||
 | 
			
		||||
	page := ""
 | 
			
		||||
	num_pages := -1
 | 
			
		||||
	if len(os.Args) > 3 {
 | 
			
		||||
		page = os.Args[3]
 | 
			
		||||
	}
 | 
			
		||||
	if len(os.Args) > 4 {
 | 
			
		||||
		var err error
 | 
			
		||||
		num_pages, err = strconv.Atoi(os.Args[4])
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			num_pages = -1
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	last_content := ""
 | 
			
		||||
	curr_page := 1
 | 
			
		||||
	ret := ""
 | 
			
		||||
	if page != "" {
 | 
			
		||||
		fmt.Println(">> Page parameter detected! ADVANCED MODE!")
 | 
			
		||||
		for curr_page != num_pages {
 | 
			
		||||
			r_url := url + page
 | 
			
		||||
			if strings.Index(r_url, "%d") == -1 {
 | 
			
		||||
				r_url = r_url + "%d"
 | 
			
		||||
			}
 | 
			
		||||
			r_url = fmt.Sprintf(r_url, curr_page)
 | 
			
		||||
			fmt.Println(">>>> Fetching " + r_url)
 | 
			
		||||
			new_content := pullOutText(fetchPage(r_url), selector)
 | 
			
		||||
			if new_content == last_content {
 | 
			
		||||
				num_pages = curr_page
 | 
			
		||||
			} else {
 | 
			
		||||
				fmt.Print(new_content)
 | 
			
		||||
				last_content = new_content
 | 
			
		||||
			}
 | 
			
		||||
			curr_page += 1
 | 
			
		||||
			time.Sleep(1 * time.Second)
 | 
			
		||||
		}
 | 
			
		||||
	} else {
 | 
			
		||||
		fmt.Print(pullOutText(fetchPage(url), selector))
 | 
			
		||||
	}
 | 
			
		||||
	fmt.Print(ret)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func fetchPage(url string) *goquery.Document {
 | 
			
		||||
	doc, err := goquery.NewDocument(url)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Fatal(err)
 | 
			
		||||
	}
 | 
			
		||||
	return doc
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func pullElementsAsText(d *goquery.Document, sel string) string {
 | 
			
		||||
	var ret string
 | 
			
		||||
	d.Find(sel).Each(func(i int, s *goquery.Selection) {
 | 
			
		||||
		ret = ret + strings.TrimSpace(s.Text()) + "\n"
 | 
			
		||||
	})
 | 
			
		||||
	return ret
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func pullOutText(d *goquery.Document, sel string) string {
 | 
			
		||||
	var ret string
 | 
			
		||||
	d.Find(sel).Each(func(i int, s *goquery.Selection) {
 | 
			
		||||
		ret = ret + strings.TrimSpace(s.Text()) + "\n"
 | 
			
		||||
	})
 | 
			
		||||
	return ret
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user