// fullscrape/fullscrape.go
// (pasted web-view residue — line/size counts, "Raw Normal View History" — converted to a comment so the file compiles)
package main
import (
"bytes"
"fmt"
"golang.org/x/net/html"
"io/ioutil"
"net/http"
"os"
"strconv"
"strings"
"time"
)
const PROGRAM_NAME = "fullscrape"
func main() {
req_url := os.Args[1] //"http://golang.org/"
out_dir := os.Args[2]
depthFlag := -1
norewriteFlag := false
crossdomainFlag := false
throttleFlag := 1000
var err error
if len(os.Args) > 3 {
tst := os.Args[3]
depthArg := strings.IndexRune(tst, 'd')
if depthArg >= 0 {
// The actual depth value should either be depthArg+1
// or, if that is '=', depthArg+2
if tst[depthArg+1] == '=' {
depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+2])
} else {
depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+1])
}
if err != nil {
fmt.Printf("Invalid depth given (must be an integer): %s\n", depthFlag)
os.Exit(1)
}
}
norewriteFlag = (strings.IndexRune(tst, 'n') >= 0)
crossdomainFlag = (strings.IndexRune(tst, 'x') >= 0)
throttleArg := strings.IndexRune(tst, 't')
if throttleArg >= 0 {
// The actual throttle value should either be throttleArg+1...
// or, if that is '=', throttleArg+2...
if tst[depthArg+1] == '=' {
// The throttle argument MUST have a space after it
throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+2])
} else {
throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+1])
}
if err != nil {
fmt.Printf("Invalid depth given (must be milliseconds as an integer): %s\n", depthFlag)
os.Exit(1)
}
}
}
if err := CreateDirIfNotExist(out_dir); err != nil {
fmt.Print("Unable to create initial directory %s\n", out_dir)
fmt.Print("Error: %s\n", err)
os.Exit(1)
}
c := new(Crawler)
// Make sure we have the protocol
if strings.Index(req_url, "http") != 0 {
req_url = "http://" + req_url
}
if !strings.HasSuffix(req_url, "/") {
req_url = req_url + "/"
}
c.rootUrl = req_url
c.outDir = out_dir
c.fixUrls = norewriteFlag
c.xDomain = crossdomainFlag
c.depth = depthFlag
c.throttle = time.Duration(throttleFlag)
c.Crawl()
}
// unprocessed is the message a fetch goroutine sends back to Crawl: the set
// of links harvested from one page, plus the crawl depth that applied there.
type unprocessed struct {
// remaining depth budget carried over from the page these urls came from
depth int
// every followable link found on that one page
url []string
}
// Crawler holds the configuration for a single scraping run. Populate the
// fields and call Crawl.
type Crawler struct {
// normalized start url; main ensures it ends with "/"
rootUrl string
// directory the mirrored files are written under
outDir string
// mirrors the 'n' ("norewrite") command-line flag
// NOTE(review): never read by the visible code — confirm it is used elsewhere
fixUrls bool
// when true, links outside rootUrl may be followed (the 'x' flag)
xDomain bool
// how many link-levels to follow; negative means unlimited
depth int
// pause between fetches; stored as a raw millisecond count and
// scaled by time.Millisecond inside Crawl
throttle time.Duration
}
// Crawl fetches the root url and then walks every discovered link, fetching
// each url at most once, until the depth budget is exhausted or no unvisited
// links remain. Pages are fetched concurrently (one goroutine per page) and
// their links funnel back through a single channel.
func (c *Crawler) Crawl() {
	if c.depth >= 0 {
		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle)
	} else {
		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle)
	}
	// Channel over which the fetch goroutines report discovered links.
	up := make(chan unprocessed)
	// Kick off processing and count how many pages are left to process.
	go c.getPage(c.rootUrl, c.depth, up)
	outstanding := 1
	visited := make(map[string]bool)
	// BUG FIX: record the root url so a link pointing back at it is not
	// fetched a second time.
	visited[c.rootUrl] = true
	status := fmt.Sprintf("Files %d/%d", len(visited), outstanding+len(visited))
	for outstanding > 0 {
		done := len(visited) - outstanding
		if done < 0 {
			done = 0
		}
		// Erase the previous status line with backspaces before rewriting it.
		// (The original printed strings.Repeat("", n), which is a no-op.)
		fmt.Print(strings.Repeat("\b", len(status)))
		status = fmt.Sprintf("Files %d/%d", done, len(visited))
		fmt.Print(status)
		if c.throttle > 0 {
			// throttle holds a raw millisecond count, so scale it here.
			time.Sleep(time.Millisecond * c.throttle)
		}
		// Pop one page's results from the channel.
		next := <-up
		outstanding--
		// Depth budget hit zero at that page: don't follow its links.
		// (A negative starting depth never reaches 0, i.e. unlimited.)
		if next.depth == 0 {
			continue
		}
		for _, link := range next.url {
			// Skip anything we've already queued or fetched.
			if visited[link] {
				continue
			}
			outstanding++
			visited[link] = true
			go c.getPage(link, next.depth, up)
		}
	}
	fmt.Print(strings.Repeat("\b", len(status)))
	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
	fmt.Printf("%s\n", status)
}
// getPage fetches a single url (Fetch also saves the body to disk) and
// reports the links found on it back over the results channel.
func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
	_, links, err := c.Fetch(url)
	if err != nil {
		fmt.Println(err)
	}
	// Hand the discovered links back with one less level of depth remaining.
	r <- unprocessed{depth - 1, links}
}
func (c *Crawler) Fetch(url string) (string, []string, error) {
urls := make([]string, 0)
// Ok, go get URL
response, err := http.Get(url)
if err != nil || response.StatusCode != 200 {
return "", nil, err
}
body, err := ioutil.ReadAll(response.Body)
response.Body.Close()
// Save the body to the appropriate directory
save_file := strings.TrimPrefix(url, c.rootUrl)
if save_file == "" {
save_file = "index.html"
} else {
save_arr := strings.Split(save_file, "/")
save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
if strings.Index(save_dir, "/") != 0 {
save_dir = "/" + save_dir
}
save_dir = c.outDir + save_dir
if len(save_arr) > 0 {
if err = CreateDirIfNotExist(save_dir); err != nil {
fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
fmt.Printf("Error: %s\n", err)
os.Exit(1)
}
}
}
WriteFile(string(body), c.outDir+"/"+save_file)
// Read the body into a buffer
bd_reader := bytes.NewReader(body)
z := html.NewTokenizer(bd_reader)
tt := z.Next()
// Is this an HTML file?
if tt != html.DoctypeToken {
// Nope, so we're done here
return string(body), urls, nil
}
for {
tt := z.Next()
switch {
case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
t := z.Token()
if t.Data == "link" || t.Data == "a" {
for _, a := range t.Attr {
if a.Key == "href" {
if c.CheckUrl(a.Val) {
urls = append(urls, c.FormatUrl(a.Val))
}
break
}
}
} else if t.Data == "img" || t.Data == "script" {
for _, a := range t.Attr {
if a.Key == "src" {
if c.CheckUrl(a.Val) {
urls = append(urls, c.FormatUrl(a.Val))
}
break
}
}
}
}
if tt == html.ErrorToken {
break
}
}
if err != nil {
return "", nil, err
}
return string(body), urls, nil
}
/* CheckUrl decides whether a link should be followed.
 * Its main job is keeping the crawl on the root domain.
 */
func (c *Crawler) CheckUrl(url string) bool {
	switch {
	case strings.ContainsRune(url, '#'):
		// In-page anchor links are never followed.
		return false
	case strings.HasPrefix(url, "mailto:"):
		// Nor are email links.
		return false
	}
	// Cross-domain mode, or a scheme-less (relative) url: always allowed.
	if c.xDomain || !strings.HasPrefix(url, "http") {
		return true
	}
	// Absolute url with cross-domain disabled: must stay under the root.
	return strings.HasPrefix(url, c.rootUrl)
}
// FormatUrl returns url as an absolute url rooted at rootUrl.
func (c *Crawler) FormatUrl(url string) string {
	// Already absolute? Leave it untouched.
	if strings.HasPrefix(url, "http") {
		return url
	}
	// No scheme present, so treat it as relative to the crawl root.
	return c.rootUrl + url
}
func CreateDirIfNotExist(dir string) error {
// Check if out_dir exists, if not, try to make it
_, err := os.Stat(dir)
if os.IsNotExist(err) {
// Doesn't exist, try to create
if err = os.MkdirAll(dir, 0755); err != nil {
return err
}
}
// Directory exists, just return
return nil
}
func WriteFile(d string, filename string) error {
do := []byte(d)
//fmt.Printf("Writing %s\n", filename)
return ioutil.WriteFile(filename, do, 0664)
}