// fullscrape/fullscrape.go

package main

import (
	"bytes"
	"fmt"
	"golang.org/x/net/html"
	"io/ioutil"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"
)

// PROGRAM_NAME is the program's name. Nothing in the visible source reads it.
const PROGRAM_NAME = "fullscrape"

// main parses the command line, configures a Crawler and starts the crawl.
//
// Usage: fullscrape <url> <output directory> [flags]
//
// The optional third argument is one string of flag letters:
//
//	d<N> or d=<N>  crawl depth (default -1, which Crawl treats as unlimited)
//	n              sets Crawler.fixUrls (reported by Crawl as "Norewrite")
//	x              allow following links to other domains
//	t<N> or t=<N>  delay between page visits in milliseconds (default 1000)
//
// The process exits with status 1 on missing arguments, an unparseable flag
// value, a failure to create the output directory or an unparseable URL.
func main() {
	// Both positional arguments are required; indexing os.Args blindly would
	// panic instead of telling the user what is missing.
	if len(os.Args) < 3 {
		fmt.Println("Usage: fullscrape <url> <output directory> [flags]")
		os.Exit(1)
	}
	reqURL := os.Args[1]
	outDir := os.Args[2]
	depthFlag := -1
	norewriteFlag := false
	crossdomainFlag := false
	throttleFlag := 1000
	var err error

	if len(os.Args) > 3 {
		flags := os.Args[3]
		if i := strings.IndexRune(flags, 'd'); i >= 0 {
			if depthFlag, err = parseFlagInt(flags, i); err != nil {
				fmt.Printf("Invalid depth given (must be an integer): %s\n", flags)
				os.Exit(1)
			}
		}
		norewriteFlag = strings.ContainsRune(flags, 'n')
		crossdomainFlag = strings.ContainsRune(flags, 'x')
		if i := strings.IndexRune(flags, 't'); i >= 0 {
			if throttleFlag, err = parseFlagInt(flags, i); err != nil {
				fmt.Printf("Invalid throttle given (must be milliseconds as an integer): %s\n", flags)
				os.Exit(1)
			}
		}
	}

	if err := CreateDirIfNotExist(outDir); err != nil {
		fmt.Printf("Unable to create initial directory %s\n", outDir)
		fmt.Printf("Error: %s\n", err)
		os.Exit(1)
	}

	c := new(Crawler)
	// Make sure we have the protocol and a trailing slash, because the other
	// Crawler methods build on rootUrl by plain string concatenation.
	if !strings.HasPrefix(reqURL, "http") {
		reqURL = "http://" + reqURL
	}
	if !strings.HasSuffix(reqURL, "/") {
		reqURL += "/"
	}
	c.rootUrl = reqURL
	c.outDir = outDir
	c.fixUrls = norewriteFlag
	c.xDomain = crossdomainFlag
	c.depth = depthFlag
	// Stored as a bare count of milliseconds: Crawl multiplies it by
	// time.Millisecond before sleeping.
	c.throttle = time.Duration(throttleFlag)
	// Parse out the Domain and TLD
	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
	if err != nil {
		fmt.Print(err)
		os.Exit(1)
	}

	c.Crawl()
}

// parseFlagInt returns the non-negative integer that follows the flag letter
// at index idx of flags, skipping one optional '=' in between. It returns an
// error when no digits follow the letter.
func parseFlagInt(flags string, idx int) (int, error) {
	rest := strings.TrimPrefix(flags[idx+1:], "=")
	end := 0
	for end < len(rest) && rest[end] >= '0' && rest[end] <= '9' {
		end++
	}
	return strconv.Atoi(rest[:end])
}

// unprocessed is the message getPage sends back to Crawl after fetching one
// page: the links found on that page plus the depth left for following them.
type unprocessed struct {
	// depth is the depth the page was fetched with, minus one (see getPage).
	// Crawl does not follow the links of a result whose depth is exactly 0.
	depth int
	// url lists the URLs returned by Fetch for the page; it is nil when the
	// fetch failed.
	url   []string
}

// Crawler holds the configuration of one crawl. main fills in every field
// before calling Crawl.
type Crawler struct {
	// rootUrl is the starting URL. main gives it a scheme and a trailing "/";
	// Fetch strips it from page URLs to derive save paths and FormatUrl
	// prepends it to links that do not start with "http".
	rootUrl  string
	// outDir is the directory under which Fetch saves downloaded bodies.
	outDir   string
	// fixUrls is set by main from the 'n' flag and printed by Crawl as
	// "Norewrite"; no other visible code reads it.
	fixUrls  bool
	// xDomain, when false, makes CheckUrl reject "http..." links whose domain
	// and TLD differ from domain/tld below.
	xDomain  bool
	// depth is the depth passed to the first getPage call; main defaults it
	// to -1.
	depth    int
	// throttle is a plain count of milliseconds: Crawl sleeps for
	// time.Millisecond * throttle on every loop iteration when it is > 0.
	throttle time.Duration
	// domain and tld are the values ParseUrl returns for rootUrl.
	domain   string
	tld      string
}

// Crawl fetches c.rootUrl and then every link discovered from it, each URL at
// most once, printing a running "Files done/total" status line.
//
// Every page is fetched by its own getPage goroutine, which reports back on a
// shared channel. The loop sleeps c.throttle milliseconds before taking each
// result. A result whose depth is exactly 0 does not have its links followed;
// a negative starting depth never reaches 0, so it places no limit on the
// crawl. Crawl returns once no fetch is outstanding.
func (c *Crawler) Crawl() {
	if c.depth >= 0 {
		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle)
	} else {
		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle)
	}

	// Channel on which the getPage goroutines report their results.
	up := make(chan unprocessed)

	// The root page is in flight from the start, so it counts as visited;
	// otherwise the first link back to it would download it a second time.
	visited := map[string]bool{c.rootUrl: true}
	go c.getPage(c.rootUrl, c.depth, up)
	outstanding := 1

	status := ""
	for outstanding > 0 {
		// Every visited URL is either finished or still outstanding.
		done := len(visited) - outstanding
		// Back up over the previous status so the new one overwrites it.
		fmt.Print(strings.Repeat("\b", len(status)))
		status = fmt.Sprintf("Files %d/%d", done, len(visited))
		fmt.Print(status)
		if c.throttle > 0 {
			// throttle holds a bare millisecond count, hence the multiply.
			time.Sleep(time.Millisecond * c.throttle)
		}
		// Pop a finished visit from the channel
		next := <-up
		outstanding--

		// If we're too deep, don't follow this page's links
		if next.depth == 0 {
			continue
		}

		// Start a fetch for every link that has not been seen before
		for _, link := range next.url {
			if visited[link] {
				continue
			}
			outstanding++
			visited[link] = true
			go c.getPage(link, next.depth, up)
		}
	}
	fmt.Print(strings.Repeat("\b", len(status)))
	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
	fmt.Printf("%s\n", status)
}

// getPage fetches a single url and reports the outcome on r.
//
// The "Found" line is printed whether or not the fetch succeeded; a fetch
// error is printed as well but does not stop the report. The value sent on r
// carries depth-1 together with whatever links Fetch returned.
func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
	_, links, fetchErr := c.Fetch(url)

	fmt.Printf("Found: %s\n", url)
	if fetchErr != nil {
		fmt.Println(fetchErr)
	}

	result := unprocessed{depth: depth - 1, url: links}
	r <- result
}

// Fetch downloads url, saves the response body beneath c.outDir and, when the
// body starts with a doctype, scans it for links to follow.
//
// The save path is url with the c.rootUrl prefix removed; an empty path or one
// ending in "/" is saved as index.html inside that directory. A failure to
// create the target directory terminates the process; a failure to write the
// file is reported and the page is still scanned.
//
// It returns the body, the URLs taken from <a>/<link> href and <img>/<script>
// src attributes that passed CheckUrl (run through FormatUrl), and an error
// when the request fails, the status is not 200 or the body cannot be read.
func (c *Crawler) Fetch(url string) (string, []string, error) {
	urls := make([]string, 0)

	response, err := http.Get(url)
	if err != nil {
		return "", nil, err
	}
	// Deferred so the connection is released on every path, including the
	// non-200 one below.
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		return "", nil, fmt.Errorf("fetching %s: unexpected status %s", url, response.Status)
	}
	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		// Don't save or parse a truncated body.
		return "", nil, fmt.Errorf("reading %s: %v", url, err)
	}

	// Work out where the body goes relative to the output directory.
	saveFile := strings.TrimPrefix(url, c.rootUrl)
	if saveFile == "" || strings.HasSuffix(saveFile, "/") {
		// A directory-style URL has no file name of its own.
		saveFile += "index.html"
	}
	saveDir := ""
	if i := strings.LastIndex(saveFile, "/"); i >= 0 {
		saveDir = saveFile[:i]
	}
	if !strings.HasPrefix(saveDir, "/") {
		saveDir = "/" + saveDir
	}
	saveDir = c.outDir + saveDir
	if err := CreateDirIfNotExist(saveDir); err != nil {
		fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
		fmt.Printf("Error: %s\n", err)
		os.Exit(1)
	}

	savePath := c.outDir + "/" + saveFile
	if err := WriteFile(string(body), savePath); err != nil {
		// Saving is best effort: report it and keep crawling.
		fmt.Printf("Unable to save %s: %s\n", savePath, err)
	}

	// Only bodies whose very first token is a doctype are treated as HTML.
	z := html.NewTokenizer(bytes.NewReader(body))
	if z.Next() != html.DoctypeToken {
		return string(body), urls, nil
	}

	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			// End of input (or malformed HTML): stop scanning.
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}

		t := z.Token()
		// Pick the attribute that carries the link for this tag.
		var linkAttr string
		switch t.Data {
		case "link", "a":
			linkAttr = "href"
		case "img", "script":
			linkAttr = "src"
		default:
			continue
		}
		for _, a := range t.Attr {
			if a.Key != linkAttr {
				continue
			}
			if c.CheckUrl(a.Val) {
				urls = append(urls, c.FormatUrl(a.Val))
			} else {
				fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
			}
			// Only the first matching attribute of a tag is considered.
			break
		}
	}

	return string(body), urls, nil
}

// ParseUrl extracts the domain and TLD from url, i.e. the last two
// dot-separated labels of the host that follows "//".
// For "http://www.example.com/x" it returns ("example", "com", nil).
//
// An error is returned when url has no "//" separator or when its host has
// fewer than two labels.
func (c *Crawler) ParseUrl(url string) (string, string, error) {
	// Without a "//" there is no host to look at. Callers pass arbitrary link
	// text here, so this must be an error rather than an out-of-range panic.
	start := strings.Index(url, "//")
	if start < 0 {
		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
	}

	// The host runs up to the start of the path, query or fragment.
	host := url[start+2:]
	if end := strings.IndexAny(host, "/?#"); end >= 0 {
		host = host[:end]
	}

	// We just want the last two labels of the host
	labels := strings.Split(host, ".")
	if len(labels) < 2 {
		// There aren't two parts to the host?! That's not right...
		// (The message keeps its trailing newline because main prints it
		// verbatim with fmt.Print.)
		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
	}
	return labels[len(labels)-2], labels[len(labels)-1], nil
}

// CheckUrl reports whether the crawler should follow url.
//
// Links containing '#' and "mailto:" links are never followed. When
// cross-domain crawling is off, a link starting with "http" is followed only
// if ParseUrl yields the same domain and TLD as the crawl root; a link that
// ParseUrl cannot parse is rejected. Everything else is followed.
func (c *Crawler) CheckUrl(url string) bool {
	switch {
	case strings.ContainsRune(url, '#'):
		// Anchor link
		return false
	case strings.HasPrefix(url, "mailto:"):
		return false
	case c.xDomain, !strings.HasPrefix(url, "http"):
		// Either any domain is acceptable, or the link is not absolute and
		// so is not subject to the domain check.
		return true
	}

	linkDomain, linkTld, err := c.ParseUrl(url)
	if err != nil {
		return false
	}
	return linkDomain == c.domain && linkTld == c.tld
}

// FormatUrl turns a link into a URL that can be fetched.
//
// A link starting with "http" is returned unchanged. Anything else is treated
// as relative to the crawl root (not to the page it was found on) and joined
// to c.rootUrl with exactly one "/" in between, so "/a.html" and "a.html"
// produce the same URL.
func (c *Crawler) FormatUrl(url string) string {
	if strings.HasPrefix(url, "http") {
		return url
	}
	// Trim both sides of the join: plain concatenation gives "//" for links
	// with a leading slash, which makes one page appear under two URLs.
	return strings.TrimRight(c.rootUrl, "/") + "/" + strings.TrimLeft(url, "/")
}

// CreateDirIfNotExist makes sure dir exists as a directory, creating it and
// any missing parents with mode 0755.
//
// It returns nil when dir already is a directory, and an error when dir
// cannot be created or already exists as something other than a directory.
func CreateDirIfNotExist(dir string) error {
	// os.MkdirAll treats an existing directory as success, so no separate
	// os.Stat is needed. Stat errors other than "not exist" are no longer
	// mistaken for "directory exists".
	return os.MkdirAll(dir, 0755)
}

// WriteFile stores the string d in the file named filename, creating the file
// with mode 0664 if it does not exist and replacing its contents otherwise.
// It returns the error from ioutil.WriteFile, if any.
func WriteFile(d string, filename string) error {
	return ioutil.WriteFile(filename, []byte(d), 0664)
}