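// Command fullscrape crawls a site starting from a root URL and saves the
// pages it finds under an output directory.
//
// Arguments, as parsed in main below: a root URL, an optional output
// directory, and an optional flag string in which 'd' sets the crawl depth,
// 'n' sets the no-rewrite flag, 'x' allows cross-domain links, and 't' sets
// the throttle between requests in milliseconds.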
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"

	"golang.org/x/net/html"
)

const programName = "fullscrape"

func main() {
	if len(os.Args) <= 1 {
		printUsage()
		os.Exit(1)
	}
	reqURL := os.Args[1] //"http://golang.org/"
	outDir := ""
	if len(os.Args) > 2 {
		outDir = os.Args[2]
	}
	depthFlag := -1
	norewriteFlag := false
	crossdomainFlag := false
	throttleFlag := 1000
	var err error

	if len(os.Args) > 3 {
		tst := os.Args[3]
		depthArg := strings.IndexRune(tst, 'd')
		if depthArg >= 0 {
			// The actual depth value should either be at depthArg+1
			// or, if that is '=', at depthArg+2
			if tst[depthArg+1] == '=' {
				depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+2])
			} else {
				depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+1])
			}
			if err != nil {
				fmt.Printf("Invalid depth given (must be an integer): %s\n", tst)
				os.Exit(1)
			}
		}
		norewriteFlag = (strings.IndexRune(tst, 'n') >= 0)
		crossdomainFlag = (strings.IndexRune(tst, 'x') >= 0)
		throttleArg := strings.IndexRune(tst, 't')
		if throttleArg >= 0 {
			// The actual throttle value should either be at throttleArg+1
			// or, if that is '=', at throttleArg+2
			if tst[throttleArg+1] == '=' {
				// The throttle argument MUST have a space after it
				throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+2])
			} else {
				throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+1])
			}
			if err != nil {
				fmt.Printf("Invalid throttle given (must be milliseconds as an integer): %s\n", tst)
				os.Exit(1)
			}
		}
	}

	if err := CreateDirIfNotExist(outDir); err != nil {
		fmt.Printf("Unable to create initial directory %s\n", outDir)
		fmt.Printf("Error: %s\n", err)
		os.Exit(1)
	}

	c := new(Crawler)
	// Make sure we have the protocol
	if strings.Index(reqURL, "http") != 0 {
		reqURL = "http://" + reqURL
	}
	if !strings.HasSuffix(reqURL, "/") {
		reqURL = reqURL + "/"
	}
	c.rootURL = reqURL
	c.outDir = outDir
	c.fixUrls = norewriteFlag
	c.xDomain = crossdomainFlag
	c.depth = depthFlag
	c.throttle = time.Duration(throttleFlag)
	// Parse out the Domain and TLD
	c.domain, c.tld, err = c.ParseURL(c.rootURL)
	if err != nil {
		fmt.Print(err)
		os.Exit(1)
	}

	c.Crawl()
}
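
// unprocessed carries a batch of links pulled from one page, together with
// the crawl depth remaining when they are visited.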
type unprocessed struct {
	depth int
	url   []string
}

// Crawler crawls
type Crawler struct {
	rootURL  string
	outDir   string
	fixUrls  bool
	xDomain  bool
	depth    int
	throttle time.Duration
	domain   string
	tld      string
}

// Crawl tells the crawler to start crawlin'
func (c *Crawler) Crawl() {
	if c.depth >= 0 {
		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
	} else {
		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
	}

	// Setup channel for inputs to be processed
	up := make(chan unprocessed, 0)

	// Kick off processing and count how many pages are left to process
	go c.getPage(c.rootURL, c.depth, up)
	outstanding := 1

	visited := make(map[string]bool)

	status := fmt.Sprintf("Files %d/%d\n", len(visited), outstanding+len(visited))
	for outstanding > 0 {
		done := len(visited) - outstanding
		if done < 0 {
			done = 0
		}
		status = fmt.Sprintf("Files %d/%d\n", done, len(visited))
		fmt.Print(status)
		if c.throttle > 0 {
			time.Sleep(time.Millisecond * c.throttle)
		}
		// Pop a visit from the channel
		next := <-up
		outstanding--

		// If we're too deep, skip it
		if next.depth == 0 {
			continue
		}

		// Loop over all urls to visit from that page
		for _, link := range next.url {
			// Check that we haven't visited them before
			if visited[link] {
				continue
			}

			// All good to visit them
			outstanding++
			visited[link] = true
			go c.getPage(link, next.depth, up)
		}
	}
	status = fmt.Sprintf("Files %d/%d\n", len(visited), len(visited))
	fmt.Printf("%s\n", status)
}
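
// getPage fetches a single page and reports the URLs found on it, along with
// the remaining depth, back to the crawl loop on r.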
func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
	_, urls, err := c.Fetch(url)
	//body, urls, err := c.Fetch(url)
	fmt.Printf("Found: %s\n", url)
	if err != nil {
		fmt.Println(err)
	}

	//fmt.Printf("Pulled URLS: %s\n", urls)

	r <- unprocessed{depth - 1, urls}
}

// Fetch initiates a page get
func (c *Crawler) Fetch(url string) (string, []string, error) {
	var urls []string
	// Ok, go get URL
	response, err := http.Get(url)
	if err != nil || response.StatusCode != 200 {
		return "", nil, err
	}
	body, err := ioutil.ReadAll(response.Body)
	response.Body.Close()
	if err != nil {
		return "", nil, err
	}

	// Save the body to the appropriate directory
	saveFile := strings.TrimPrefix(url, c.rootURL)
	if saveFile == "" {
		saveFile = "index.html"
	} else {
		saveArr := strings.Split(saveFile, "/")
		saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
		if strings.Index(saveDir, "/") != 0 {
			saveDir = "/" + saveDir
		}
		saveDir = c.outDir + saveDir

		if len(saveArr) > 0 {
			if err = CreateDirIfNotExist(saveDir); err != nil {
				fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
				fmt.Printf("Error: %s\n", err)
				os.Exit(1)
			}
		}
	}

	WriteFile(string(body), c.outDir+"/"+saveFile)

	fmt.Println("Parsing " + c.outDir + saveFile)

	// Read the body into a buffer
	bdReader := bytes.NewReader(body)
	z := html.NewTokenizer(bdReader)
	tt := z.Next()
	// Is this an HTML file?
	switch tt {
	case html.DoctypeToken:
		urls = append(urls, c.ParseHTML(body)...)
	case html.TextToken:
		parsedURLs := c.ParseText(body)
		fmt.Println("Found urls in text file: ")
		// Find file directory
		urlLoc := saveFile[:strings.LastIndex(saveFile, "/")+1]
		for i := range parsedURLs {
			if parsedURLs[i][0] == '/' {
				parsedURLs[i] = c.FormatURL(parsedURLs[i][1:])
			} else if parsedURLs[i][0] == '.' {
				parsedURLs[i] = c.FormatURL(urlLoc + parsedURLs[i])
			}
		}
		urls = append(urls, parsedURLs...)
	}

	return string(body), urls, nil
}

// ParseHTML parses an html tokenizer and returns a list of urls
func (c *Crawler) ParseHTML(bd []byte) []string {
	var urls []string
	bdReader := bytes.NewReader(bd)
	z := html.NewTokenizer(bdReader)
	tt := z.Next()
	for {
		tt = z.Next()
		switch {
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			t := z.Token()
			if t.Data == "link" || t.Data == "a" {
				for _, a := range t.Attr {
					if a.Key == "href" {
						if c.CheckURL(a.Val) {
							urls = append(urls, c.FormatURL(a.Val))
						} else {
							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
						}
						break
					}
				}
			} else if t.Data == "img" || t.Data == "script" {
				for _, a := range t.Attr {
					if a.Key == "src" {
						if c.CheckURL(a.Val) {
							urls = append(urls, c.FormatURL(a.Val))
						} else {
							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
						}
						break
					}
				}
			}
		}
		if tt == html.ErrorToken {
			break
		}
	}
	return urls
}

// ParseText parses a text file and returns a list of urls
func (c *Crawler) ParseText(bd []byte) []string {
	var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)
	var urls []string
	matches := cssURLs.FindAllSubmatch(bd, -1)
	for _, v := range matches {
		urls = append(urls, string(v[1]))
	}
	for i := range urls {
		if urls[i][0] == '\'' || urls[i][0] == '"' {
			urls[i] = urls[i][1 : len(urls[i])-1]
		}
	}
	return urls
}

// ParseURL parses a URL and returns its Domain and TLD
func (c *Crawler) ParseURL(url string) (string, string, error) {
	partSlice := strings.Split(url, "//")
	server := partSlice[1]
	partSlice = strings.Split(server, "/")
	server = partSlice[0]
	partSlice = strings.Split(server, ".")
	var tld, domain string

	// We just want the last two parts of the slice
	if len(partSlice) >= 2 {
		partSlice = partSlice[len(partSlice)-2:]
		domain = partSlice[0]
		tld = partSlice[1]
	} else {
		// There aren't two parts of the URL?! That's not right...
		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
	}
	return domain, tld, nil
}

// CheckURL checks if we should follow the url or not
// The main purpose is for cross-domain checks
func (c *Crawler) CheckURL(url string) bool {
	// Ignore anchor urls
	if strings.IndexRune(url, '#') >= 0 {
		return false
	}
	// Ignore "mailto" links
	if strings.HasPrefix(url, "mailto:") {
		return false
	}
	if !c.xDomain {
		if strings.HasPrefix(url, "http") {
			// Make sure that this url is in the same domain
			tstDmn, tstTld, err := c.ParseURL(url)
			if err != nil {
				// Error parsing the Domain/TLD out of the URL...
				return false
			}
			return (tstDmn == c.domain && tstTld == c.tld)
		}
	}
	return true
}

// FormatURL takes a url and, unless it starts with "http",
// appends it to the end of c.rootURL
func (c *Crawler) FormatURL(url string) string {
	// If the URL doesn't start with http, then it should be relative
	if strings.Index(url, "http") != 0 {
		url = c.rootURL + url
	}
	return url
}

// CreateDirIfNotExist checks if directory 'dir' exists.
// If it doesn't, it creates it.
func CreateDirIfNotExist(dir string) error {
	// Check if outDir exists, if not, try to make it
	_, err := os.Stat(dir)
	if os.IsNotExist(err) {
		// Doesn't exist, try to create
		if err = os.MkdirAll(dir, 0755); err != nil {
			return err
		}
	}
	// Directory exists, just return
	return nil
}

// WriteFile writes the data 'd' to 'filename'
func WriteFile(d string, filename string) error {
	do := []byte(d)
	//fmt.Printf("Writing %s\n", filename)
	return ioutil.WriteFile(filename, do, 0664)
}
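
// printUsage prints a short usage summary for the command.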
func printUsage() {
	fmt.Println("Usage: ...")
}