From 9afaeee97d22733127ed20f71a1abf5960f68f7e Mon Sep 17 00:00:00 2001
From: Brian Buller
Date: Thu, 30 Jul 2015 14:55:05 -0500
Subject: [PATCH] Mostly Functional

Just need to clean up a few things and we're good to go!
---
 fullscrape.go | 239 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 239 insertions(+)
 create mode 100644 fullscrape.go

diff --git a/fullscrape.go b/fullscrape.go
new file mode 100644
index 0000000..7cb09b8
--- /dev/null
+++ b/fullscrape.go
@@ -0,0 +1,239 @@
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"golang.org/x/net/html"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+)
+
+const PROGRAM_NAME = "fullscrape"
+
+func main() {
+	if len(os.Args) <= 3 {
+		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <out_dir> <depth> [-nx]\n")
+		fmt.Print("  -n: Don't rewrite urls in source files to work locally\n")
+		fmt.Print("  -x: Cross domains when following links\n")
+		os.Exit(1)
+	}
+	fix_urls := true
+	cross_domains := false
+	req_url := os.Args[1] //"http://golang.org/"
+	out_dir := os.Args[2]
+	req_depth, err := strconv.Atoi(os.Args[3]) //4
+	if err != nil {
+		fmt.Print("Invalid Depth specified. Please give a number.\n")
+		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <out_dir> <depth> [-nx]\n")
+		os.Exit(1)
+	}
+	if len(os.Args) > 4 {
+		tst_arg := os.Args[4]
+		if strings.Index(tst_arg, "n") != -1 {
+			fix_urls = false
+		} else if strings.Index(tst_arg, "x") != -1 {
+			cross_domains = true
+		}
+	}
+	if err = CreateDirIfNotExist(out_dir); err != nil {
+		fmt.Printf("Unable to create initial directory %s\n", out_dir)
+		fmt.Printf("Error: %s\n", err)
+		os.Exit(1)
+	}
+
+	c := new(Crawler)
+	// Make sure we have the protocol
+	if strings.Index(req_url, "http") != 0 {
+		req_url = "http://" + req_url
+	}
+	if !strings.HasSuffix(req_url, "/") {
+		req_url = req_url + "/"
+	}
+	c.rootUrl = req_url
+	c.outDir = out_dir
+	c.fixUrls = fix_urls
+	c.xDomain = cross_domains
+
+	c.Crawl(req_url, req_depth)
+}
+
+type unprocessed struct {
+	depth int
+	url   []string
+}
+
+type Crawler struct {
+	rootUrl string
+	outDir  string
+	fixUrls bool
+	xDomain bool
+}
+
+func (c *Crawler) Crawl(url string, depth int) {
+	// Setup channel for inputs to be processed
+	up := make(chan unprocessed, 0)
+
+	// Kick off processing and count how many pages are left to process
+	go c.getPage(url, depth, up)
+	outstanding := 1
+
+	visited := make(map[string]bool)
+	for outstanding > 0 {
+		// Pop a visit from the channel
+		next := <-up
+		outstanding--
+
+		// If we're too deep, skip it
+		if next.depth <= 0 {
+			continue
+		}
+
+		// Loop over all urls to visit from that page
+		for _, link := range next.url {
+			// Check that we haven't visited them before
+			if visited[link] {
+				continue
+			}
+
+			// All good to visit them
+			outstanding++
+			visited[link] = true
+			go c.getPage(link, next.depth, up)
+		}
+	}
+}
+
+func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
+	_, urls, err := c.Fetch(url)
+	//body, urls, err := c.Fetch(url)
+	fmt.Printf("Found: %s\n", url)
+	if err != nil {
+		fmt.Println(err)
+	}
+
+	fmt.Printf("Pulled URLS: %s\n", urls)
+
+	r <- unprocessed{depth - 1, urls}
+}
+
+func (c *Crawler) Fetch(url string) (string, []string, error) {
+	urls := make([]string, 0)
+	// Ok, go get URL
+	response, err := http.Get(url)
+	if err != nil {
+		return "", nil, err
+	}
+	body, err := ioutil.ReadAll(response.Body)
+	response.Body.Close()
+
+	// Save the body to the appropriate directory
+	save_file := strings.TrimPrefix(url, c.rootUrl)
+	if save_file == "" {
+		save_file = "index.html"
+	} else {
+		save_arr := strings.Split(save_file, "/")
+		save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
+		if strings.Index(save_dir, "/") != 0 {
+			save_dir = "/" + save_dir
+		}
+		save_dir = c.outDir + save_dir
+
+		if len(save_arr) > 0 {
+			if err = CreateDirIfNotExist(save_dir); err != nil {
+				fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
+				fmt.Printf("Error: %s\n", err)
+				os.Exit(1)
+			}
+		}
+	}
+
+	WriteFile(string(body), c.outDir+"/"+save_file)
+
+	// Read the body into a buffer
+	bd_reader := bytes.NewReader(body)
+	z := html.NewTokenizer(bd_reader)
+	tt := z.Next()
+	// Is this an HTML file?
+	if tt != html.DoctypeToken {
+		// Nope, so we're done here
+		return string(body), urls, nil
+	}
+
+	for {
+		tt := z.Next()
+		switch {
+		case tt == html.StartTagToken:
+			t := z.Token()
+			if t.Data == "a" || t.Data == "link" {
+				for _, a := range t.Attr {
+					if a.Key == "href" {
+						if c.CheckUrl(a.Val) {
+							urls = append(urls, c.FormatUrl(a.Val))
+						}
+						break
+					}
+				}
+			} else if t.Data == "img" {
+				for _, a := range t.Attr {
+					if a.Key == "src" {
+						if c.CheckUrl(a.Val) {
+							urls = append(urls, c.FormatUrl(a.Val))
+						}
+						break
+					}
+				}
+			}
+		}
+		if tt == html.ErrorToken {
+			break
+		}
+	}
+
+	if err != nil {
+		return "", nil, err
+	}
+
+	return string(body), urls, nil
+}
+
+/* CheckUrl checks if we should follow the url or not
+ * The main purpose is for cross-domain checks
+ */
+func (c *Crawler) CheckUrl(url string) bool {
+	if !c.xDomain {
+		if strings.HasPrefix(url, "http") {
+			return strings.HasPrefix(url, c.rootUrl)
+		}
+	}
+	return true
+}
+
+func (c *Crawler) FormatUrl(url string) string {
+	// If the URL doesn't start with http, then it should be relative
+	if strings.Index(url, "http") != 0 {
+		url = c.rootUrl + url
+	}
+	return url
+}
+
+func CreateDirIfNotExist(dir string) error {
+	// Check if out_dir exists, if not, try to make it
+	_, err := os.Stat(dir)
+	if os.IsNotExist(err) {
+		// Doesn't exist, try to create
+		if err = os.MkdirAll(dir, 0755); err != nil {
+			return err
+		}
+	}
+	// Directory exists, just return
+	return nil
+}
+
+func WriteFile(d string, filename string) error {
+	do := []byte(d)
+	fmt.Printf("Writing %s\n", filename)
+	return ioutil.WriteFile(filename, do, 0664)
+}
\ No newline at end of file