From fe664510567f9abb9892b3f71de56e7bbd05cc06 Mon Sep 17 00:00:00 2001
From: Brian Buller
Date: Thu, 21 Jan 2016 12:14:09 -0600
Subject: [PATCH] gofmt and some error checking

---
 .gitignore    |   0
 LICENSE       |   0
 fullscrape.go | 137 ++++++++++++++++++++++++++++----------------------
 3 files changed, 77 insertions(+), 60 deletions(-)
 mode change 100755 => 100644 .gitignore
 mode change 100755 => 100644 LICENSE

diff --git a/.gitignore b/.gitignore
old mode 100755
new mode 100644
diff --git a/LICENSE b/LICENSE
old mode 100755
new mode 100644
diff --git a/fullscrape.go b/fullscrape.go
index a1c68fe..287c24e 100644
--- a/fullscrape.go
+++ b/fullscrape.go
@@ -3,20 +3,28 @@ package main
 import (
 	"bytes"
 	"fmt"
-	"golang.org/x/net/html"
 	"io/ioutil"
 	"net/http"
 	"os"
 	"strconv"
 	"strings"
 	"time"
+
+	"golang.org/x/net/html"
 )
 
-const PROGRAM_NAME = "fullscrape"
+const programName = "fullscrape"
 
 func main() {
-	req_url := os.Args[1] //"http://golang.org/"
-	out_dir := os.Args[2]
+	if len(os.Args) <= 1 {
+		printUsage()
+		os.Exit(1)
+	}
+	reqURL := os.Args[1] //"http://golang.org/"
+	outDir := ""
+	if len(os.Args) > 2 {
+		outDir = os.Args[2]
+	}
 	depthFlag := -1
 	norewriteFlag := false
 	crossdomainFlag := false
@@ -58,28 +66,28 @@ func main() {
 		}
 	}
 
-	if err := CreateDirIfNotExist(out_dir); err != nil {
-		fmt.Print("Unable to create initial directory %s\n", out_dir)
+	if err := CreateDirIfNotExist(outDir); err != nil {
+		fmt.Print("Unable to create initial directory %s\n", outDir)
 		fmt.Print("Error: %s\n", err)
 		os.Exit(1)
 	}
 
 	c := new(Crawler)
 	// Make sure we have the protocol
-	if strings.Index(req_url, "http") != 0 {
-		req_url = "http://" + req_url
+	if strings.Index(reqURL, "http") != 0 {
+		reqURL = "http://" + reqURL
 	}
-	if !strings.HasSuffix(req_url, "/") {
-		req_url = req_url + "/"
+	if !strings.HasSuffix(reqURL, "/") {
+		reqURL = reqURL + "/"
 	}
-	c.rootUrl = req_url
-	c.outDir = out_dir
+	c.rootURL = reqURL
+	c.outDir = outDir
 	c.fixUrls = norewriteFlag
 	c.xDomain = crossdomainFlag
 	c.depth = depthFlag
 	c.throttle = time.Duration(throttleFlag)
 	// Parse out the Domain and TLD
-	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
+	c.domain, c.tld, err = c.ParseURL(c.rootURL)
 	if err != nil {
 		fmt.Print(err)
 		os.Exit(1)
@@ -93,8 +101,9 @@ type unprocessed struct {
 	url   []string
 }
 
+// Crawler crawls
 type Crawler struct {
-	rootUrl string
+	rootURL string
 	outDir  string
 	fixUrls bool
 	xDomain bool
@@ -104,18 +113,19 @@ type Crawler struct {
 	tld      string
 }
 
+// Crawl tells the crawler to start crawlin'
 func (c *Crawler) Crawl() {
 	if c.depth >= 0 {
-		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle)
+		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
 	} else {
-		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle)
+		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
 	}
 
 	// Setup channel for inputs to be processed
 	up := make(chan unprocessed, 0)
 
 	// Kick off processing and count how many pages are left to process
-	go c.getPage(c.rootUrl, c.depth, up)
+	go c.getPage(c.rootURL, c.depth, up)
 	outstanding := 1
 
 	visited := make(map[string]bool)
@@ -172,8 +182,9 @@ func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	r <- unprocessed{depth - 1, urls}
 }
 
+// Fetch initiates a page get
 func (c *Crawler) Fetch(url string) (string, []string, error) {
-	urls := make([]string, 0)
+	var urls []string
 	// Ok, go get URL
 	response, err := http.Get(url)
 	if err != nil || response.StatusCode != 200 {
@@ -183,31 +194,31 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	response.Body.Close()
 
 	// Save the body to the appropriate directory
-	save_file := strings.TrimPrefix(url, c.rootUrl)
-	if save_file == "" {
-		save_file = "index.html"
+	saveFile := strings.TrimPrefix(url, c.rootURL)
+	if saveFile == "" {
+		saveFile = "index.html"
 	} else {
-		save_arr := strings.Split(save_file, "/")
-		save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
-		if strings.Index(save_dir, "/") != 0 {
-			save_dir = "/" + save_dir
+		saveArr := strings.Split(saveFile, "/")
+		saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
+		if strings.Index(saveDir, "/") != 0 {
+			saveDir = "/" + saveDir
 		}
-		save_dir = c.outDir + save_dir
+		saveDir = c.outDir + saveDir
 
-		if len(save_arr) > 0 {
-			if err = CreateDirIfNotExist(save_dir); err != nil {
-				fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
+		if len(saveArr) > 0 {
+			if err = CreateDirIfNotExist(saveDir); err != nil {
+				fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
 				fmt.Printf("Error: %s\n", err)
 				os.Exit(1)
 			}
 		}
 	}
 
-	WriteFile(string(body), c.outDir+"/"+save_file)
+	WriteFile(string(body), c.outDir+"/"+saveFile)
 
 	// Read the body into a buffer
-	bd_reader := bytes.NewReader(body)
-	z := html.NewTokenizer(bd_reader)
+	bdReader := bytes.NewReader(body)
+	z := html.NewTokenizer(bdReader)
 	tt := z.Next()
 	// Is this an HTML file?
 	if tt != html.DoctypeToken {
@@ -223,10 +234,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			if t.Data == "link" || t.Data == "a" {
 				for _, a := range t.Attr {
 					if a.Key == "href" {
-						if c.CheckUrl(a.Val) {
-							urls = append(urls, c.FormatUrl(a.Val))
+						if c.CheckURL(a.Val) {
+							urls = append(urls, c.FormatURL(a.Val))
 						} else {
-							fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
+							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
 						}
 						break
 					}
@@ -234,10 +245,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			} else if t.Data == "img" || t.Data == "script" {
 				for _, a := range t.Attr {
 					if a.Key == "src" {
-						if c.CheckUrl(a.Val) {
-							urls = append(urls, c.FormatUrl(a.Val))
+						if c.CheckURL(a.Val) {
+							urls = append(urls, c.FormatURL(a.Val))
 						} else {
-							fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
+							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
 						}
 						break
 					}
@@ -256,22 +267,20 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	return string(body), urls, nil
 }
 
-/* ParseUrl parses a URL and returns its Domain and TLD
- * (An error is returned if it can't...)
- */
-func (c *Crawler) ParseUrl(url string) (string, string, error) {
-	part_slice := strings.Split(url, "//")
-	server := part_slice[1]
-	part_slice = strings.Split(server, "/")
-	server = part_slice[0]
-	part_slice = strings.Split(server, ".")
+// ParseURL parses a URL and returns its Domain and TLD
+func (c *Crawler) ParseURL(url string) (string, string, error) {
+	partSlice := strings.Split(url, "//")
+	server := partSlice[1]
+	partSlice = strings.Split(server, "/")
+	server = partSlice[0]
+	partSlice = strings.Split(server, ".")
 
 	var tld, domain string
 	// We just want the last two parts of the slice
-	if len(part_slice) >= 2 {
-		part_slice = part_slice[len(part_slice)-2:]
-		domain = part_slice[0]
-		tld = part_slice[1]
+	if len(partSlice) >= 2 {
+		partSlice = partSlice[len(partSlice)-2:]
+		domain = partSlice[0]
+		tld = partSlice[1]
 	} else {
 		// There aren't two parts ot the URL?! That's not right...
 		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
@@ -279,10 +288,9 @@ func (c *Crawler) ParseUrl(url string) (string, string, error) {
 	return domain, tld, nil
 }
 
-/* CheckUrl checks if we should follow the url or not
- * The main purpose is for cross-domain checks
- */
-func (c *Crawler) CheckUrl(url string) bool {
+// CheckURL checks if we should follow the url or not
+// The main purpose is for cross-domain checks
+func (c *Crawler) CheckURL(url string) bool {
 	// Ignore anchor urls
 	if strings.IndexRune(url, '#') >= 0 {
 		return false
@@ -294,27 +302,31 @@ func (c *Crawler) CheckUrl(url string) bool {
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
 			// Make sure that this url is in the same domain
-			tst_dmn, tst_tld, err := c.ParseUrl(url)
+			tstDmn, tstTld, err := c.ParseURL(url)
 			if err != nil {
 				// Error parsing the Domain/TLD out of the URL...
 				return false
 			}
-			return (tst_dmn == c.domain && tst_tld == c.tld)
+			return (tstDmn == c.domain && tstTld == c.tld)
 		}
 	}
 	return true
 }
 
-func (c *Crawler) FormatUrl(url string) string {
+// FormatURL takes a url and, unless it starts with "http"
+// appends it to the end of c.rootURL
+func (c *Crawler) FormatURL(url string) string {
 	// If the URL doesn't start with http, then it should be relative
 	if strings.Index(url, "http") != 0 {
-		url = c.rootUrl + url
+		url = c.rootURL + url
 	}
 	return url
 }
 
+// CreateDirIfNotExist Checks if directory 'dir' exists
+// If it doesn't, it creates it.
 func CreateDirIfNotExist(dir string) error {
-	// Check if out_dir exists, if not, try to make it
+	// Check if outDir exists, if not, try to make it
 	_, err := os.Stat(dir)
 	if os.IsNotExist(err) {
 		// Doesn't exist, try to create
@@ -326,8 +338,13 @@ func CreateDirIfNotExist(dir string) error {
 	return nil
 }
 
+// WriteFile writes the data 'd' to 'filename'
 func WriteFile(d string, filename string) error {
 	do := []byte(d)
 	//fmt.Printf("Writing %s\n", filename)
 	return ioutil.WriteFile(filename, do, 0664)
+}
+
+func printUsage() {
+	fmt.Println("Usage: ...")
 }
\ No newline at end of file
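
Note: the printUsage added at the end of this patch is only a stub that prints "Usage: ...". Below is a minimal sketch of a fuller message, assuming only the two positional arguments visible in these hunks (the request URL in os.Args[1] and an optional output directory in os.Args[2]); the binary name and wording are illustrative, and the flag spellings are not shown in this diff, so they are omitted. This sketch is not part of the patch.

    package main

    import "fmt"

    // printUsage: hypothetical expansion of the stub added above; documents
    // only the arguments that appear in this diff, nothing more.
    func printUsage() {
    	fmt.Println("Usage: fullscrape <url> [out-dir]")
    	fmt.Println("  <url>      starting page; \"http://\" is prepended when missing")
    	fmt.Println("  [out-dir]  directory to write fetched files into")
    }

    func main() {
    	printUsage()
    }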