gofmt and some error checking
parent 197750425d
commit fe66451056
.gitignore (vendored): 0 changes, Executable file → Normal file
fullscrape.go: 137 changes
@@ -3,20 +3,28 @@ package main
 import (
 	"bytes"
 	"fmt"
-	"golang.org/x/net/html"
 	"io/ioutil"
 	"net/http"
 	"os"
 	"strconv"
 	"strings"
 	"time"
+
+	"golang.org/x/net/html"
 )
 
-const PROGRAM_NAME = "fullscrape"
+const programName = "fullscrape"
 
 func main() {
-	req_url := os.Args[1] //"http://golang.org/"
-	out_dir := os.Args[2]
+	if len(os.Args) <= 1 {
+		printUsage()
+		os.Exit(1)
+	}
+	reqURL := os.Args[1] //"http://golang.org/"
+	outDir := ""
+	if len(os.Args) > 2 {
+		outDir = os.Args[2]
+	}
 	depthFlag := -1
 	norewriteFlag := false
 	crossdomainFlag := false
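Note: the new guard exits via printUsage() when no URL is supplied and makes the output directory optional. For comparison only, a hedged sketch of equivalent argument handling built on the standard flag package; the -depth flag and usage string below are illustrative, not part of this commit:

package main

import (
	"flag"
	"fmt"
	"os"
)

func main() {
	// -depth is illustrative; the real program parses its flags by hand
	depth := flag.Int("depth", -1, "maximum crawl depth (-1 for unlimited)")
	flag.Parse()
	if flag.NArg() < 1 {
		fmt.Fprintln(os.Stderr, "usage: fullscrape [flags] url [outdir]")
		os.Exit(1)
	}
	reqURL := flag.Arg(0) // first positional argument: the start URL
	outDir := ""
	if flag.NArg() > 1 {
		outDir = flag.Arg(1) // optional output directory
	}
	fmt.Println(reqURL, outDir, *depth)
}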
@@ -58,28 +66,28 @@ func main() {
 		}
 	}
 
-	if err := CreateDirIfNotExist(out_dir); err != nil {
-		fmt.Print("Unable to create initial directory %s\n", out_dir)
+	if err := CreateDirIfNotExist(outDir); err != nil {
+		fmt.Print("Unable to create initial directory %s\n", outDir)
 		fmt.Print("Error: %s\n", err)
 		os.Exit(1)
 	}
 
 	c := new(Crawler)
 	// Make sure we have the protocol
-	if strings.Index(req_url, "http") != 0 {
-		req_url = "http://" + req_url
+	if strings.Index(reqURL, "http") != 0 {
+		reqURL = "http://" + reqURL
 	}
-	if !strings.HasSuffix(req_url, "/") {
-		req_url = req_url + "/"
+	if !strings.HasSuffix(reqURL, "/") {
+		reqURL = reqURL + "/"
 	}
-	c.rootUrl = req_url
-	c.outDir = out_dir
+	c.rootURL = reqURL
+	c.outDir = outDir
 	c.fixUrls = norewriteFlag
 	c.xDomain = crossdomainFlag
 	c.depth = depthFlag
 	c.throttle = time.Duration(throttleFlag)
 	// Parse out the Domain and TLD
-	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
+	c.domain, c.tld, err = c.ParseURL(c.rootURL)
 	if err != nil {
 		fmt.Print(err)
 		os.Exit(1)
@@ -93,8 +101,9 @@ type unprocessed struct {
 	url []string
 }
 
+// Crawler crawls
 type Crawler struct {
-	rootUrl string
+	rootURL string
 	outDir  string
 	fixUrls bool
 	xDomain bool
@@ -104,18 +113,19 @@ type Crawler struct {
 	tld string
 }
 
+// Crawl tells the crawler to start crawlin'
 func (c *Crawler) Crawl() {
 	if c.depth >= 0 {
-		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle)
+		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
 	} else {
-		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle)
+		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
 	}
 
 	// Setup channel for inputs to be processed
 	up := make(chan unprocessed, 0)
 
 	// Kick off processing and count how many pages are left to process
-	go c.getPage(c.rootUrl, c.depth, up)
+	go c.getPage(c.rootURL, c.depth, up)
 	outstanding := 1
 
 	visited := make(map[string]bool)
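The hunk above only shows the kickoff; the receive loop falls outside it. A minimal self-contained sketch, under the assumption that Crawl drains the channel while counting outstanding fetches; fetch here is a stand-in for c.getPage, not the real implementation:

package main

import "fmt"

type unprocessed struct {
	depth int
	url   []string
}

// fetch is a stand-in for c.getPage: it always sends exactly one result
func fetch(url string, depth int, out chan<- unprocessed) {
	out <- unprocessed{depth - 1, nil} // pretend the page links nowhere
}

func main() {
	up := make(chan unprocessed)
	go fetch("http://example.com/", 2, up)
	outstanding := 1
	visited := map[string]bool{"http://example.com/": true}
	for outstanding > 0 {
		next := <-up // one receive per fetch kicked off
		outstanding--
		for _, u := range next.url {
			if next.depth == 0 || visited[u] {
				continue // depth exhausted or already seen
			}
			visited[u] = true
			outstanding++
			go fetch(u, next.depth, up)
		}
	}
	fmt.Println("all outstanding pages processed")
}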
@@ -172,8 +182,9 @@ func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	r <- unprocessed{depth - 1, urls}
 }
 
+// Fetch initiates a page get
 func (c *Crawler) Fetch(url string) (string, []string, error) {
-	urls := make([]string, 0)
+	var urls []string
 	// Ok, go get URL
 	response, err := http.Get(url)
 	if err != nil || response.StatusCode != 200 {
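The urls change is safe because a nil slice is ready for append; a quick demonstration:

package main

import "fmt"

func main() {
	var urls []string // nil slice: no allocation until first append
	fmt.Println(urls == nil, len(urls)) // true 0
	urls = append(urls, "http://example.com/")
	fmt.Println(len(urls), urls[0]) // 1 http://example.com/
}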
@@ -183,31 +194,31 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	response.Body.Close()
 
 	// Save the body to the appropriate directory
-	save_file := strings.TrimPrefix(url, c.rootUrl)
-	if save_file == "" {
-		save_file = "index.html"
+	saveFile := strings.TrimPrefix(url, c.rootURL)
+	if saveFile == "" {
+		saveFile = "index.html"
 	} else {
-		save_arr := strings.Split(save_file, "/")
-		save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
-		if strings.Index(save_dir, "/") != 0 {
-			save_dir = "/" + save_dir
+		saveArr := strings.Split(saveFile, "/")
+		saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
+		if strings.Index(saveDir, "/") != 0 {
+			saveDir = "/" + saveDir
 		}
-		save_dir = c.outDir + save_dir
+		saveDir = c.outDir + saveDir
 
-		if len(save_arr) > 0 {
-			if err = CreateDirIfNotExist(save_dir); err != nil {
-				fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
+		if len(saveArr) > 0 {
+			if err = CreateDirIfNotExist(saveDir); err != nil {
+				fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
 				fmt.Printf("Error: %s\n", err)
 				os.Exit(1)
 			}
 		}
 	}
 
-	WriteFile(string(body), c.outDir+"/"+save_file)
+	WriteFile(string(body), c.outDir+"/"+saveFile)
 
 	// Read the body into a buffer
-	bd_reader := bytes.NewReader(body)
-	z := html.NewTokenizer(bd_reader)
+	bdReader := bytes.NewReader(body)
+	z := html.NewTokenizer(bdReader)
 	tt := z.Next()
 	// Is this an HTML file?
 	if tt != html.DoctypeToken {
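The renamed saveFile/saveDir logic still builds paths by splitting and joining on "/". A hedged alternative using path/filepath, shown only for comparison; savePath is a hypothetical helper, not in this commit:

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// savePath maps a fetched URL to a file path under outDir
func savePath(outDir, rootURL, url string) string {
	rel := strings.TrimPrefix(url, rootURL)
	if rel == "" {
		rel = "index.html" // the site root becomes index.html
	}
	return filepath.Join(outDir, rel) // Join also cleans doubled slashes
}

func main() {
	p := savePath("out", "http://example.com/", "http://example.com/css/site.css")
	fmt.Println(p)               // out/css/site.css
	fmt.Println(filepath.Dir(p)) // out/css, the directory to create
}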
@@ -223,10 +234,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			if t.Data == "link" || t.Data == "a" {
 				for _, a := range t.Attr {
 					if a.Key == "href" {
-						if c.CheckUrl(a.Val) {
-							urls = append(urls, c.FormatUrl(a.Val))
+						if c.CheckURL(a.Val) {
+							urls = append(urls, c.FormatURL(a.Val))
 						} else {
-							fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
+							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
 						}
 						break
 					}
@@ -234,10 +245,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			} else if t.Data == "img" || t.Data == "script" {
 				for _, a := range t.Attr {
 					if a.Key == "src" {
-						if c.CheckUrl(a.Val) {
-							urls = append(urls, c.FormatUrl(a.Val))
+						if c.CheckURL(a.Val) {
+							urls = append(urls, c.FormatURL(a.Val))
 						} else {
-							fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
+							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
 						}
 						break
 					}
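Both hunks only rename CheckUrl/FormatUrl at their call sites inside the token loop. For context, a self-contained sketch of the surrounding pattern: walking golang.org/x/net/html tokens and collecting href/src attributes. The sample markup is made up:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	page := `<html><body><a href="/about">about</a><img src="/logo.png"></body></html>`
	z := html.NewTokenizer(strings.NewReader(page))
	var urls []string
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break // io.EOF once the document is exhausted
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		t := z.Token()
		for _, a := range t.Attr {
			if a.Key == "href" || a.Key == "src" {
				urls = append(urls, a.Val)
				break
			}
		}
	}
	fmt.Println(urls) // [/about /logo.png]
}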
@@ -256,22 +267,20 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	return string(body), urls, nil
 }
 
-/* ParseUrl parses a URL and returns its Domain and TLD
- * (An error is returned if it can't...)
- */
-func (c *Crawler) ParseUrl(url string) (string, string, error) {
-	part_slice := strings.Split(url, "//")
-	server := part_slice[1]
-	part_slice = strings.Split(server, "/")
-	server = part_slice[0]
-	part_slice = strings.Split(server, ".")
+// ParseURL parses a URL and returns its Domain and TLD
+func (c *Crawler) ParseURL(url string) (string, string, error) {
+	partSlice := strings.Split(url, "//")
+	server := partSlice[1]
+	partSlice = strings.Split(server, "/")
+	server = partSlice[0]
+	partSlice = strings.Split(server, ".")
 	var tld, domain string
 
 	// We just want the last two parts of the slice
-	if len(part_slice) >= 2 {
-		part_slice = part_slice[len(part_slice)-2:]
-		domain = part_slice[0]
-		tld = part_slice[1]
+	if len(partSlice) >= 2 {
+		partSlice = partSlice[len(partSlice)-2:]
+		domain = partSlice[0]
+		tld = partSlice[1]
 	} else {
 		// There aren't two parts ot the URL?! That's not right...
 		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
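Renaming aside, ParseURL still indexes partSlice[1] unchecked, so an input with no "//" panics before the error path is reached. A hedged sketch of a more defensive version built on the standard net/url package; domainAndTLD is hypothetical, not this commit's code:

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// domainAndTLD returns the last two labels of the host, or an error
func domainAndTLD(raw string) (string, string, error) {
	u, err := url.Parse(raw)
	if err != nil || u.Host == "" {
		return "", "", fmt.Errorf("Invalid URL Given: %s", raw)
	}
	parts := strings.Split(u.Hostname(), ".")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("Invalid URL Given: %s", raw)
	}
	return parts[len(parts)-2], parts[len(parts)-1], nil
}

func main() {
	d, tld, err := domainAndTLD("http://blog.golang.org/")
	fmt.Println(d, tld, err) // golang org <nil>
}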
@@ -279,10 +288,9 @@ func (c *Crawler) ParseUrl(url string) (string, string, error) {
 	return domain, tld, nil
 }
 
-/* CheckUrl checks if we should follow the url or not
- * The main purpose is for cross-domain checks
- */
-func (c *Crawler) CheckUrl(url string) bool {
+// CheckURL checks if we should follow the url or not
+// The main purpose is for cross-domain checks
+func (c *Crawler) CheckURL(url string) bool {
 	// Ignore anchor urls
 	if strings.IndexRune(url, '#') >= 0 {
 		return false
@@ -294,27 +302,31 @@ func (c *Crawler) CheckUrl(url string) bool {
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
 			// Make sure that this url is in the same domain
-			tst_dmn, tst_tld, err := c.ParseUrl(url)
+			tstDmn, tstTld, err := c.ParseURL(url)
 			if err != nil {
 				// Error parsing the Domain/TLD out of the URL...
 				return false
 			}
-			return (tst_dmn == c.domain && tst_tld == c.tld)
+			return (tstDmn == c.domain && tstTld == c.tld)
 		}
 	}
 	return true
 }
 
-func (c *Crawler) FormatUrl(url string) string {
+// FormatURL takes a url and, unless it starts with "http"
+// appends it to the end of c.rootURL
+func (c *Crawler) FormatURL(url string) string {
 	// If the URL doesn't start with http, then it should be relative
 	if strings.Index(url, "http") != 0 {
-		url = c.rootUrl + url
+		url = c.rootURL + url
 	}
 	return url
 }
 
+// CreateDirIfNotExist Checks if directory 'dir' exists
+// If it doesn't, it creates it.
 func CreateDirIfNotExist(dir string) error {
-	// Check if out_dir exists, if not, try to make it
+	// Check if outDir exists, if not, try to make it
 	_, err := os.Stat(dir)
 	if os.IsNotExist(err) {
 		// Doesn't exist, try to create
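The newly documented CreateDirIfNotExist stats and then creates, which leaves a small stat/create race. os.MkdirAll does the same job in one call and also builds missing parents; a sketch of that alternative, not what this commit changes:

package main

import (
	"fmt"
	"os"
)

func createDirIfNotExist(dir string) error {
	return os.MkdirAll(dir, 0755) // no-op when dir already exists
}

func main() {
	if err := createDirIfNotExist("out/css/img"); err != nil {
		fmt.Printf("Error: %s\n", err)
		os.Exit(1)
	}
	fmt.Println("directory ready")
}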
@@ -326,8 +338,13 @@ func CreateDirIfNotExist(dir string) error {
 	return nil
 }
 
+// WriteFile writes the data 'd' to 'filename'
 func WriteFile(d string, filename string) error {
 	do := []byte(d)
 	//fmt.Printf("Writing %s\n", filename)
 	return ioutil.WriteFile(filename, do, 0664)
+}
+
+func printUsage() {
+	fmt.Println("Usage: ...")
 }