From fe664510567f9abb9892b3f71de56e7bbd05cc06 Mon Sep 17 00:00:00 2001
From: Brian Buller
Date: Thu, 21 Jan 2016 12:14:09 -0600
Subject: [PATCH] gofmt and some error checking

---
 .gitignore    |   0
 LICENSE       |   0
 fullscrape.go | 137 ++++++++++++++++++++++++++++----------------------
 3 files changed, 77 insertions(+), 60 deletions(-)
 mode change 100755 => 100644 .gitignore
 mode change 100755 => 100644 LICENSE

diff --git a/.gitignore b/.gitignore
old mode 100755
new mode 100644
diff --git a/LICENSE b/LICENSE
old mode 100755
new mode 100644
diff --git a/fullscrape.go b/fullscrape.go
index a1c68fe..287c24e 100644
--- a/fullscrape.go
+++ b/fullscrape.go
@@ -3,20 +3,28 @@ package main
 import (
 	"bytes"
 	"fmt"
-	"golang.org/x/net/html"
 	"io/ioutil"
 	"net/http"
 	"os"
 	"strconv"
 	"strings"
 	"time"
+
+	"golang.org/x/net/html"
 )
 
-const PROGRAM_NAME = "fullscrape"
+const programName = "fullscrape"
 
 func main() {
-	req_url := os.Args[1] //"http://golang.org/"
-	out_dir := os.Args[2]
+	if len(os.Args) <= 1 {
+		printUsage()
+		os.Exit(1)
+	}
+	reqURL := os.Args[1] //"http://golang.org/"
+	outDir := ""
+	if len(os.Args) > 2 {
+		outDir = os.Args[2]
+	}
 	depthFlag := -1
 	norewriteFlag := false
 	crossdomainFlag := false
@@ -58,28 +66,28 @@ func main() {
 		}
 	}
 
-	if err := CreateDirIfNotExist(out_dir); err != nil {
-		fmt.Print("Unable to create initial directory %s\n", out_dir)
+	if err := CreateDirIfNotExist(outDir); err != nil {
+		fmt.Print("Unable to create initial directory %s\n", outDir)
 		fmt.Print("Error: %s\n", err)
 		os.Exit(1)
 	}
 
 	c := new(Crawler)
 	// Make sure we have the protocol
-	if strings.Index(req_url, "http") != 0 {
-		req_url = "http://" + req_url
+	if strings.Index(reqURL, "http") != 0 {
+		reqURL = "http://" + reqURL
 	}
-	if !strings.HasSuffix(req_url, "/") {
-		req_url = req_url + "/"
+	if !strings.HasSuffix(reqURL, "/") {
+		reqURL = reqURL + "/"
 	}
-	c.rootUrl = req_url
-	c.outDir = out_dir
+	c.rootURL = reqURL
+	c.outDir = outDir
 	c.fixUrls = norewriteFlag
 	c.xDomain = crossdomainFlag
 	c.depth = depthFlag
 	c.throttle = time.Duration(throttleFlag)
 	// Parse out the Domain and TLD
-	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
+	c.domain, c.tld, err = c.ParseURL(c.rootURL)
 	if err != nil {
 		fmt.Print(err)
 		os.Exit(1)
@@ -93,8 +101,9 @@ type unprocessed struct {
 	url   []string
 }
 
+// Crawler crawls
 type Crawler struct {
-	rootUrl string
+	rootURL string
 	outDir  string
 	fixUrls bool
 	xDomain bool
@@ -104,18 +113,19 @@ type Crawler struct {
 	tld      string
 }
 
+// Crawl tells the crawler to start crawlin'
 func (c *Crawler) Crawl() {
 	if c.depth >= 0 {
-		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle)
+		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
 	} else {
-		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle)
+		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
 	}
 
 	// Setup channel for inputs to be processed
 	up := make(chan unprocessed, 0)
 
 	// Kick off processing and count how many pages are left to process
-	go c.getPage(c.rootUrl, c.depth, up)
+	go c.getPage(c.rootURL, c.depth, up)
 	outstanding := 1
 
 	visited := make(map[string]bool)
@@ -172,8 +182,9 @@ func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	r <- unprocessed{depth - 1, urls}
 }
 
+// Fetch initiates a page get
 func (c *Crawler) Fetch(url string) (string, []string, error) {
-	urls := make([]string, 0)
+	var urls []string
 	// Ok, go get URL
 	response, err := http.Get(url)
 	if err != nil || response.StatusCode != 200 {
@@ -183,31 +194,31 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	response.Body.Close()
 
 	// Save the body to the appropriate directory
-	save_file := strings.TrimPrefix(url, c.rootUrl)
-	if save_file == "" {
-		save_file = "index.html"
+	saveFile := strings.TrimPrefix(url, c.rootURL)
+	if saveFile == "" {
+		saveFile = "index.html"
 	} else {
-		save_arr := strings.Split(save_file, "/")
-		save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
-		if strings.Index(save_dir, "/") != 0 {
-			save_dir = "/" + save_dir
+		saveArr := strings.Split(saveFile, "/")
+		saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
+		if strings.Index(saveDir, "/") != 0 {
+			saveDir = "/" + saveDir
 		}
-		save_dir = c.outDir + save_dir
+		saveDir = c.outDir + saveDir
 
-		if len(save_arr) > 0 {
-			if err = CreateDirIfNotExist(save_dir); err != nil {
-				fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
+		if len(saveArr) > 0 {
+			if err = CreateDirIfNotExist(saveDir); err != nil {
+				fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
 				fmt.Printf("Error: %s\n", err)
 				os.Exit(1)
 			}
 		}
 	}
 
-	WriteFile(string(body), c.outDir+"/"+save_file)
+	WriteFile(string(body), c.outDir+"/"+saveFile)
 
 	// Read the body into a buffer
-	bd_reader := bytes.NewReader(body)
-	z := html.NewTokenizer(bd_reader)
+	bdReader := bytes.NewReader(body)
+	z := html.NewTokenizer(bdReader)
 	tt := z.Next()
 	// Is this an HTML file?
 	if tt != html.DoctypeToken {
@@ -223,10 +234,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			if t.Data == "link" || t.Data == "a" {
 				for _, a := range t.Attr {
 					if a.Key == "href" {
-						if c.CheckUrl(a.Val) {
-							urls = append(urls, c.FormatUrl(a.Val))
+						if c.CheckURL(a.Val) {
+							urls = append(urls, c.FormatURL(a.Val))
 						} else {
-							fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
+							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
 						}
 						break
 					}
@@ -234,10 +245,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			} else if t.Data == "img" || t.Data == "script" {
 				for _, a := range t.Attr {
 					if a.Key == "src" {
-						if c.CheckUrl(a.Val) {
-							urls = append(urls, c.FormatUrl(a.Val))
+						if c.CheckURL(a.Val) {
+							urls = append(urls, c.FormatURL(a.Val))
 						} else {
-							fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
+							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
 						}
 						break
 					}
@@ -256,22 +267,20 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	return string(body), urls, nil
 }
 
-/* ParseUrl parses a URL and returns its Domain and TLD
- * (An error is returned if it can't...)
- */
-func (c *Crawler) ParseUrl(url string) (string, string, error) {
-	part_slice := strings.Split(url, "//")
-	server := part_slice[1]
-	part_slice = strings.Split(server, "/")
-	server = part_slice[0]
-	part_slice = strings.Split(server, ".")
+// ParseURL parses a URL and returns its Domain and TLD
+func (c *Crawler) ParseURL(url string) (string, string, error) {
+	partSlice := strings.Split(url, "//")
+	server := partSlice[1]
+	partSlice = strings.Split(server, "/")
+	server = partSlice[0]
+	partSlice = strings.Split(server, ".")
 
 	var tld, domain string
 	// We just want the last two parts of the slice
-	if len(part_slice) >= 2 {
-		part_slice = part_slice[len(part_slice)-2:]
-		domain = part_slice[0]
-		tld = part_slice[1]
+	if len(partSlice) >= 2 {
+		partSlice = partSlice[len(partSlice)-2:]
+		domain = partSlice[0]
+		tld = partSlice[1]
 	} else {
 		// There aren't two parts ot the URL?! That's not right...
 		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
@@ -279,10 +288,9 @@ func (c *Crawler) ParseUrl(url string) (string, string, error) {
 	return domain, tld, nil
 }
 
-/* CheckUrl checks if we should follow the url or not
- * The main purpose is for cross-domain checks
- */
-func (c *Crawler) CheckUrl(url string) bool {
+// CheckURL checks if we should follow the url or not
+// The main purpose is for cross-domain checks
+func (c *Crawler) CheckURL(url string) bool {
 	// Ignore anchor urls
 	if strings.IndexRune(url, '#') >= 0 {
 		return false
@@ -294,27 +302,31 @@ func (c *Crawler) CheckUrl(url string) bool {
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
 			// Make sure that this url is in the same domain
-			tst_dmn, tst_tld, err := c.ParseUrl(url)
+			tstDmn, tstTld, err := c.ParseURL(url)
 			if err != nil {
 				// Error parsing the Domain/TLD out of the URL...
 				return false
 			}
-			return (tst_dmn == c.domain && tst_tld == c.tld)
+			return (tstDmn == c.domain && tstTld == c.tld)
 		}
 	}
 	return true
 }
 
-func (c *Crawler) FormatUrl(url string) string {
+// FormatURL takes a url and, unless it starts with "http"
+// appends it to the end of c.rootURL
+func (c *Crawler) FormatURL(url string) string {
 	// If the URL doesn't start with http, then it should be relative
 	if strings.Index(url, "http") != 0 {
-		url = c.rootUrl + url
+		url = c.rootURL + url
 	}
 	return url
 }
 
+// CreateDirIfNotExist Checks if directory 'dir' exists
+// If it doesn't, it creates it.
 func CreateDirIfNotExist(dir string) error {
-	// Check if out_dir exists, if not, try to make it
+	// Check if outDir exists, if not, try to make it
 	_, err := os.Stat(dir)
 	if os.IsNotExist(err) {
 		// Doesn't exist, try to create
@@ -326,8 +338,13 @@ func CreateDirIfNotExist(dir string) error {
 	return nil
 }
 
+// WriteFile writes the data 'd' to 'filename'
 func WriteFile(d string, filename string) error {
 	do := []byte(d)
 	//fmt.Printf("Writing %s\n", filename)
 	return ioutil.WriteFile(filename, do, 0664)
+}
+
+func printUsage() {
+	fmt.Println("Usage: ...")
 }
\ No newline at end of file
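
Note: the printUsage added at the end of this patch is only a stub that prints "Usage: ...". Below is a minimal sketch of a fuller message, assuming only the two positional arguments visible in these hunks (the request URL in os.Args[1] and an optional output directory in os.Args[2]); the binary name and wording are illustrative, and the flag spellings are not shown in this diff, so they are omitted. This sketch is not part of the patch.

    package main

    import "fmt"

    // printUsage: hypothetical expansion of the stub added above; documents
    // only the arguments that appear in this diff, nothing more.
    func printUsage() {
    	fmt.Println("Usage: fullscrape <url> [out-dir]")
    	fmt.Println("  <url>      starting page; \"http://\" is prepended when missing")
    	fmt.Println("  [out-dir]  directory to write fetched files into")
    }

    func main() {
    	printUsage()
    }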