From 197750425d3a1ad493542a7ef24bfc1663d6a482 Mon Sep 17 00:00:00 2001
From: Brian Buller
Date: Thu, 20 Aug 2015 09:16:02 -0500
Subject: [PATCH] Smarter URL checking

It should handle subdomains appropriately now
---
 fullscrape.go | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/fullscrape.go b/fullscrape.go
index 224b6db..a1c68fe 100644
--- a/fullscrape.go
+++ b/fullscrape.go
@@ -78,6 +78,12 @@ func main() {
 	c.xDomain = crossdomainFlag
 	c.depth = depthFlag
 	c.throttle = time.Duration(throttleFlag)
+	// Parse out the Domain and TLD
+	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
+	if err != nil {
+		fmt.Print(err)
+		os.Exit(1)
+	}
 	c.Crawl()
 }
 
@@ -94,6 +100,8 @@ type Crawler struct {
 	xDomain  bool
 	depth    int
 	throttle time.Duration
+	domain   string
+	tld      string
 }
 
 func (c *Crawler) Crawl() {
@@ -146,7 +154,7 @@ func (c *Crawler) Crawl() {
 			go c.getPage(link, next.depth, up)
 		}
 	}
-	fmt.Print(strings.Repeat("", len(status)))
+	//fmt.Print(strings.Repeat("", len(status)))
 	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
 	fmt.Printf("%s\n", status)
 }
@@ -154,7 +162,7 @@ func (c *Crawler) Crawl() {
 func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	_, urls, err := c.Fetch(url)
 	//body, urls, err := c.Fetch(url)
-	//fmt.Printf("Found: %s\n", url)
+	fmt.Printf("Found: %s\n", url)
 	if err != nil {
 		fmt.Println(err)
 	}
@@ -217,6 +225,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			if a.Key == "href" {
 				if c.CheckUrl(a.Val) {
 					urls = append(urls, c.FormatUrl(a.Val))
+				} else {
+					fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 				}
 				break
 			}
@@ -226,6 +236,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			if a.Key == "src" {
 				if c.CheckUrl(a.Val) {
 					urls = append(urls, c.FormatUrl(a.Val))
+				} else {
+					fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 				}
 				break
 			}
@@ -244,6 +256,29 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	return string(body), urls, nil
 }
 
+/* ParseUrl parses a URL and returns its Domain and TLD
+ * (An error is returned if it can't...)
+ */
+func (c *Crawler) ParseUrl(url string) (string, string, error) {
+	part_slice := strings.Split(url, "//")
+	server := part_slice[1]
+	part_slice = strings.Split(server, "/")
+	server = part_slice[0]
+	part_slice = strings.Split(server, ".")
+	var tld, domain string
+
+	// We just want the last two parts of the slice
+	if len(part_slice) >= 2 {
+		part_slice = part_slice[len(part_slice)-2:]
+		domain = part_slice[0]
+		tld = part_slice[1]
+	} else {
+		// There aren't two parts to the URL?! That's not right...
+		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
+	}
+	return domain, tld, nil
+}
+
 /* CheckUrl checks if we should follow the url or not
  * The main purpose is for cross-domain checks
  */
@@ -258,7 +293,13 @@ func (c *Crawler) CheckUrl(url string) bool {
 	}
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
-			return strings.HasPrefix(url, c.rootUrl)
+			// Make sure that this url is in the same domain
+			tst_dmn, tst_tld, err := c.ParseUrl(url)
+			if err != nil {
+				// Error parsing the Domain/TLD out of the URL...
+				return false
+			}
+			return (tst_dmn == c.domain && tst_tld == c.tld)
 		}
 	}
 	return true
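
One note on the new ParseUrl helper: as written, it indexes part_slice[1] immediately after splitting on "//", so a URL with no scheme separator would panic before the length check is ever reached. Below is a minimal sketch of the same last-two-labels split built on net/url, which surfaces a bad URL as an error instead; the parseDomainTLD name, the port handling, and the error text are all illustrative, not part of this patch.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// parseDomainTLD does the same job as ParseUrl above, but leans on net/url
// so a malformed URL comes back as an error rather than a slice-index panic.
func parseDomainTLD(raw string) (string, string, error) {
	u, err := url.Parse(raw)
	if err != nil {
		return "", "", err
	}
	host := u.Host
	if i := strings.IndexByte(host, ':'); i >= 0 {
		host = host[:i] // drop any :port suffix
	}
	parts := strings.Split(host, ".")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("invalid URL given: %s", raw)
	}
	// Keep only the last two labels: "blog.example.com" -> ("example", "com")
	return parts[len(parts)-2], parts[len(parts)-1], nil
}

Either version treats multi-label suffixes naively: "shop.example.co.uk" splits into domain "co" and tld "uk". If that ever matters, golang.org/x/net/publicsuffix (EffectiveTLDPlusOne) carries the real suffix list.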
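And a quick spot-check of the behavior change in CheckUrl: the old strings.HasPrefix(url, c.rootUrl) test rejected sibling subdomains of the root, while comparing the parsed domain/TLD accepts them. sameSite is a hypothetical helper continuing the sketch above (same file, reusing parseDomainTLD):

// sameSite mirrors the new CheckUrl rule: two URLs match when their last
// two host labels agree.
func sameSite(a, b string) bool {
	aDmn, aTld, errA := parseDomainTLD(a)
	bDmn, bTld, errB := parseDomainTLD(b)
	return errA == nil && errB == nil && aDmn == bDmn && aTld == bTld
}

func main() {
	root := "http://www.example.com"
	fmt.Println(sameSite(root, "http://blog.example.com/post")) // true  (the old prefix check said false)
	fmt.Println(sameSite(root, "http://www.example.com/about")) // true  (both checks agree)
	fmt.Println(sameSite(root, "http://example.org/"))          // false (different domain)
}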