Smarter URL checking
Links on subdomains of the crawl root (e.g. blog.example.com when crawling example.com) are now followed. The old check rejected any URL that did not share the root URL's exact prefix.
commit 197750425d
parent 336d7d7c7b
@@ -78,6 +78,12 @@ func main() {
 	c.xDomain = crossdomainFlag
 	c.depth = depthFlag
 	c.throttle = time.Duration(throttleFlag)
+	// Parse out the Domain and TLD
+	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
+	if err != nil {
+		fmt.Print(err)
+		os.Exit(1)
+	}
 
 	c.Crawl()
 }
@@ -94,6 +100,8 @@ type Crawler struct {
 	xDomain  bool
 	depth    int
 	throttle time.Duration
+	domain   string
+	tld      string
 }
 
 func (c *Crawler) Crawl() {
@@ -146,7 +154,7 @@ func (c *Crawler) Crawl() {
 			go c.getPage(link, next.depth, up)
 		}
 	}
-	fmt.Print(strings.Repeat("", len(status)))
+	//fmt.Print(strings.Repeat("", len(status)))
 	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
 	fmt.Printf("%s\n", status)
 }
@@ -154,7 +162,7 @@ func (c *Crawler) Crawl() {
 func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	_, urls, err := c.Fetch(url)
 	//body, urls, err := c.Fetch(url)
-	//fmt.Printf("Found: %s\n", url)
+	fmt.Printf("Found: %s\n", url)
 	if err != nil {
 		fmt.Println(err)
 	}
@@ -217,6 +225,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 				if a.Key == "href" {
 					if c.CheckUrl(a.Val) {
 						urls = append(urls, c.FormatUrl(a.Val))
+					} else {
+						fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 					}
 					break
 				}
@@ -226,6 +236,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 				if a.Key == "src" {
 					if c.CheckUrl(a.Val) {
 						urls = append(urls, c.FormatUrl(a.Val))
+					} else {
+						fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 					}
 					break
 				}
@@ -244,6 +256,29 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	return string(body), urls, nil
 }
 
+/* ParseUrl parses a URL and returns its Domain and TLD
+ * (An error is returned if it can't...)
+ */
+func (c *Crawler) ParseUrl(url string) (string, string, error) {
+	part_slice := strings.Split(url, "//")
+	server := part_slice[1]
+	part_slice = strings.Split(server, "/")
+	server = part_slice[0]
+	part_slice = strings.Split(server, ".")
+	var tld, domain string
+
+	// We just want the last two parts of the slice
+	if len(part_slice) >= 2 {
+		part_slice = part_slice[len(part_slice)-2:]
+		domain = part_slice[0]
+		tld = part_slice[1]
+	} else {
+		// There aren't two parts to the URL?! That's not right...
+		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
+	}
+	return domain, tld, nil
+}
+
 /* CheckUrl checks if we should follow the url or not
  * The main purpose is for cross-domain checks
  */
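For reference, here is a rough standalone sketch of what ParseUrl returns for a few inputs. The helper name parseUrl and the sample URLs are made up for this sketch, and the length guards are editorial additions: the method above indexes part_slice[1] unconditionally, so a root URL with no "//" would panic rather than return the friendly error.

package main

import (
	"fmt"
	"strings"
)

// parseUrl mirrors the ParseUrl method's logic as a free function so it
// can run standalone. The guards are editorial; the method itself has
// none for the "//" split.
func parseUrl(url string) (string, string, error) {
	parts := strings.Split(url, "//")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("invalid URL given: %s", url)
	}
	server := strings.Split(parts[1], "/")[0]
	labels := strings.Split(server, ".")
	if len(labels) < 2 {
		return "", "", fmt.Errorf("invalid URL given: %s", url)
	}
	labels = labels[len(labels)-2:]
	return labels[0], labels[1], nil
}

func main() {
	for _, u := range []string{
		"http://example.com/index.html",  // -> example com
		"http://blog.example.com/post/1", // subdomain -> example com
		"https://www.example.co.uk/",     // multi-part TLD -> co uk (!)
	} {
		domain, tld, err := parseUrl(u)
		fmt.Println(u, "->", domain, tld, err)
	}
}

Note the last case: because only the last two host labels are kept, every site under a multi-part TLD such as .co.uk parses to the same ("co", "uk") pair, so the CheckUrl comparison below would treat them all as one domain. main() now calls ParseUrl once at startup and caches the result in c.domain and c.tld.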
@@ -258,7 +293,13 @@ func (c *Crawler) CheckUrl(url string) bool {
 	}
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
-			return strings.HasPrefix(url, c.rootUrl)
+			// Make sure that this url is in the same domain
+			tst_dmn, tst_tld, err := c.ParseUrl(url)
+			if err != nil {
+				// Error parsing the Domain/TLD out of the URL...
+				return false
+			}
+			return (tst_dmn == c.domain && tst_tld == c.tld)
 		}
 	}
 	return true
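To make the subdomain claim in the commit message concrete, here is an illustrative comparison of the old and new checks. rootUrl, link, and domainAndTld are hypothetical stand-ins for the crawler's fields and its ParseUrl method:

package main

import (
	"fmt"
	"strings"
)

// domainAndTld keeps the last two host labels, the same idea as ParseUrl
// above (hypothetical helper, for this sketch only; no error handling).
func domainAndTld(url string) (string, string) {
	host := strings.Split(strings.Split(url, "//")[1], "/")[0]
	labels := strings.Split(host, ".")
	labels = labels[len(labels)-2:]
	return labels[0], labels[1]
}

func main() {
	rootUrl := "http://example.com" // hypothetical crawl root
	link := "http://blog.example.com/post/1"

	// Old check: exact prefix match, which rejects the subdomain.
	fmt.Println(strings.HasPrefix(link, rootUrl)) // false

	// New check: compare domain and TLD, which accepts it.
	d, t := domainAndTld(link)
	rd, rt := domainAndTld(rootUrl)
	fmt.Println(d == rd && t == rt) // true
}

A side effect worth noting: the prefix check was also scheme-sensitive (an https link under an http root failed it), while the domain/TLD comparison ignores the scheme entirely.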