Smarter URL checking

It should handle subdomains appropriately now
Brian Buller 2015-08-20 09:16:02 -05:00
parent 336d7d7c7b
commit 197750425d


@@ -78,6 +78,12 @@ func main() {
 	c.xDomain = crossdomainFlag
 	c.depth = depthFlag
 	c.throttle = time.Duration(throttleFlag)
+	// Parse out the Domain and TLD
+	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
+	if err != nil {
+		fmt.Print(err)
+		os.Exit(1)
+	}
 	c.Crawl()
 }
@@ -94,6 +100,8 @@ type Crawler struct {
 	xDomain  bool
 	depth    int
 	throttle time.Duration
+	domain   string
+	tld      string
 }
 
 func (c *Crawler) Crawl() {
@@ -146,7 +154,7 @@ func (c *Crawler) Crawl() {
 			go c.getPage(link, next.depth, up)
 		}
 	}
-	fmt.Print(strings.Repeat("", len(status)))
+	//fmt.Print(strings.Repeat("", len(status)))
 	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
 	fmt.Printf("%s\n", status)
 }
@@ -154,7 +162,7 @@ func (c *Crawler) Crawl() {
 func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	_, urls, err := c.Fetch(url)
 	//body, urls, err := c.Fetch(url)
-	//fmt.Printf("Found: %s\n", url)
+	fmt.Printf("Found: %s\n", url)
 	if err != nil {
 		fmt.Println(err)
 	}
@@ -217,6 +225,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			if a.Key == "href" {
 				if c.CheckUrl(a.Val) {
 					urls = append(urls, c.FormatUrl(a.Val))
+				} else {
+					fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 				}
 				break
 			}
@@ -226,6 +236,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			if a.Key == "src" {
 				if c.CheckUrl(a.Val) {
 					urls = append(urls, c.FormatUrl(a.Val))
+				} else {
+					fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 				}
 				break
 			}
@@ -244,6 +256,29 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	return string(body), urls, nil
 }
 
+/* ParseUrl parses a URL and returns its Domain and TLD
+ * (An error is returned if it can't...)
+ */
+func (c *Crawler) ParseUrl(url string) (string, string, error) {
+	part_slice := strings.Split(url, "//")
+	server := part_slice[1]
+	part_slice = strings.Split(server, "/")
+	server = part_slice[0]
+	part_slice = strings.Split(server, ".")
+	var tld, domain string
+	// We just want the last two parts of the slice
+	if len(part_slice) >= 2 {
+		part_slice = part_slice[len(part_slice)-2:]
+		domain = part_slice[0]
+		tld = part_slice[1]
+	} else {
+		// There aren't two parts of the URL?! That's not right...
+		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
+	}
+	return domain, tld, nil
+}
+
 /* CheckUrl checks if we should follow the url or not
  * The main purpose is for cross-domain checks
  */
@@ -258,7 +293,13 @@ func (c *Crawler) CheckUrl(url string) bool {
 	}
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
-			return strings.HasPrefix(url, c.rootUrl)
+			// Make sure that this url is in the same domain
+			tst_dmn, tst_tld, err := c.ParseUrl(url)
+			if err != nil {
+				// Error parsing the Domain/TLD out of the URL...
+				return false
+			}
+			return (tst_dmn == c.domain && tst_tld == c.tld)
 		}
 	}
 	return true
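
For context, here is a minimal standalone sketch (not part of the commit) of what the new same-domain test accepts. The parseDomainTLD helper and the sample URLs are hypothetical; the splitting logic mirrors the ParseUrl method added above, with an extra length check for URLs that lack a "//".

package main

import (
	"fmt"
	"strings"
)

// parseDomainTLD mirrors the commit's ParseUrl logic: take the host portion
// of the URL, split it on ".", and keep only the last two labels as
// (domain, tld). An extra guard avoids panicking on URLs without a "//".
func parseDomainTLD(url string) (string, string, error) {
	parts := strings.Split(url, "//")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("invalid URL given: %s", url)
	}
	host := strings.Split(parts[1], "/")[0]
	labels := strings.Split(host, ".")
	if len(labels) < 2 {
		return "", "", fmt.Errorf("invalid URL given: %s", url)
	}
	labels = labels[len(labels)-2:]
	return labels[0], labels[1], nil
}

func main() {
	// Root URL the crawler was started with (hypothetical).
	rootDomain, rootTLD, _ := parseDomainTLD("http://www.example.com")

	for _, u := range []string{
		"http://example.com/page",      // bare domain, no subdomain
		"http://blog.example.com/post", // different subdomain
		"http://example.org/",          // different TLD
		"https://other.com/",           // different domain
	} {
		d, t, err := parseDomainTLD(u)
		follow := err == nil && d == rootDomain && t == rootTLD
		fmt.Printf("%-30s follow: %v\n", u, follow)
	}
}

The old check, strings.HasPrefix(url, c.rootUrl), would reject http://example.com/page and http://blog.example.com/post when the root URL is http://www.example.com; comparing only the last two host labels accepts both while still rejecting other domains and TLDs. One caveat of the last-two-labels approach is that multi-part suffixes such as .co.uk share a single domain/TLD pair, so unrelated *.co.uk sites would be treated as in-domain.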