Smarter URL checking
It should handle subdomains appropriately now
This commit is contained in:
parent
336d7d7c7b
commit
197750425d
@ -78,6 +78,12 @@ func main() {
|
||||
c.xDomain = crossdomainFlag
|
||||
c.depth = depthFlag
|
||||
c.throttle = time.Duration(throttleFlag)
|
||||
// Parse out the Domain and TLD
|
||||
c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
|
||||
if err != nil {
|
||||
fmt.Print(err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
c.Crawl()
|
||||
}
|
||||
@ -94,6 +100,8 @@ type Crawler struct {
|
||||
xDomain bool
|
||||
depth int
|
||||
throttle time.Duration
|
||||
domain string
|
||||
tld string
|
||||
}
|
||||
|
||||
func (c *Crawler) Crawl() {
|
||||
@ -146,7 +154,7 @@ func (c *Crawler) Crawl() {
|
||||
go c.getPage(link, next.depth, up)
|
||||
}
|
||||
}
|
||||
fmt.Print(strings.Repeat("", len(status)))
|
||||
//fmt.Print(strings.Repeat("", len(status)))
|
||||
status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
|
||||
fmt.Printf("%s\n", status)
|
||||
}
|
||||
@ -154,7 +162,7 @@ func (c *Crawler) Crawl() {
|
||||
func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
|
||||
_, urls, err := c.Fetch(url)
|
||||
//body, urls, err := c.Fetch(url)
|
||||
//fmt.Printf("Found: %s\n", url)
|
||||
fmt.Printf("Found: %s\n", url)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
@ -217,6 +225,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
if a.Key == "href" {
|
||||
if c.CheckUrl(a.Val) {
|
||||
urls = append(urls, c.FormatUrl(a.Val))
|
||||
} else {
|
||||
fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
|
||||
}
|
||||
break
|
||||
}
|
||||
@ -226,6 +236,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
if a.Key == "src" {
|
||||
if c.CheckUrl(a.Val) {
|
||||
urls = append(urls, c.FormatUrl(a.Val))
|
||||
} else {
|
||||
fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
|
||||
}
|
||||
break
|
||||
}
|
||||
@ -244,6 +256,29 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
return string(body), urls, nil
|
||||
}
|
||||
|
||||
/* ParseUrl parses a URL and returns its Domain and TLD
|
||||
* (An error is returned if it can't...)
|
||||
*/
|
||||
func (c *Crawler) ParseUrl(url string) (string, string, error) {
|
||||
part_slice := strings.Split(url, "//")
|
||||
server := part_slice[1]
|
||||
part_slice = strings.Split(server, "/")
|
||||
server = part_slice[0]
|
||||
part_slice = strings.Split(server, ".")
|
||||
var tld, domain string
|
||||
|
||||
// We just want the last two parts of the slice
|
||||
if len(part_slice) >= 2 {
|
||||
part_slice = part_slice[len(part_slice)-2:]
|
||||
domain = part_slice[0]
|
||||
tld = part_slice[1]
|
||||
} else {
|
||||
// There aren't two parts ot the URL?! That's not right...
|
||||
return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
|
||||
}
|
||||
return domain, tld, nil
|
||||
}
|
||||
|
||||
/* CheckUrl checks if we should follow the url or not
|
||||
* The main purpose is for cross-domain checks
|
||||
*/
|
||||
@ -258,7 +293,13 @@ func (c *Crawler) CheckUrl(url string) bool {
|
||||
}
|
||||
if !c.xDomain {
|
||||
if strings.HasPrefix(url, "http") {
|
||||
return strings.HasPrefix(url, c.rootUrl)
|
||||
// Make sure that this url is in the same domain
|
||||
tst_dmn, tst_tld, err := c.ParseUrl(url)
|
||||
if err != nil {
|
||||
// Error parsing the Domain/TLD out of the URL...
|
||||
return false
|
||||
}
|
||||
return (tst_dmn == c.domain && tst_tld == c.tld)
|
||||
}
|
||||
}
|
||||
return true
|
||||
|
Loading…
Reference in New Issue
Block a user