Smarter URL checking
Links on subdomains of the crawl root (e.g. blog.example.com when crawling example.com) are now followed. The old check rejected any URL that did not share the root URL's exact prefix.
commit 197750425d
parent 336d7d7c7b
@@ -78,6 +78,12 @@ func main() {
 	c.xDomain = crossdomainFlag
 	c.depth = depthFlag
 	c.throttle = time.Duration(throttleFlag)
+	// Parse out the Domain and TLD
+	c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
+	if err != nil {
+		fmt.Print(err)
+		os.Exit(1)
+	}
 
 	c.Crawl()
 }
@@ -94,6 +100,8 @@ type Crawler struct {
 	xDomain  bool
 	depth    int
 	throttle time.Duration
+	domain   string
+	tld      string
 }
 
 func (c *Crawler) Crawl() {
@@ -146,7 +154,7 @@ func (c *Crawler) Crawl() {
 			go c.getPage(link, next.depth, up)
 		}
 	}
-	fmt.Print(strings.Repeat("", len(status)))
+	//fmt.Print(strings.Repeat("", len(status)))
 	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
 	fmt.Printf("%s\n", status)
 }
@@ -154,7 +162,7 @@ func (c *Crawler) Crawl() {
 func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	_, urls, err := c.Fetch(url)
 	//body, urls, err := c.Fetch(url)
-	//fmt.Printf("Found: %s\n", url)
+	fmt.Printf("Found: %s\n", url)
 	if err != nil {
 		fmt.Println(err)
 	}
@@ -217,6 +225,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 				if a.Key == "href" {
 					if c.CheckUrl(a.Val) {
 						urls = append(urls, c.FormatUrl(a.Val))
+					} else {
+						fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 					}
 					break
 				}
@@ -226,6 +236,8 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 				if a.Key == "src" {
 					if c.CheckUrl(a.Val) {
 						urls = append(urls, c.FormatUrl(a.Val))
+					} else {
+						fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
 					}
 					break
 				}
@@ -244,6 +256,29 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	return string(body), urls, nil
 }
 
+/* ParseUrl parses a URL and returns its Domain and TLD
+ * (An error is returned if it can't...)
+ */
+func (c *Crawler) ParseUrl(url string) (string, string, error) {
+	part_slice := strings.Split(url, "//")
+	server := part_slice[1]
+	part_slice = strings.Split(server, "/")
+	server = part_slice[0]
+	part_slice = strings.Split(server, ".")
+	var tld, domain string
+
+	// We just want the last two parts of the slice
+	if len(part_slice) >= 2 {
+		part_slice = part_slice[len(part_slice)-2:]
+		domain = part_slice[0]
+		tld = part_slice[1]
+	} else {
+		// There aren't two parts to the URL?! That's not right...
+		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
+	}
+	return domain, tld, nil
+}
+
 /* CheckUrl checks if we should follow the url or not
  * The main purpose is for cross-domain checks
  */
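For reference, here is a rough standalone sketch of what ParseUrl returns for a few inputs. The helper name parseUrl and the sample URLs are made up for this sketch, and the length guards are editorial additions: the method above indexes part_slice[1] unconditionally, so a root URL with no "//" would panic rather than return the friendly error.

package main

import (
	"fmt"
	"strings"
)

// parseUrl mirrors the ParseUrl method's logic as a free function so it
// can run standalone. The guards are editorial; the method itself has
// none for the "//" split.
func parseUrl(url string) (string, string, error) {
	parts := strings.Split(url, "//")
	if len(parts) < 2 {
		return "", "", fmt.Errorf("invalid URL given: %s", url)
	}
	server := strings.Split(parts[1], "/")[0]
	labels := strings.Split(server, ".")
	if len(labels) < 2 {
		return "", "", fmt.Errorf("invalid URL given: %s", url)
	}
	labels = labels[len(labels)-2:]
	return labels[0], labels[1], nil
}

func main() {
	for _, u := range []string{
		"http://example.com/index.html",  // -> example com
		"http://blog.example.com/post/1", // subdomain -> example com
		"https://www.example.co.uk/",     // multi-part TLD -> co uk (!)
	} {
		domain, tld, err := parseUrl(u)
		fmt.Println(u, "->", domain, tld, err)
	}
}

Note the last case: because only the last two host labels are kept, every site under a multi-part TLD such as .co.uk parses to the same ("co", "uk") pair, so the CheckUrl comparison below would treat them all as one domain. main() now calls ParseUrl once at startup and caches the result in c.domain and c.tld.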
@@ -258,7 +293,13 @@ func (c *Crawler) CheckUrl(url string) bool {
 	}
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
-			return strings.HasPrefix(url, c.rootUrl)
+			// Make sure that this url is in the same domain
+			tst_dmn, tst_tld, err := c.ParseUrl(url)
+			if err != nil {
+				// Error parsing the Domain/TLD out of the URL...
+				return false
+			}
+			return (tst_dmn == c.domain && tst_tld == c.tld)
 		}
 	}
 	return true
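To make the subdomain claim in the commit message concrete, here is an illustrative comparison of the old and new checks. rootUrl, link, and domainAndTld are hypothetical stand-ins for the crawler's fields and its ParseUrl method:

package main

import (
	"fmt"
	"strings"
)

// domainAndTld keeps the last two host labels, the same idea as ParseUrl
// above (hypothetical helper, for this sketch only; no error handling).
func domainAndTld(url string) (string, string) {
	host := strings.Split(strings.Split(url, "//")[1], "/")[0]
	labels := strings.Split(host, ".")
	labels = labels[len(labels)-2:]
	return labels[0], labels[1]
}

func main() {
	rootUrl := "http://example.com" // hypothetical crawl root
	link := "http://blog.example.com/post/1"

	// Old check: exact prefix match, which rejects the subdomain.
	fmt.Println(strings.HasPrefix(link, rootUrl)) // false

	// New check: compare domain and TLD, which accepts it.
	d, t := domainAndTld(link)
	rd, rt := domainAndTld(rootUrl)
	fmt.Println(d == rd && t == rt) // true
}

A side effect worth noting: the prefix check was also scheme-sensitive (an https link under an http root failed it), while the domain/TLD comparison ignores the scheme entirely.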