Pulling some css links
This commit is contained in:
parent
fe66451056
commit
b863bd5bde
@ -6,6 +6,7 @@ import (
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
@ -130,14 +131,13 @@ func (c *Crawler) Crawl() {
|
||||
|
||||
visited := make(map[string]bool)
|
||||
|
||||
status := fmt.Sprintf("Files %d/%d", len(visited), outstanding+len(visited))
|
||||
status := fmt.Sprintf("Files %d/%d\n", len(visited), outstanding+len(visited))
|
||||
for outstanding > 0 {
|
||||
done := len(visited) - outstanding
|
||||
if done < 0 {
|
||||
done = 0
|
||||
}
|
||||
fmt.Print(strings.Repeat("", len(status)))
|
||||
status = fmt.Sprintf("Files %d/%d", done, len(visited))
|
||||
status = fmt.Sprintf("Files %d/%d\n", done, len(visited))
|
||||
fmt.Print(status)
|
||||
if c.throttle > 0 {
|
||||
time.Sleep(time.Millisecond * c.throttle)
|
||||
@ -164,8 +164,7 @@ func (c *Crawler) Crawl() {
|
||||
go c.getPage(link, next.depth, up)
|
||||
}
|
||||
}
|
||||
//fmt.Print(strings.Repeat("", len(status)))
|
||||
status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
|
||||
status = fmt.Sprintf("Files %d/%d\n", len(visited), len(visited))
|
||||
fmt.Printf("%s\n", status)
|
||||
}
|
||||
|
||||
@ -214,20 +213,49 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
}
|
||||
}
|
||||
|
||||
WriteFile(string(body), c.outDir+"/"+saveFile)
|
||||
WriteFile(string(body), c.outDir+saveFile)
|
||||
|
||||
fmt.Println("Parsing " + c.outDir + saveFile)
|
||||
|
||||
// Read the body into a buffer
|
||||
bdReader := bytes.NewReader(body)
|
||||
z := html.NewTokenizer(bdReader)
|
||||
tt := z.Next()
|
||||
// Is this an HTML file?
|
||||
if tt != html.DoctypeToken {
|
||||
// Nope, so we're done here
|
||||
// Is this an HTML file
|
||||
switch tt {
|
||||
case html.DoctypeToken:
|
||||
urls = append(urls, c.ParseHTML(body)...)
|
||||
case html.TextToken:
|
||||
parsedURLs := c.ParseText(body)
|
||||
fmt.Println("Found urls in text file: ")
|
||||
// Find file directory
|
||||
urlLoc := saveFile[:strings.LastIndex(saveFile, "/")+1]
|
||||
for i := range parsedURLs {
|
||||
if parsedURLs[i][0] == '/' {
|
||||
parsedURLs[i] = c.FormatURL(parsedURLs[i][1:])
|
||||
} else if parsedURLs[i][0] == '.' {
|
||||
parsedURLs[i] = c.FormatURL(urlLoc + parsedURLs[i])
|
||||
}
|
||||
}
|
||||
urls = append(urls, parsedURLs...)
|
||||
}
|
||||
return string(body), urls, nil
|
||||
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
return string(body), urls, nil
|
||||
}
|
||||
|
||||
for {
|
||||
// ParseHTML tokenizes the HTML document in bd and returns a list of urls
|
||||
func (c *Crawler) ParseHTML(bd []byte) []string {
|
||||
var urls []string
|
||||
bdReader := bytes.NewReader(bd)
|
||||
z := html.NewTokenizer(bdReader)
|
||||
tt := z.Next()
|
||||
for {
|
||||
tt = z.Next()
|
||||
switch {
|
||||
case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
|
||||
t := z.Token()
|
||||
@ -259,12 +287,23 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
return urls
|
||||
}
|
||||
|
||||
return string(body), urls, nil
|
||||
// ParseText parses a text file and returns a list of urls
|
||||
func (c *Crawler) ParseText(bd []byte) []string {
|
||||
var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)
|
||||
var urls []string
|
||||
matches := cssURLs.FindAllSubmatch(bd, -1)
|
||||
for _, v := range matches {
|
||||
urls = append(urls, string(v[1]))
|
||||
}
|
||||
for i := range urls {
|
||||
if urls[i][0] == '\'' || urls[i][0] == '"' {
|
||||
urls[i] = urls[i][1 : len(urls[i])-1]
|
||||
}
|
||||
}
|
||||
return urls
|
||||
}
|
||||
|
||||
// ParseURL parses a URL and returns its Domain and TLD
|
||||
|
Loading…
Reference in New Issue
Block a user