Pulling some css links

Brian Buller 2016-05-05 09:02:28 -05:00
parent fe66451056
commit b863bd5bde
1 changed file with 55 additions and 16 deletions

@@ -6,6 +6,7 @@ import (
 	"io/ioutil"
 	"net/http"
 	"os"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -130,14 +131,13 @@ func (c *Crawler) Crawl() {
 	visited := make(map[string]bool)
-	status := fmt.Sprintf("Files %d/%d", len(visited), outstanding+len(visited))
+	status := fmt.Sprintf("Files %d/%d\n", len(visited), outstanding+len(visited))
 	for outstanding > 0 {
 		done := len(visited) - outstanding
 		if done < 0 {
 			done = 0
 		}
-		fmt.Print(strings.Repeat("\b", len(status)))
-		status = fmt.Sprintf("Files %d/%d", done, len(visited))
+		status = fmt.Sprintf("Files %d/%d\n", done, len(visited))
 		fmt.Print(status)
 		if c.throttle > 0 {
 			time.Sleep(time.Millisecond * c.throttle)
@@ -164,8 +164,7 @@ func (c *Crawler) Crawl() {
 			go c.getPage(link, next.depth, up)
 		}
 	}
-	//fmt.Print(strings.Repeat("\b", len(status)))
-	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
+	status = fmt.Sprintf("Files %d/%d\n", len(visited), len(visited))
 	fmt.Printf("%s\n", status)
 }
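
Aside: the removed lines above relied on the backspace trick, printing "\b" once per character of the previous status so the next print overwrites it in place; this commit drops that in favor of newline-terminated status lines. A minimal, self-contained sketch of the old idiom (the 5-file countdown is invented for illustration):

package main

import (
	"fmt"
	"strings"
	"time"
)

func main() {
	status := ""
	for done := 0; done <= 5; done++ {
		// Back up over the previous status, then overwrite it in place.
		fmt.Print(strings.Repeat("\b", len(status)))
		status = fmt.Sprintf("Files %d/%d", done, 5)
		fmt.Print(status)
		time.Sleep(200 * time.Millisecond)
	}
	fmt.Println()
}

With the committed change, each status lands on its own line instead: less terminal animation, but the output survives intact in piped logs.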
@@ -214,20 +213,49 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 		}
 	}
-	WriteFile(string(body), c.outDir+"/"+saveFile)
+	WriteFile(string(body), c.outDir+saveFile)
+	fmt.Println("Parsing " + c.outDir + saveFile)
 	// Read the body into a buffer
 	bdReader := bytes.NewReader(body)
 	z := html.NewTokenizer(bdReader)
 	tt := z.Next()
-	// Is this an HTML file?
-	if tt != html.DoctypeToken {
-		// Nope, so we're done here
-		return string(body), urls, nil
+	// Is this an HTML file
+	switch tt {
+	case html.DoctypeToken:
+		urls = append(urls, c.ParseHTML(body)...)
+	case html.TextToken:
+		parsedURLs := c.ParseText(body)
+		fmt.Println("Found urls in text file: ")
+		// Find file directory
+		urlLoc := saveFile[:strings.LastIndex(saveFile, "/")+1]
+		for i := range parsedURLs {
+			if parsedURLs[i][0] == '/' {
+				parsedURLs[i] = c.FormatURL(parsedURLs[i][1:])
+			} else if parsedURLs[i][0] == '.' {
+				parsedURLs[i] = c.FormatURL(urlLoc + parsedURLs[i])
+			}
+		}
+		urls = append(urls, parsedURLs...)
 	}
+	return string(body), urls, nil
+	if err != nil {
+		return "", nil, err
+	}
+	return string(body), urls, nil
+}
+
+// ParseHTML parses an html tokenizer and returns a list of urls
+func (c *Crawler) ParseHTML(bd []byte) []string {
+	var urls []string
+	bdReader := bytes.NewReader(bd)
+	z := html.NewTokenizer(bdReader)
+	tt := z.Next()
 	for {
-		tt := z.Next()
+		tt = z.Next()
 		switch {
 		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
 			t := z.Token()
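
Aside: the new switch keys off the first token that the golang.org/x/net/html tokenizer produces. A well-formed HTML document leads with a DoctypeToken, while a stylesheet or other plain text surfaces as a TextToken, which is what routes CSS bodies into ParseText. A small self-contained probe of that behavior (sample inputs invented):

package main

import (
	"bytes"
	"fmt"

	"golang.org/x/net/html"
)

// kind reports the first token type the tokenizer sees in body.
func kind(body []byte) html.TokenType {
	z := html.NewTokenizer(bytes.NewReader(body))
	return z.Next()
}

func main() {
	fmt.Println(kind([]byte("<!DOCTYPE html><html></html>"))) // Doctype
	fmt.Println(kind([]byte("body { color: red; }")))         // Text
}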
@@ -259,12 +287,23 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			break
 		}
 	}
-	if err != nil {
-		return "", nil, err
-	}
-	return string(body), urls, nil
+	return urls
+}
+
+// ParseText parses a text file and returns a list of urls
+func (c *Crawler) ParseText(bd []byte) []string {
+	var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)
+	var urls []string
+	matches := cssURLs.FindAllSubmatch(bd, -1)
+	for _, v := range matches {
+		urls = append(urls, string(v[1]))
+	}
+	for i := range urls {
+		if urls[i][0] == '\'' || urls[i][0] == '"' {
+			urls[i] = urls[i][1 : len(urls[i])-1]
+		}
+	}
+	return urls
 }

 // ParseURL parses a URL and returns its Domain and TLD
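
Aside: ParseText's pattern `url\(([^\)]*)\)` captures whatever sits between url( and ) in a stylesheet, quotes included, and the second loop strips one layer of single or double quotes. A standalone check of that extraction (the sample CSS is invented):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	cssURLs := regexp.MustCompile(`url\(([^\)]*)\)`)
	css := []byte(`.a { background: url('/img/bg.png'); } .b { src: url(../fonts/x.woff); }`)
	for _, m := range cssURLs.FindAllSubmatch(css, -1) {
		u := string(m[1])
		// Strip one layer of quotes, as ParseText does.
		if len(u) > 0 && (u[0] == '\'' || u[0] == '"') {
			u = u[1 : len(u)-1]
		}
		fmt.Println(u) // prints /img/bg.png, then ../fonts/x.woff
	}
}

Root-relative (/) and relative (.) results like these are exactly what the new TextToken branch in Fetch resolves against the saved file's directory via FormatURL.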
@@ -347,4 +386,4 @@ func WriteFile(d string, filename string) error {
 func printUsage() {
 	fmt.Println("Usage: ...")
 }
-}
+}