Pulling some css links
parent fe66451056
commit b863bd5bde
@@ -6,6 +6,7 @@ import (
 	"io/ioutil"
 	"net/http"
 	"os"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -130,14 +131,13 @@ func (c *Crawler) Crawl() {
 
 	visited := make(map[string]bool)
 
-	status := fmt.Sprintf("Files %d/%d", len(visited), outstanding+len(visited))
+	status := fmt.Sprintf("Files %d/%d\n", len(visited), outstanding+len(visited))
 	for outstanding > 0 {
 		done := len(visited) - outstanding
 		if done < 0 {
 			done = 0
 		}
-		fmt.Print(strings.Repeat("", len(status)))
-		status = fmt.Sprintf("Files %d/%d", done, len(visited))
+		status = fmt.Sprintf("Files %d/%d\n", done, len(visited))
 		fmt.Print(status)
 		if c.throttle > 0 {
 			time.Sleep(time.Millisecond * c.throttle)
@@ -164,8 +164,7 @@ func (c *Crawler) Crawl() {
 			go c.getPage(link, next.depth, up)
 		}
 	}
-	//fmt.Print(strings.Repeat("", len(status)))
-	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
+	status = fmt.Sprintf("Files %d/%d\n", len(visited), len(visited))
 	fmt.Printf("%s\n", status)
 }
 
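Note on the two status-line hunks above: the removed `strings.Repeat("", len(status))` call looks like an overwrite-in-place trick whose escape character (likely `\b` or `\r`) was lost in extraction; the commit sidesteps it by printing a newline-terminated status each iteration instead. A minimal sketch of the in-place variant using a carriage return (assumes a plain terminal; the names and counts are illustrative, not from this repository):

package main

import (
	"fmt"
	"time"
)

// progress redraws a single status line in place by returning the
// cursor to the start of the line with \r before each reprint.
func progress(done, total int) {
	fmt.Printf("\rFiles %d/%d", done, total)
}

func main() {
	total := 10
	for done := 0; done <= total; done++ {
		progress(done, total)
		time.Sleep(100 * time.Millisecond)
	}
	fmt.Println() // finish the line once the loop ends
}

The `\r` form keeps everything on one line but never emits a newline until the loop finishes, which is why the final `fmt.Println()` matters.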
@@ -214,20 +213,49 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 		}
 	}
 
-	WriteFile(string(body), c.outDir+"/"+saveFile)
+	WriteFile(string(body), c.outDir+saveFile)
 
+	fmt.Println("Parsing " + c.outDir + saveFile)
+
 	// Read the body into a buffer
 	bdReader := bytes.NewReader(body)
 	z := html.NewTokenizer(bdReader)
 	tt := z.Next()
-	// Is this an HTML file?
-	if tt != html.DoctypeToken {
-		// Nope, so we're done here
-		return string(body), urls, nil
+	// Is this an HTML file
+	switch tt {
+	case html.DoctypeToken:
+		urls = append(urls, c.ParseHTML(body)...)
+	case html.TextToken:
+		parsedURLs := c.ParseText(body)
+		fmt.Println("Found urls in text file: ")
+		// Find file directory
+		urlLoc := saveFile[:strings.LastIndex(saveFile, "/")+1]
+		for i := range parsedURLs {
+			if parsedURLs[i][0] == '/' {
+				parsedURLs[i] = c.FormatURL(parsedURLs[i][1:])
+			} else if parsedURLs[i][0] == '.' {
+				parsedURLs[i] = c.FormatURL(urlLoc + parsedURLs[i])
+			}
+		}
+		urls = append(urls, parsedURLs...)
+	}
+	return string(body), urls, nil
+
+	if err != nil {
+		return "", nil, err
 	}
 
+	return string(body), urls, nil
+}
+
+// ParseHTML parses an html tokenizer and returns a list of urls
+func (c *Crawler) ParseHTML(bd []byte) []string {
+	var urls []string
+	bdReader := bytes.NewReader(bd)
+	z := html.NewTokenizer(bdReader)
+	tt := z.Next()
 	for {
-		tt := z.Next()
+		tt = z.Next()
 		switch {
 		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
 			t := z.Token()
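For context on the new `switch tt` dispatch: the tokenizer from `golang.org/x/net/html` returns `html.DoctypeToken` as the first token of a well-formed HTML document, while a plain-text body such as a stylesheet surfaces as `html.TextToken`. A small self-contained sketch of that first-token check (the sample inputs are illustrative, not from this repository):

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// firstTokenType reports the type of the first token the HTML
// tokenizer produces for the given body.
func firstTokenType(body string) html.TokenType {
	z := html.NewTokenizer(strings.NewReader(body))
	return z.Next()
}

func main() {
	fmt.Println(firstTokenType("<!DOCTYPE html><html></html>")) // Doctype
	fmt.Println(firstTokenType("body { color: red; }"))         // Text
}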
@@ -259,12 +287,23 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			break
 		}
 	}
+	return urls
+}
 
-	if err != nil {
-		return "", nil, err
+// ParseText parses a text file and returns a list of urls
+func (c *Crawler) ParseText(bd []byte) []string {
+	var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)
+	var urls []string
+	matches := cssURLs.FindAllSubmatch(bd, -1)
+	for _, v := range matches {
+		urls = append(urls, string(v[1]))
 	}
-
-	return string(body), urls, nil
+	for i := range urls {
+		if urls[i][0] == '\'' || urls[i][0] == '"' {
+			urls[i] = urls[i][1 : len(urls[i])-1]
+		}
+	}
+	return urls
 }
 
 // ParseURL parses a URL and returns its Domain and TLD
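The regex added in `ParseText` is the core of "Pulling some css links": `url\(([^\)]*)\)` captures whatever sits inside a CSS `url(...)` reference, and the second loop strips one layer of wrapping quotes. A standalone re-run of that logic on sample CSS (the input text is illustrative, and an empty-capture guard is added here that the commit itself does not have):

package main

import (
	"fmt"
	"regexp"
)

var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)

// extractCSSURLs mirrors the commit's ParseText logic: collect every
// url(...) capture group, then trim a single pair of wrapping quotes.
// The length check guards against an empty url() capture, which the
// commit's version would index out of range.
func extractCSSURLs(bd []byte) []string {
	var urls []string
	for _, m := range cssURLs.FindAllSubmatch(bd, -1) {
		urls = append(urls, string(m[1]))
	}
	for i := range urls {
		if len(urls[i]) > 1 && (urls[i][0] == '\'' || urls[i][0] == '"') {
			urls[i] = urls[i][1 : len(urls[i])-1]
		}
	}
	return urls
}

func main() {
	css := []byte(`body { background: url("img/bg.png"); }
@font-face { src: url(/fonts/sans.woff2); }`)
	fmt.Println(extractCSSURLs(css)) // [img/bg.png /fonts/sans.woff2]
}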
@@ -347,4 +386,4 @@ func WriteFile(d string, filename string) error {
 
 func printUsage() {
 	fmt.Println("Usage: ...")
 }