From b863bd5bde53d269dbf8af1b72549a825e53c3a2 Mon Sep 17 00:00:00 2001
From: Brian Buller
Date: Thu, 5 May 2016 09:02:28 -0500
Subject: [PATCH] Pulling some css links

---
 fullscrape.go | 71 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 55 insertions(+), 16 deletions(-)

diff --git a/fullscrape.go b/fullscrape.go
index 287c24e..bbfa56c 100644
--- a/fullscrape.go
+++ b/fullscrape.go
@@ -6,6 +6,7 @@ import (
 	"io/ioutil"
 	"net/http"
 	"os"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -130,14 +131,13 @@ func (c *Crawler) Crawl() {
 
 	visited := make(map[string]bool)
 
-	status := fmt.Sprintf("Files %d/%d", len(visited), outstanding+len(visited))
+	status := fmt.Sprintf("Files %d/%d\n", len(visited), outstanding+len(visited))
 	for outstanding > 0 {
 		done := len(visited) - outstanding
 		if done < 0 {
 			done = 0
 		}
-		fmt.Print(strings.Repeat("", len(status)))
-		status = fmt.Sprintf("Files %d/%d", done, len(visited))
+		status = fmt.Sprintf("Files %d/%d\n", done, len(visited))
 		fmt.Print(status)
 		if c.throttle > 0 {
 			time.Sleep(time.Millisecond * c.throttle)
@@ -164,8 +164,7 @@ func (c *Crawler) Crawl() {
 			go c.getPage(link, next.depth, up)
 		}
 	}
-	//fmt.Print(strings.Repeat("", len(status)))
-	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
+	status = fmt.Sprintf("Files %d/%d\n", len(visited), len(visited))
 	fmt.Printf("%s\n", status)
 }
 
@@ -214,20 +213,49 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 		}
 	}
 
-	WriteFile(string(body), c.outDir+"/"+saveFile)
+	WriteFile(string(body), c.outDir+saveFile)
+
+	fmt.Println("Parsing " + c.outDir + saveFile)
 
 	// Read the body into a buffer
 	bdReader := bytes.NewReader(body)
 	z := html.NewTokenizer(bdReader)
 	tt := z.Next()
-	// Is this an HTML file?
-	if tt != html.DoctypeToken {
-		// Nope, so we're done here
-		return string(body), urls, nil
+	// Is this an HTML file
+	switch tt {
+	case html.DoctypeToken:
+		urls = append(urls, c.ParseHTML(body)...)
+	case html.TextToken:
+		parsedURLs := c.ParseText(body)
+		fmt.Println("Found urls in text file: ")
+		// Find file directory
+		urlLoc := saveFile[:strings.LastIndex(saveFile, "/")+1]
+		for i := range parsedURLs {
+			if parsedURLs[i][0] == '/' {
+				parsedURLs[i] = c.FormatURL(parsedURLs[i][1:])
+			} else if parsedURLs[i][0] == '.' {
+				parsedURLs[i] = c.FormatURL(urlLoc + parsedURLs[i])
+			}
+		}
+		urls = append(urls, parsedURLs...)
+	}
+	return string(body), urls, nil
+
+	if err != nil {
+		return "", nil, err
 	}
+	return string(body), urls, nil
+}
+
+// ParseHTML parses an html tokenizer and returns a list of urls
+func (c *Crawler) ParseHTML(bd []byte) []string {
+	var urls []string
+	bdReader := bytes.NewReader(bd)
+	z := html.NewTokenizer(bdReader)
+	tt := z.Next()
 
 	for {
-		tt := z.Next()
+		tt = z.Next()
 		switch {
 		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
 			t := z.Token()
@@ -259,12 +287,23 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 			break
 		}
 	}
+	return urls
+}
 
-	if err != nil {
-		return "", nil, err
+// ParseText parses a text file and returns a list of urls
+func (c *Crawler) ParseText(bd []byte) []string {
+	var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)
+	var urls []string
+	matches := cssURLs.FindAllSubmatch(bd, -1)
+	for _, v := range matches {
+		urls = append(urls, string(v[1]))
 	}
-
-	return string(body), urls, nil
+	for i := range urls {
+		if urls[i][0] == '\'' || urls[i][0] == '"' {
+			urls[i] = urls[i][1 : len(urls[i])-1]
+		}
+	}
+	return urls
 }
 
 // ParseURL parses a URL and returns its Domain and TLD
@@ -347,4 +386,4 @@ func WriteFile(d string, filename string) error {
 
 func printUsage() {
 	fmt.Println("Usage: ...")
-}
\ No newline at end of file
+}
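
A note on the tail of Fetch as patched: the function returns immediately after the switch, which leaves the retained `if err != nil` block and the second return unreachable (go vet's unreachable-code check flags this). Either the error check belongs before the return, or the dead tail can be dropped; a minimal cleanup of the function's ending, assuming the error check is no longer needed at this point:

	}
	return string(body), urls, nil
}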
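The new ParseText is the heart of the commit: it scans a stylesheet (or any text body) for url(...) references and strips surrounding quotes. One edge case worth noting: ParseText indexes urls[i][0], which panics on an empty url() match. A self-contained sketch of the same extraction with that guard added; extractCSSURLs and the sample stylesheet are illustrative, not part of the patch:

package main

import (
	"fmt"
	"regexp"
)

// Same pattern the patch compiles in ParseText: capture everything
// between "url(" and the next ")".
var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)

// extractCSSURLs mirrors ParseText's extraction and quote-stripping,
// but skips empty or single-character captures before indexing them.
func extractCSSURLs(bd []byte) []string {
	var urls []string
	for _, m := range cssURLs.FindAllSubmatch(bd, -1) {
		u := string(m[1])
		if len(u) >= 2 && (u[0] == '\'' || u[0] == '"') {
			u = u[1 : len(u)-1] // strip surrounding quotes
		}
		if u != "" {
			urls = append(urls, u)
		}
	}
	return urls
}

func main() {
	css := []byte(`body { background: url("/img/bg.png") }
@font-face { src: url('../fonts/site.woff'); }
.empty { background: url() }`)
	fmt.Println(extractCSSURLs(css))
	// Prints: [/img/bg.png ../fonts/site.woff]
}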
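In the html.TextToken branch, references are normalized by hand: a leading '/' is treated as site-root-relative and handed to FormatURL without the slash, while a leading '.' is resolved against the file's own directory (urlLoc); anything else, including absolute and protocol-relative references, passes through untouched. If broader resolution is ever needed, net/url already implements RFC 3986 reference resolution. A hedged alternative sketch, with illustrative names and base URL:

package main

import (
	"fmt"
	"net/url"
)

// resolveRef resolves a reference pulled from a stylesheet against the
// URL the stylesheet itself was fetched from. Unlike the prefix checks
// in Fetch's TextToken branch, this also handles "../x" chains and
// protocol-relative "//host/x" references.
func resolveRef(base, ref string) (string, error) {
	b, err := url.Parse(base)
	if err != nil {
		return "", err
	}
	r, err := url.Parse(ref)
	if err != nil {
		return "", err
	}
	return b.ResolveReference(r).String(), nil
}

func main() {
	base := "http://example.com/css/site.css" // illustrative, not from the patch
	for _, ref := range []string{"/img/bg.png", "./logo.svg", "../fonts/site.woff"} {
		resolved, _ := resolveRef(base, ref)
		fmt.Printf("%-22s -> %s\n", ref, resolved)
	}
}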