From 14d105740aad905feee5f29b1de05b0ad6c20625 Mon Sep 17 00:00:00 2001
From: Brian Buller
Date: Fri, 14 Aug 2015 17:21:18 -0500
Subject: [PATCH] Looks like it's pretty much working! Yeah, I'm pretty awesome.

---
 fullscrape.go | 129 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 91 insertions(+), 38 deletions(-)

diff --git a/fullscrape.go b/fullscrape.go
index 7cb09b8..224b6db 100644
--- a/fullscrape.go
+++ b/fullscrape.go
@@ -9,36 +9,56 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"time"
 )
 
 const PROGRAM_NAME = "fullscrape"
 
 func main() {
-	if len(os.Args) <= 3 {
-		fmt.Print("Usage: " + PROGRAM_NAME + " [-nx]\n")
-		fmt.Print(" -n: Don't rewrite urls in source files to work locally\n")
-		fmt.Print(" -x: Cross domains when following links\n")
-		os.Exit(1)
-	}
-	fix_urls := true
-	cross_domains := false
 	req_url := os.Args[1] //"http://golang.org/"
 	out_dir := os.Args[2]
-	req_depth, err := strconv.Atoi(os.Args[3]) //4
-	if err != nil {
-		fmt.Print("Invalid Depth specified. Please give a number.\n")
-		fmt.Print("Usage: " + PROGRAM_NAME + " [-n]\n")
-		os.Exit(1)
-	}
+	depthFlag := -1
+	norewriteFlag := false
+	crossdomainFlag := false
+	throttleFlag := 1000
+	var err error
+
 	if len(os.Args) > 3 {
-		tst_arg := os.Args[4]
-		if strings.Index(tst_arg, "n") != -1 {
-			fix_urls = false
-		} else if strings.Index(tst_arg, "x") != -1 {
-			cross_domains = true
+		tst := os.Args[3]
+		depthArg := strings.IndexRune(tst, 'd')
+		if depthArg >= 0 {
+			// The actual depth value should either be depthArg+1
+			// or, if that is '=', depthArg+2
+			if tst[depthArg+1] == '=' {
+				depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+2])
+			} else {
+				depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+1])
+			}
+			if err != nil {
+				fmt.Printf("Invalid depth given (must be an integer): %s\n", depthFlag)
+				os.Exit(1)
+			}
+		}
+		norewriteFlag = (strings.IndexRune(tst, 'n') >= 0)
+		crossdomainFlag = (strings.IndexRune(tst, 'x') >= 0)
+		throttleArg := strings.IndexRune(tst, 't')
+		if throttleArg >= 0 {
+			// The actual throttle value should either be throttleArg+1...
+			// or, if that is '=', throttleArg+2...
+			if tst[depthArg+1] == '=' {
+				// The throttle argument MUST have a space after it
+				throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+2])
+			} else {
+				throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+1])
+			}
+			if err != nil {
+				fmt.Printf("Invalid depth given (must be milliseconds as an integer): %s\n", depthFlag)
+				os.Exit(1)
+			}
+		}
 	}
-	if err = CreateDirIfNotExist(out_dir); err != nil {
+
+	if err := CreateDirIfNotExist(out_dir); err != nil {
 		fmt.Print("Unable to create initial directory %s\n", out_dir)
 		fmt.Print("Error: %s\n", err)
 		os.Exit(1)
@@ -54,10 +74,12 @@ func main() {
 	}
 	c.rootUrl = req_url
 	c.outDir = out_dir
-	c.fixUrls = fix_urls
-	c.xDomain = cross_domains
+	c.fixUrls = norewriteFlag
+	c.xDomain = crossdomainFlag
+	c.depth = depthFlag
+	c.throttle = time.Duration(throttleFlag)
 
-	c.Crawl(req_url, req_depth)
+	c.Crawl()
 }
 
 type unprocessed struct {
@@ -66,28 +88,48 @@ type unprocessed struct {
 }
 
 type Crawler struct {
-	rootUrl string
-	outDir  string
-	fixUrls bool
-	xDomain bool
+	rootUrl  string
+	outDir   string
+	fixUrls  bool
+	xDomain  bool
+	depth    int
+	throttle time.Duration
 }
 
-func (c *Crawler) Crawl(url string, depth int) {
+func (c *Crawler) Crawl() {
+	if c.depth >= 0 {
+		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle)
+	} else {
+		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle)
+	}
+
 	// Setup channel for inputs to be processed
 	up := make(chan unprocessed, 0)
 
 	// Kick off processing and count how many pages are left to process
-	go c.getPage(url, depth, up)
+	go c.getPage(c.rootUrl, c.depth, up)
 	outstanding := 1
 
 	visited := make(map[string]bool)
+
+	status := fmt.Sprintf("Files %d/%d", len(visited), outstanding+len(visited))
 	for outstanding > 0 {
+		done := len(visited) - outstanding
+		if done < 0 {
+			done = 0
+		}
+		fmt.Print(strings.Repeat("", len(status)))
+		status = fmt.Sprintf("Files %d/%d", done, len(visited))
+		fmt.Print(status)
+		if c.throttle > 0 {
+			time.Sleep(time.Millisecond * c.throttle)
+		}
 		// Pop a visit from the channel
 		next := <-up
 		outstanding--
 
 		// If we're too deep, skip it
-		if next.depth <= 0 {
+		if next.depth == 0 {
 			continue
 		}
 
@@ -101,20 +143,23 @@ func (c *Crawler) Crawl(url string, depth int) {
 			// All good to visit them
 			outstanding++
 			visited[link] = true
-			go c.getPage(link, depth, up)
+			go c.getPage(link, next.depth, up)
 		}
 	}
+	fmt.Print(strings.Repeat("", len(status)))
+	status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
+	fmt.Printf("%s\n", status)
 }
 
 func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	_, urls, err := c.Fetch(url)
 	//body, urls, err := c.Fetch(url)
 
-	fmt.Printf("Found: %s\n", url)
+	//fmt.Printf("Found: %s\n", url)
 	if err != nil {
 		fmt.Println(err)
 	}
-	fmt.Printf("Pulled URLS: %s\n", urls)
+	//fmt.Printf("Pulled URLS: %s\n", urls)
 	r <- unprocessed{depth - 1, urls}
 }
 
@@ -123,7 +168,7 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	urls := make([]string, 0)
 	// Ok, go get URL
 	response, err := http.Get(url)
-	if err != nil {
+	if err != nil || response.StatusCode != 200 {
 		return "", nil, err
 	}
 	body, err := ioutil.ReadAll(response.Body)
@@ -165,9 +210,9 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 	for {
 		tt := z.Next()
 		switch {
-		case tt == html.StartTagToken:
+		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
 			t := z.Token()
-			if t.Data == "a" || t.Data == "link" {
+			if t.Data == "link" || t.Data == "a" {
 				for _, a := range t.Attr {
 					if a.Key == "href" {
 						if c.CheckUrl(a.Val) {
@@ -176,7 +221,7 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
 						break
 					}
 				}
-			} else if t.Data == "img" {
+			} else if t.Data == "img" || t.Data == "script" {
 				for _, a := range t.Attr {
 					if a.Key == "src" {
 						if c.CheckUrl(a.Val) {
@@ -203,6 +248,14 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
  * The main purpose is for cross-domain checks
  */
 func (c *Crawler) CheckUrl(url string) bool {
+	// Ignore anchor urls
+	if strings.IndexRune(url, '#') >= 0 {
+		return false
+	}
+	// Ignore "mailto" links
+	if strings.HasPrefix(url, "mailto:") {
+		return false
+	}
 	if !c.xDomain {
 		if strings.HasPrefix(url, "http") {
 			return strings.HasPrefix(url, c.rootUrl)
@@ -234,6 +287,6 @@ func CreateDirIfNotExist(dir string) error {
 
 func WriteFile(d string, filename string) error {
 	do := []byte(d)
-	fmt.Printf("Writing %s\n", filename)
+	//fmt.Printf("Writing %s\n", filename)
 	return ioutil.WriteFile(filename, do, 0664)
 }
\ No newline at end of file