From 9afaeee97d22733127ed20f71a1abf5960f68f7e Mon Sep 17 00:00:00 2001
From: Brian Buller
Date: Thu, 30 Jul 2015 14:55:05 -0500
Subject: [PATCH] Mostly Functional

Just need to clean up a few things and we're good to go!
---
 fullscrape.go | 239 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 239 insertions(+)
 create mode 100644 fullscrape.go

diff --git a/fullscrape.go b/fullscrape.go
new file mode 100644
index 0000000..7cb09b8
--- /dev/null
+++ b/fullscrape.go
@@ -0,0 +1,239 @@
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"golang.org/x/net/html"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+)
+
+const PROGRAM_NAME = "fullscrape"
+
+func main() {
+	if len(os.Args) <= 3 {
+		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <out_dir> <depth> [-nx]\n")
+		fmt.Print("  -n: Don't rewrite urls in source files to work locally\n")
+		fmt.Print("  -x: Cross domains when following links\n")
+		os.Exit(1)
+	}
+	fix_urls := true
+	cross_domains := false
+	req_url := os.Args[1] //"http://golang.org/"
+	out_dir := os.Args[2]
+	req_depth, err := strconv.Atoi(os.Args[3]) //4
+	if err != nil {
+		fmt.Print("Invalid Depth specified. Please give a number.\n")
+		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <out_dir> <depth> [-nx]\n")
+		os.Exit(1)
+	}
+	if len(os.Args) > 4 {
+		tst_arg := os.Args[4]
+		if strings.Index(tst_arg, "n") != -1 {
+			fix_urls = false
+		} else if strings.Index(tst_arg, "x") != -1 {
+			cross_domains = true
+		}
+	}
+	if err = CreateDirIfNotExist(out_dir); err != nil {
+		fmt.Printf("Unable to create initial directory %s\n", out_dir)
+		fmt.Printf("Error: %s\n", err)
+		os.Exit(1)
+	}
+
+	c := new(Crawler)
+	// Make sure we have the protocol
+	if strings.Index(req_url, "http") != 0 {
+		req_url = "http://" + req_url
+	}
+	if !strings.HasSuffix(req_url, "/") {
+		req_url = req_url + "/"
+	}
+	c.rootUrl = req_url
+	c.outDir = out_dir
+	c.fixUrls = fix_urls
+	c.xDomain = cross_domains
+
+	c.Crawl(req_url, req_depth)
+}
+
+type unprocessed struct {
+	depth int
+	url   []string
+}
+
+type Crawler struct {
+	rootUrl string
+	outDir  string
+	fixUrls bool
+	xDomain bool
+}
+
+func (c *Crawler) Crawl(url string, depth int) {
+	// Setup channel for inputs to be processed
+	up := make(chan unprocessed, 0)
+
+	// Kick off processing and count how many pages are left to process
+	go c.getPage(url, depth, up)
+	outstanding := 1
+
+	visited := make(map[string]bool)
+	for outstanding > 0 {
+		// Pop a visit from the channel
+		next := <-up
+		outstanding--
+
+		// If we're too deep, skip it
+		if next.depth <= 0 {
+			continue
+		}
+
+		// Loop over all urls to visit from that page
+		for _, link := range next.url {
+			// Check that we haven't visited them before
+			if visited[link] {
+				continue
+			}
+
+			// All good to visit them
+			outstanding++
+			visited[link] = true
+			go c.getPage(link, next.depth, up)
+		}
+	}
+}
+
+func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
+	_, urls, err := c.Fetch(url)
+	//body, urls, err := c.Fetch(url)
+	fmt.Printf("Found: %s\n", url)
+	if err != nil {
+		fmt.Println(err)
+	}
+
+	fmt.Printf("Pulled URLS: %s\n", urls)
+
+	r <- unprocessed{depth - 1, urls}
+}
+
+func (c *Crawler) Fetch(url string) (string, []string, error) {
+	urls := make([]string, 0)
+	// Ok, go get URL
+	response, err := http.Get(url)
+	if err != nil {
+		return "", nil, err
+	}
+	body, err := ioutil.ReadAll(response.Body)
+	response.Body.Close()
+
+	// Save the body to the appropriate directory
+	save_file := strings.TrimPrefix(url, c.rootUrl)
+	if save_file == "" {
+		save_file = "index.html"
+	} else {
+		save_arr := strings.Split(save_file, "/")
+		save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
+		if strings.Index(save_dir, "/") != 0 {
+			save_dir = "/" + save_dir
+		}
+		save_dir = c.outDir + save_dir
+
+		if len(save_arr) > 0 {
+			if err = CreateDirIfNotExist(save_dir); err != nil {
+				fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
+				fmt.Printf("Error: %s\n", err)
+				os.Exit(1)
+			}
+		}
+	}
+
+	WriteFile(string(body), c.outDir+"/"+save_file)
+
+	// Read the body into a buffer
+	bd_reader := bytes.NewReader(body)
+	z := html.NewTokenizer(bd_reader)
+	tt := z.Next()
+	// Is this an HTML file?
+	if tt != html.DoctypeToken {
+		// Nope, so we're done here
+		return string(body), urls, nil
+	}
+
+	for {
+		tt := z.Next()
+		switch {
+		case tt == html.StartTagToken:
+			t := z.Token()
+			if t.Data == "a" || t.Data == "link" {
+				for _, a := range t.Attr {
+					if a.Key == "href" {
+						if c.CheckUrl(a.Val) {
+							urls = append(urls, c.FormatUrl(a.Val))
+						}
+						break
+					}
+				}
+			} else if t.Data == "img" {
+				for _, a := range t.Attr {
+					if a.Key == "src" {
+						if c.CheckUrl(a.Val) {
+							urls = append(urls, c.FormatUrl(a.Val))
+						}
+						break
+					}
+				}
+			}
+		}
+		if tt == html.ErrorToken {
+			break
+		}
+	}
+
+	if err != nil {
+		return "", nil, err
+	}
+
+	return string(body), urls, nil
+}
+
+/* CheckUrl checks if we should follow the url or not
+ * The main purpose is for cross-domain checks
+ */
+func (c *Crawler) CheckUrl(url string) bool {
+	if !c.xDomain {
+		if strings.HasPrefix(url, "http") {
+			return strings.HasPrefix(url, c.rootUrl)
+		}
+	}
+	return true
+}
+
+func (c *Crawler) FormatUrl(url string) string {
+	// If the URL doesn't start with http, then it should be relative
+	if strings.Index(url, "http") != 0 {
+		url = c.rootUrl + url
+	}
+	return url
+}
+
+func CreateDirIfNotExist(dir string) error {
+	// Check if out_dir exists, if not, try to make it
+	_, err := os.Stat(dir)
+	if os.IsNotExist(err) {
+		// Doesn't exist, try to create
+		if err = os.MkdirAll(dir, 0755); err != nil {
+			return err
+		}
+	}
+	// Directory exists, just return
+	return nil
+}
+
+func WriteFile(d string, filename string) error {
+	do := []byte(d)
+	fmt.Printf("Writing %s\n", filename)
+	return ioutil.WriteFile(filename, do, 0664)
+}
\ No newline at end of file