// fullscrape downloads a site into a local directory, following links up to
// the requested depth.
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"strconv"
	"strings"

	"golang.org/x/net/html"
)

const PROGRAM_NAME = "fullscrape"

func main() {
	if len(os.Args) <= 3 {
		fmt.Print("Usage: " + PROGRAM_NAME + " url out_dir depth [-nx]\n")
		fmt.Print("  -n: Don't rewrite urls in source files to work locally\n")
		fmt.Print("  -x: Cross domains when following links\n")
		os.Exit(1)
	}
	fix_urls := true
	cross_domains := false
	req_url := os.Args[1]                      // e.g. "http://golang.org/"
	out_dir := os.Args[2]
	req_depth, err := strconv.Atoi(os.Args[3]) // e.g. 4
	if err != nil {
		fmt.Print("Invalid depth specified. Please give a number.\n")
		fmt.Print("Usage: " + PROGRAM_NAME + " url out_dir depth [-nx]\n")
		os.Exit(1)
	}
	// Optional flags come after the three required arguments
	if len(os.Args) > 4 {
		tst_arg := os.Args[4]
		if strings.Contains(tst_arg, "n") {
			fix_urls = false
		}
		if strings.Contains(tst_arg, "x") {
			cross_domains = true
		}
	}
	if err = CreateDirIfNotExist(out_dir); err != nil {
		fmt.Printf("Unable to create initial directory %s\n", out_dir)
		fmt.Printf("Error: %s\n", err)
		os.Exit(1)
	}

	c := new(Crawler)
	// Make sure we have the protocol
	if !strings.HasPrefix(req_url, "http") {
		req_url = "http://" + req_url
	}
	if !strings.HasSuffix(req_url, "/") {
		req_url = req_url + "/"
	}
	c.rootUrl = req_url
	c.outDir = out_dir
	c.fixUrls = fix_urls
	c.xDomain = cross_domains

	c.Crawl(req_url, req_depth)
}

type unprocessed struct {
	depth int
	url   []string
}

type Crawler struct {
	rootUrl string
	outDir  string
	fixUrls bool
	xDomain bool
}

func (c *Crawler) Crawl(url string, depth int) {
	// Set up a channel for pages waiting to be processed
	up := make(chan unprocessed)

	// Kick off processing and count how many pages are left to process
	go c.getPage(url, depth, up)
	outstanding := 1

	visited := make(map[string]bool)
	for outstanding > 0 {
		// Pop a visit from the channel
		next := <-up
		outstanding--

		// If we're too deep, skip it
		if next.depth <= 0 {
			continue
		}

		// Loop over all urls to visit from that page
		for _, link := range next.url {
			// Check that we haven't visited them before
			if visited[link] {
				continue
			}
			// All good to visit them, carrying the remaining depth forward
			outstanding++
			visited[link] = true
			go c.getPage(link, next.depth, up)
		}
	}
}

func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
	_, urls, err := c.Fetch(url)
	fmt.Printf("Found: %s\n", url)
	if err != nil {
		fmt.Println(err)
	}
	fmt.Printf("Pulled URLS: %s\n", urls)
	r <- unprocessed{depth - 1, urls}
}

func (c *Crawler) Fetch(url string) (string, []string, error) {
	urls := make([]string, 0)
	// Ok, go get URL
	response, err := http.Get(url)
	if err != nil {
		return "", nil, err
	}
	body, err := ioutil.ReadAll(response.Body)
	response.Body.Close()
	if err != nil {
		return "", nil, err
	}

	// Save the body to the appropriate directory
	save_file := strings.TrimPrefix(url, c.rootUrl)
	if save_file == "" {
		save_file = "index.html"
	} else {
		save_arr := strings.Split(save_file, "/")
		save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
		if !strings.HasPrefix(save_dir, "/") {
			save_dir = "/" + save_dir
		}
		save_dir = c.outDir + save_dir
		if len(save_arr) > 0 {
			if err = CreateDirIfNotExist(save_dir); err != nil {
				fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
				fmt.Printf("Error: %s\n", err)
				os.Exit(1)
			}
		}
	}
	WriteFile(string(body), c.outDir+"/"+save_file)

	// Read the body into a buffer and tokenize it
	bd_reader := bytes.NewReader(body)
	z := html.NewTokenizer(bd_reader)
	tt := z.Next()
	// Is this an HTML file?
	if tt != html.DoctypeToken {
		// Nope, so we're done here
		return string(body), urls, nil
	}
	// Walk the tokens, collecting linked urls from <a>, <link> and <img> tags
	for {
		tt := z.Next()
		switch {
		case tt == html.StartTagToken:
			t := z.Token()
			if t.Data == "a" || t.Data == "link" {
				for _, a := range t.Attr {
					if a.Key == "href" {
						if c.CheckUrl(a.Val) {
							urls = append(urls, c.FormatUrl(a.Val))
						}
						break
					}
				}
			} else if t.Data == "img" {
				for _, a := range t.Attr {
					if a.Key == "src" {
						if c.CheckUrl(a.Val) {
							urls = append(urls, c.FormatUrl(a.Val))
						}
						break
					}
				}
			}
		}
		if tt == html.ErrorToken {
			break
		}
	}
	return string(body), urls, nil
}

/* CheckUrl checks if we should follow the url or not
 * The main purpose is for cross-domain checks
 */
func (c *Crawler) CheckUrl(url string) bool {
	if !c.xDomain {
		if strings.HasPrefix(url, "http") {
			return strings.HasPrefix(url, c.rootUrl)
		}
	}
	return true
}

// FormatUrl turns relative urls into absolute urls rooted at the crawl root
func (c *Crawler) FormatUrl(url string) string {
	// If the URL doesn't start with http, then it should be relative
	if !strings.HasPrefix(url, "http") {
		url = c.rootUrl + url
	}
	return url
}

// CreateDirIfNotExist creates dir (and any parents) if it doesn't already exist
func CreateDirIfNotExist(dir string) error {
	// Check if the directory exists; if not, try to make it
	_, err := os.Stat(dir)
	if os.IsNotExist(err) {
		// Doesn't exist, try to create
		if err = os.MkdirAll(dir, 0755); err != nil {
			return err
		}
	}
	// Directory exists, just return
	return nil
}

// WriteFile saves the fetched document d to filename
func WriteFile(d string, filename string) error {
	do := []byte(d)
	fmt.Printf("Writing %s\n", filename)
	return ioutil.WriteFile(filename, do, 0664)
}