package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"

	"golang.org/x/net/html"
)

const programName = "fullscrape"

func main() {
	if len(os.Args) <= 1 {
		printUsage()
		os.Exit(1)
	}
	reqURL := os.Args[1] // e.g. "http://golang.org/"
	outDir := ""
	if len(os.Args) > 2 {
		outDir = os.Args[2]
	}
	depthFlag := -1
	norewriteFlag := false
	crossdomainFlag := false
	throttleFlag := 1000
	var err error
	if len(os.Args) > 3 {
		tst := os.Args[3]
		depthArg := strings.IndexRune(tst, 'd')
		if depthArg >= 0 {
			// The depth value follows the 'd', optionally separated by '='
			depthFlag, err = parseIntOption(tst, depthArg)
			if err != nil {
				fmt.Printf("Invalid depth given (must be an integer): %s\n", tst)
				os.Exit(1)
			}
		}
		norewriteFlag = strings.IndexRune(tst, 'n') >= 0
		crossdomainFlag = strings.IndexRune(tst, 'x') >= 0
		throttleArg := strings.IndexRune(tst, 't')
		if throttleArg >= 0 {
			// Same layout as the depth flag: the value follows the 't',
			// optionally separated by '='
			throttleFlag, err = parseIntOption(tst, throttleArg)
			if err != nil {
				fmt.Printf("Invalid throttle given (must be milliseconds as an integer): %s\n", tst)
				os.Exit(1)
			}
		}
	}
	if err := CreateDirIfNotExist(outDir); err != nil {
		fmt.Printf("Unable to create initial directory %s\n", outDir)
		fmt.Printf("Error: %s\n", err)
		os.Exit(1)
	}

	c := new(Crawler)
	// Make sure we have the protocol
	if strings.Index(reqURL, "http") != 0 {
		reqURL = "http://" + reqURL
	}
	if !strings.HasSuffix(reqURL, "/") {
		reqURL = reqURL + "/"
	}
	c.rootURL = reqURL
	c.outDir = outDir
	c.noRewrite = norewriteFlag
	c.xDomain = crossdomainFlag
	c.depth = depthFlag
	// throttle holds a millisecond count; it is scaled when sleeping
	c.throttle = time.Duration(throttleFlag)

	// Parse out the Domain and TLD
	c.domain, c.tld, err = c.ParseURL(c.rootURL)
	if err != nil {
		fmt.Print(err)
		os.Exit(1)
	}
	c.Crawl()
}

// parseIntOption parses the integer that follows the option character at
// position idx in opts; an optional '=' may separate the two (e.g. "d3"
// or "d=3").
func parseIntOption(opts string, idx int) (int, error) {
	start := idx + 1
	if start < len(opts) && opts[start] == '=' {
		start++
	}
	end := start
	for end < len(opts) && opts[end] >= '0' && opts[end] <= '9' {
		end++
	}
	return strconv.Atoi(opts[start:end])
}
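// A few illustrative invocations, given the option parsing above (the
// url and directory names below are placeholders):
//
//	fullscrape golang.org out            crawl with the defaults
//	fullscrape golang.org out d=2        follow links at most 2 levels deep
//	fullscrape golang.org out d3nx       depth 3, no rewriting, cross-domain
//	fullscrape golang.org out t=500      wait 500ms between requests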
type unprocessed struct {
	depth int
	url   []string
}

// Crawler crawls
type Crawler struct {
	rootURL   string
	outDir    string
	noRewrite bool
	xDomain   bool
	depth     int
	throttle  time.Duration
	domain    string
	tld       string
}

// Crawl tells the crawler to start crawlin'
func (c *Crawler) Crawl() {
	if c.depth >= 0 {
		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n",
			c.rootURL, c.depth, c.noRewrite, c.xDomain, c.throttle)
	} else {
		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n",
			c.rootURL, c.noRewrite, c.xDomain, c.throttle)
	}

	// Setup channel for inputs to be processed
	up := make(chan unprocessed)

	// Kick off processing and count how many pages are left to process
	go c.getPage(c.rootURL, c.depth, up)
	outstanding := 1

	visited := make(map[string]bool)
	visited[c.rootURL] = true // don't re-crawl the root if a page links back to it
	for outstanding > 0 {
		done := len(visited) - outstanding
		if done < 0 {
			done = 0
		}
		fmt.Printf("Files %d/%d\n", done, len(visited))

		if c.throttle > 0 {
			time.Sleep(time.Millisecond * c.throttle)
		}

		// Pop a visit from the channel
		next := <-up
		outstanding--

		// If we're too deep, skip it
		if next.depth == 0 {
			continue
		}

		// Loop over all urls to visit from that page
		for _, link := range next.url {
			// Check that we haven't visited them before
			if visited[link] {
				continue
			}
			// All good to visit them
			outstanding++
			visited[link] = true
			go c.getPage(link, next.depth, up)
		}
	}
	fmt.Printf("Files %d/%d\n", len(visited), len(visited))
}

func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
	_, urls, err := c.Fetch(url)
	fmt.Printf("Found: %s\n", url)
	if err != nil {
		fmt.Println(err)
	}
	r <- unprocessed{depth - 1, urls}
}

// Fetch gets a page, saves it under outDir, and returns the body along
// with any urls the page references
func (c *Crawler) Fetch(url string) (string, []string, error) {
	var urls []string

	// Ok, go get URL
	response, err := http.Get(url)
	if err != nil {
		return "", nil, err
	}
	body, err := ioutil.ReadAll(response.Body)
	response.Body.Close()
	if err != nil {
		return "", nil, err
	}
	if response.StatusCode != 200 {
		return "", nil, fmt.Errorf("got status %d fetching %s", response.StatusCode, url)
	}

	// Save the body to the appropriate directory
	saveFile := strings.TrimPrefix(url, c.rootURL)
	if saveFile == "" {
		saveFile = "index.html"
	} else {
		saveArr := strings.Split(saveFile, "/")
		saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
		if strings.Index(saveDir, "/") != 0 {
			saveDir = "/" + saveDir
		}
		saveDir = c.outDir + saveDir
		if err = CreateDirIfNotExist(saveDir); err != nil {
			fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
			fmt.Printf("Error: %s\n", err)
			os.Exit(1)
		}
	}
	if err = WriteFile(string(body), c.outDir+"/"+saveFile); err != nil {
		return "", nil, err
	}

	fmt.Println("Parsing " + c.outDir + "/" + saveFile)
	// Check the first token to decide whether this is an HTML document
	// or a plain text file (e.g. CSS)
	z := html.NewTokenizer(bytes.NewReader(body))
	switch z.Next() {
	case html.DoctypeToken:
		urls = append(urls, c.ParseHTML(body)...)
	case html.TextToken:
		parsedURLs := c.ParseText(body)
		fmt.Printf("Found %d urls in text file\n", len(parsedURLs))
		// Resolve the urls relative to the file's directory
		urlLoc := saveFile[:strings.LastIndex(saveFile, "/")+1]
		for i := range parsedURLs {
			if parsedURLs[i] == "" {
				continue
			}
			if parsedURLs[i][0] == '/' {
				parsedURLs[i] = c.FormatURL(parsedURLs[i][1:])
			} else if parsedURLs[i][0] == '.' {
				parsedURLs[i] = c.FormatURL(urlLoc + parsedURLs[i])
			}
		}
		urls = append(urls, parsedURLs...)
	}
	return string(body), urls, nil
}
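// How Fetch maps urls onto the output directory, assuming rootURL is
// "http://example.com/" and outDir is "out" (both hypothetical):
//
//	http://example.com/           -> out/index.html
//	http://example.com/css/a.css  -> out/css/a.css   (out/css is created first)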
// ParseHTML walks an html document and returns the urls found in link,
// anchor, img, and script tags
func (c *Crawler) ParseHTML(bd []byte) []string {
	var urls []string
	z := html.NewTokenizer(bytes.NewReader(bd))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		t := z.Token()
		// "link" and "a" tags carry their url in "href";
		// "img" and "script" tags carry it in "src"
		key := ""
		if t.Data == "link" || t.Data == "a" {
			key = "href"
		} else if t.Data == "img" || t.Data == "script" {
			key = "src"
		}
		if key == "" {
			continue
		}
		for _, a := range t.Attr {
			if a.Key == key {
				if c.CheckURL(a.Val) {
					urls = append(urls, c.FormatURL(a.Val))
				} else {
					fmt.Printf("CheckURL Failed For: %s\n", a.Val)
				}
				break
			}
		}
	}
	return urls
}

// ParseText parses a text file (e.g. CSS) and returns a list of urls
func (c *Crawler) ParseText(bd []byte) []string {
	var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)
	var urls []string
	matches := cssURLs.FindAllSubmatch(bd, -1)
	for _, v := range matches {
		urls = append(urls, string(v[1]))
	}
	// Strip surrounding quotes: url('foo.png') and url("foo.png")
	// both become foo.png
	for i := range urls {
		if len(urls[i]) >= 2 && (urls[i][0] == '\'' || urls[i][0] == '"') {
			urls[i] = urls[i][1 : len(urls[i])-1]
		}
	}
	return urls
}

// ParseURL parses a URL and returns its Domain and TLD
func (c *Crawler) ParseURL(url string) (string, string, error) {
	partSlice := strings.Split(url, "//")
	if len(partSlice) < 2 {
		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
	}
	server := partSlice[1]
	partSlice = strings.Split(server, "/")
	server = partSlice[0]
	partSlice = strings.Split(server, ".")
	// We just want the last two parts of the slice
	if len(partSlice) < 2 {
		// There aren't two parts to the URL?! That's not right...
		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
	}
	partSlice = partSlice[len(partSlice)-2:]
	return partSlice[0], partSlice[1], nil
}

// CheckURL checks if we should follow the url or not
// The main purpose is for cross-domain checks
func (c *Crawler) CheckURL(url string) bool {
	// Ignore anchor urls
	if strings.IndexRune(url, '#') >= 0 {
		return false
	}
	// Ignore "mailto" links
	if strings.HasPrefix(url, "mailto:") {
		return false
	}
	if !c.xDomain {
		if strings.HasPrefix(url, "http") {
			// Make sure that this url is in the same domain
			tstDmn, tstTld, err := c.ParseURL(url)
			if err != nil {
				// Error parsing the Domain/TLD out of the URL...
				return false
			}
			return tstDmn == c.domain && tstTld == c.tld
		}
	}
	return true
}

// FormatURL takes a url and, unless it starts with "http",
// appends it to the end of c.rootURL
func (c *Crawler) FormatURL(url string) string {
	// If the URL doesn't start with http, then it should be relative
	if strings.Index(url, "http") != 0 {
		url = c.rootURL + url
	}
	return url
}

// CreateDirIfNotExist checks if directory 'dir' exists
// and creates it if it doesn't
func CreateDirIfNotExist(dir string) error {
	if _, err := os.Stat(dir); os.IsNotExist(err) {
		// Doesn't exist, try to create
		if err = os.MkdirAll(dir, 0755); err != nil {
			return err
		}
	}
	return nil
}

// WriteFile writes the data 'd' to 'filename'
func WriteFile(d string, filename string) error {
	return ioutil.WriteFile(filename, []byte(d), 0664)
}

func printUsage() {
	fmt.Printf("Usage: %s url [out-dir] [options]\n", programName)
	fmt.Println("Options are packed into a single argument:")
	fmt.Println("  d[=]N  maximum crawl depth (default: unlimited)")
	fmt.Println("  n      don't rewrite urls in saved files")
	fmt.Println("  x      follow cross-domain links")
	fmt.Println("  t[=]N  milliseconds to wait between requests (default: 1000)")
}
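// A quick sanity check for ParseURL, sketched here as a comment; in a
// real project it would live in a _test.go file:
//
//	c := new(Crawler)
//	domain, tld, err := c.ParseURL("http://blog.golang.org/pkg/")
//	// domain == "golang", tld == "org", err == nil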