Mostly Functional

Just need to clean up a few things and we're good to go!
2015-07-30 14:55:05 -05:00
parent 8f0899e8a9
commit 9afaeee97d
1 changed files with 239 additions and 0 deletions
--- a/fullscrape.go
+++ b/fullscrape.go
@@ -0,0 +1,239 @@
 package main
 import (
 	"bytes"
 	"fmt"
 	"golang.org/x/net/html"
 	"io/ioutil"
 	"net/http"
 	"os"
 	"strconv"
 	"strings"
 )
 // PROGRAM_NAME is the executable name printed in the usage messages.
 const PROGRAM_NAME = "fullscrape"
 // main parses the command line, makes sure the output directory exists and
 // starts the crawl.
 //
 // Usage: fullscrape <url> <output-directory> <depth> [-nx]
 //   -n: don't rewrite urls in source files to work locally
 //   -x: cross domains when following links
 //
 // The process exits with status 1 on missing arguments, a non-numeric depth
 // or when the output directory cannot be created.
 func main() {
 	if len(os.Args) <= 3 {
 		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <output-directory> <depth> [-nx]\n")
 		fmt.Print("      -n: Don't rewrite urls in source files to work locally\n")
 		fmt.Print("      -x: Cross domains when following links\n")
 		os.Exit(1)
 	}
 	fix_urls := true
 	cross_domains := false
 	req_url := os.Args[1] //"http://golang.org/"
 	out_dir := os.Args[2]
 	req_depth, err := strconv.Atoi(os.Args[3]) //4
 	if err != nil {
 		fmt.Print("Invalid Depth specified. Please give a number.\n")
 		fmt.Print("Usage: " + PROGRAM_NAME + " <url> <output-directory> <depth> [-nx]\n")
 		os.Exit(1)
 	}
 	// The flag argument is optional: only read os.Args[4] when it was
 	// actually supplied, otherwise indexing it panics.
 	if len(os.Args) > 4 {
 		tst_arg := os.Args[4]
 		// The flags are independent so that "-nx" enables both.
 		if strings.Contains(tst_arg, "n") {
 			fix_urls = false
 		}
 		if strings.Contains(tst_arg, "x") {
 			cross_domains = true
 		}
 	}
 	if err = CreateDirIfNotExist(out_dir); err != nil {
 		fmt.Printf("Unable to create initial directory %s\n", out_dir)
 		fmt.Printf("Error: %s\n", err)
 		os.Exit(1)
 	}
 	// Make sure we have the protocol
 	if !strings.HasPrefix(req_url, "http") {
 		req_url = "http://" + req_url
 	}
 	// The root url always ends in "/" so relative links can be appended to it
 	if !strings.HasSuffix(req_url, "/") {
 		req_url = req_url + "/"
 	}
 	c := new(Crawler)
 	c.rootUrl = req_url
 	c.outDir = out_dir
 	c.fixUrls = fix_urls
 	c.xDomain = cross_domains
 	c.Crawl(req_url, req_depth)
 }
 // unprocessed is the result a getPage goroutine sends back to Crawl for one
 // fetched page.
 type unprocessed struct {
 	// depth is the depth getPage was called with, minus one.
 	depth int
 	// url holds the links returned by Fetch for the page.
 	url   []string
 }
 // Crawler carries the settings shared by every page fetch of one crawl.
 type Crawler struct {
 	// rootUrl is the start url; main normalises it to begin with "http"
 	// and to end with "/". Relative links are appended to it.
 	rootUrl string
 	// outDir is the directory fetched pages are written beneath.
 	outDir  string
 	// fixUrls is cleared by the -n flag; it is not read by any code
 	// visible in this file yet.
 	fixUrls bool
 	// xDomain is set by the -x flag and lets CheckUrl accept links that
 	// do not start with rootUrl.
 	xDomain bool
 }
 // Crawl fetches url and then follows the links found on each fetched page
 // until the depth budget is used up.
 //
 // Every page is fetched by its own getPage goroutine, which reports exactly
 // one unprocessed value on a shared channel. Crawl returns once every
 // started goroutine has reported. A depth of 1 fetches only url itself;
 // each further level follows one more hop of links. A url is fetched at
 // most once per crawl.
 func (c *Crawler) Crawl(url string, depth int) {
 	// Setup channel for inputs to be processed
 	up := make(chan unprocessed)
 	// The start page counts as visited so links back to it are not refetched
 	visited := map[string]bool{url: true}
 	// Kick off processing and count how many pages are left to process
 	go c.getPage(url, depth, up)
 	outstanding := 1
 	for outstanding > 0 {
 		// Pop a visit from the channel
 		next := <-up
 		outstanding--
 		// If we're too deep, skip it
 		if next.depth <= 0 {
 			continue
 		}
 		// Loop over all urls to visit from that page
 		for _, link := range next.url {
 			// Check that we haven't visited them before
 			if visited[link] {
 				continue
 			}
 			// All good to visit them
 			outstanding++
 			visited[link] = true
 			// Hand down the remaining depth reported by the parent page,
 			// not the initial depth, so the limit is actually enforced.
 			go c.getPage(link, next.depth, up)
 		}
 	}
 }
 // getPage fetches a single url through Fetch, logs the outcome to stdout and
 // sends one unprocessed value on r carrying depth-1 and the links Fetch
 // returned. A fetch error is printed; the value is sent regardless.
 func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
 	_, links, fetchErr := c.Fetch(url)
 	fmt.Printf("Found: %s\n", url)
 	if fetchErr != nil {
 		fmt.Println(fetchErr)
 	}
 	fmt.Printf("Pulled URLS: %s\n", links)
 	result := unprocessed{depth: depth - 1, url: links}
 	r <- result
 }
 // Fetch downloads url, stores the response body beneath c.outDir and returns
 // the body together with the links found in it.
 //
 // The local file name is url with the c.rootUrl prefix removed. An empty
 // name, or one ending in "/", is stored as "index.html" in that directory.
 // Links are read from the href attribute of <a> and <link> tags and from the
 // src attribute of <img> tags; each is filtered through CheckUrl and
 // normalised with FormatUrl. A body whose first token is not a doctype is
 // treated as non-HTML and yields no links.
 //
 // An error is returned when the request fails, the body cannot be read or
 // the target directory cannot be created. A failed file write is only
 // reported on stdout so the page's links can still be followed.
 func (c *Crawler) Fetch(url string) (string, []string, error) {
 	var urls []string
 	// Ok, go get URL
 	response, err := http.Get(url)
 	if err != nil {
 		return "", nil, err
 	}
 	defer response.Body.Close()
 	body, err := ioutil.ReadAll(response.Body)
 	if err != nil {
 		return "", nil, err
 	}
 	// Work out where the body is saved below the output directory
 	saveFile := strings.TrimPrefix(url, c.rootUrl)
 	if saveFile == "" || strings.HasSuffix(saveFile, "/") {
 		// A directory-style url has no file name of its own
 		saveFile += "index.html"
 	}
 	if idx := strings.LastIndex(saveFile, "/"); idx >= 0 {
 		saveDir := c.outDir + "/" + strings.TrimPrefix(saveFile[:idx], "/")
 		if err = CreateDirIfNotExist(saveDir); err != nil {
 			return "", nil, fmt.Errorf("creating directory %s: %v", saveDir, err)
 		}
 	}
 	savePath := c.outDir + "/" + saveFile
 	if err = WriteFile(string(body), savePath); err != nil {
 		// Best effort: keep crawling even if this page could not be stored
 		fmt.Printf("Unable to write %s: %s\n", savePath, err)
 	}
 	// Tokenize the body to look for links
 	z := html.NewTokenizer(bytes.NewReader(body))
 	// Is this an HTML file?
 	if z.Next() != html.DoctypeToken {
 		// Nope, so we're done here
 		return string(body), urls, nil
 	}
 	for {
 		tt := z.Next()
 		if tt == html.ErrorToken {
 			// End of input (or a malformed document): stop scanning
 			break
 		}
 		// Tags written as <img ... /> arrive as self-closing tokens
 		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
 			continue
 		}
 		t := z.Token()
 		// Pick the attribute that carries the link for this tag
 		var key string
 		switch t.Data {
 		case "a", "link":
 			key = "href"
 		case "img":
 			key = "src"
 		default:
 			continue
 		}
 		for _, a := range t.Attr {
 			if a.Key == key {
 				if c.CheckUrl(a.Val) {
 					urls = append(urls, c.FormatUrl(a.Val))
 				}
 				break
 			}
 		}
 	}
 	return string(body), urls, nil
 }
 /* CheckUrl reports whether a link should be followed.
 * With cross-domain crawling enabled every link is accepted. Otherwise a
 * link starting with "http" is accepted only when it starts with the
 * crawler's root url; any other link is accepted as-is.
 */
 func (c *Crawler) CheckUrl(url string) bool {
 	if c.xDomain || !strings.HasPrefix(url, "http") {
 		return true
 	}
 	return strings.HasPrefix(url, c.rootUrl)
 }
 // FormatUrl turns a link into a fetchable url. A link starting with "http"
 // is returned unchanged; anything else is treated as relative and appended
 // to the crawler's root url.
 func (c *Crawler) FormatUrl(url string) string {
 	if strings.HasPrefix(url, "http") {
 		return url
 	}
 	// When the root already ends in "/", drop the link's leading "/" so a
 	// root-relative link such as "/doc/" does not produce "host//doc/".
 	if strings.HasSuffix(c.rootUrl, "/") {
 		url = strings.TrimPrefix(url, "/")
 	}
 	return c.rootUrl + url
 }
 // CreateDirIfNotExist creates dir, including any missing parents, with mode
 // 0755 when os.Stat reports that it does not exist, and returns the creation
 // error if there is one. Every other Stat outcome is treated as success and
 // returns nil - that includes dir naming an existing non-directory and Stat
 // failing for a reason other than "does not exist".
 func CreateDirIfNotExist(dir string) error {
 	if _, statErr := os.Stat(dir); !os.IsNotExist(statErr) {
 		// Something is already there (or Stat failed differently): leave it
 		return nil
 	}
 	return os.MkdirAll(dir, 0755)
 }
 // WriteFile announces the target on stdout and then writes d to filename
 // with mode 0664, returning the error from ioutil.WriteFile.
 func WriteFile(d string, filename string) error {
 	fmt.Printf("Writing %s\n", filename)
 	return ioutil.WriteFile(filename, []byte(d), 0664)
 }