gofmt and some error checking

This commit is contained in:
Brian Buller 2016-01-21 12:14:09 -06:00
parent 197750425d
commit fe66451056
3 changed files with 77 additions and 60 deletions

0
.gitignore vendored Executable file → Normal file
View File

0
LICENSE Executable file → Normal file
View File

View File

@ -3,20 +3,28 @@ package main
import ( import (
"bytes" "bytes"
"fmt" "fmt"
"golang.org/x/net/html"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"os" "os"
"strconv" "strconv"
"strings" "strings"
"time" "time"
"golang.org/x/net/html"
) )
const PROGRAM_NAME = "fullscrape" const programName = "fullscrape"
func main() { func main() {
req_url := os.Args[1] //"http://golang.org/" if len(os.Args) <= 1 {
out_dir := os.Args[2] printUsage()
os.Exit(1)
}
reqURL := os.Args[1] //"http://golang.org/"
outDir := ""
if len(os.Args) > 2 {
outDir = os.Args[2]
}
depthFlag := -1 depthFlag := -1
norewriteFlag := false norewriteFlag := false
crossdomainFlag := false crossdomainFlag := false
@ -58,28 +66,28 @@ func main() {
} }
} }
if err := CreateDirIfNotExist(out_dir); err != nil { if err := CreateDirIfNotExist(outDir); err != nil {
fmt.Print("Unable to create initial directory %s\n", out_dir) fmt.Print("Unable to create initial directory %s\n", outDir)
fmt.Print("Error: %s\n", err) fmt.Print("Error: %s\n", err)
os.Exit(1) os.Exit(1)
} }
c := new(Crawler) c := new(Crawler)
// Make sure we have the protocol // Make sure we have the protocol
if strings.Index(req_url, "http") != 0 { if strings.Index(reqURL, "http") != 0 {
req_url = "http://" + req_url reqURL = "http://" + reqURL
} }
if !strings.HasSuffix(req_url, "/") { if !strings.HasSuffix(reqURL, "/") {
req_url = req_url + "/" reqURL = reqURL + "/"
} }
c.rootUrl = req_url c.rootURL = reqURL
c.outDir = out_dir c.outDir = outDir
c.fixUrls = norewriteFlag c.fixUrls = norewriteFlag
c.xDomain = crossdomainFlag c.xDomain = crossdomainFlag
c.depth = depthFlag c.depth = depthFlag
c.throttle = time.Duration(throttleFlag) c.throttle = time.Duration(throttleFlag)
// Parse out the Domain and TLD // Parse out the Domain and TLD
c.domain, c.tld, err = c.ParseUrl(c.rootUrl) c.domain, c.tld, err = c.ParseURL(c.rootURL)
if err != nil { if err != nil {
fmt.Print(err) fmt.Print(err)
os.Exit(1) os.Exit(1)
@ -93,8 +101,9 @@ type unprocessed struct {
url []string url []string
} }
// Crawler crawls
type Crawler struct { type Crawler struct {
rootUrl string rootURL string
outDir string outDir string
fixUrls bool fixUrls bool
xDomain bool xDomain bool
@ -104,18 +113,19 @@ type Crawler struct {
tld string tld string
} }
// Crawl tells the crawler to start crawlin'
func (c *Crawler) Crawl() { func (c *Crawler) Crawl() {
if c.depth >= 0 { if c.depth >= 0 {
fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle) fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
} else { } else {
fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle) fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
} }
// Setup channel for inputs to be processed // Setup channel for inputs to be processed
up := make(chan unprocessed, 0) up := make(chan unprocessed, 0)
// Kick off processing and count how many pages are left to process // Kick off processing and count how many pages are left to process
go c.getPage(c.rootUrl, c.depth, up) go c.getPage(c.rootURL, c.depth, up)
outstanding := 1 outstanding := 1
visited := make(map[string]bool) visited := make(map[string]bool)
@ -172,8 +182,9 @@ func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
r <- unprocessed{depth - 1, urls} r <- unprocessed{depth - 1, urls}
} }
// Fetch initiates a page get
func (c *Crawler) Fetch(url string) (string, []string, error) { func (c *Crawler) Fetch(url string) (string, []string, error) {
urls := make([]string, 0) var urls []string
// Ok, go get URL // Ok, go get URL
response, err := http.Get(url) response, err := http.Get(url)
if err != nil || response.StatusCode != 200 { if err != nil || response.StatusCode != 200 {
@ -183,31 +194,31 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
response.Body.Close() response.Body.Close()
// Save the body to the appropriate directory // Save the body to the appropriate directory
save_file := strings.TrimPrefix(url, c.rootUrl) saveFile := strings.TrimPrefix(url, c.rootURL)
if save_file == "" { if saveFile == "" {
save_file = "index.html" saveFile = "index.html"
} else { } else {
save_arr := strings.Split(save_file, "/") saveArr := strings.Split(saveFile, "/")
save_dir := strings.Join(save_arr[:len(save_arr)-1], "/") saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
if strings.Index(save_dir, "/") != 0 { if strings.Index(saveDir, "/") != 0 {
save_dir = "/" + save_dir saveDir = "/" + saveDir
} }
save_dir = c.outDir + save_dir saveDir = c.outDir + saveDir
if len(save_arr) > 0 { if len(saveArr) > 0 {
if err = CreateDirIfNotExist(save_dir); err != nil { if err = CreateDirIfNotExist(saveDir); err != nil {
fmt.Printf("PANIC: Unable to create directory %s\n", save_dir) fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
fmt.Printf("Error: %s\n", err) fmt.Printf("Error: %s\n", err)
os.Exit(1) os.Exit(1)
} }
} }
} }
WriteFile(string(body), c.outDir+"/"+save_file) WriteFile(string(body), c.outDir+"/"+saveFile)
// Read the body into a buffer // Read the body into a buffer
bd_reader := bytes.NewReader(body) bdReader := bytes.NewReader(body)
z := html.NewTokenizer(bd_reader) z := html.NewTokenizer(bdReader)
tt := z.Next() tt := z.Next()
// Is this an HTML file? // Is this an HTML file?
if tt != html.DoctypeToken { if tt != html.DoctypeToken {
@ -223,10 +234,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
if t.Data == "link" || t.Data == "a" { if t.Data == "link" || t.Data == "a" {
for _, a := range t.Attr { for _, a := range t.Attr {
if a.Key == "href" { if a.Key == "href" {
if c.CheckUrl(a.Val) { if c.CheckURL(a.Val) {
urls = append(urls, c.FormatUrl(a.Val)) urls = append(urls, c.FormatURL(a.Val))
} else { } else {
fmt.Printf("CheckUrl Failed For: %s\n", a.Val) fmt.Printf("CheckURL Failed For: %s\n", a.Val)
} }
break break
} }
@ -234,10 +245,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
} else if t.Data == "img" || t.Data == "script" { } else if t.Data == "img" || t.Data == "script" {
for _, a := range t.Attr { for _, a := range t.Attr {
if a.Key == "src" { if a.Key == "src" {
if c.CheckUrl(a.Val) { if c.CheckURL(a.Val) {
urls = append(urls, c.FormatUrl(a.Val)) urls = append(urls, c.FormatURL(a.Val))
} else { } else {
fmt.Printf("CheckUrl Failed For: %s\n", a.Val) fmt.Printf("CheckURL Failed For: %s\n", a.Val)
} }
break break
} }
@ -256,22 +267,20 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
return string(body), urls, nil return string(body), urls, nil
} }
/* ParseUrl parses a URL and returns its Domain and TLD // ParseURL parses a URL and returns its Domain and TLD
* (An error is returned if it can't...) func (c *Crawler) ParseURL(url string) (string, string, error) {
*/ partSlice := strings.Split(url, "//")
func (c *Crawler) ParseUrl(url string) (string, string, error) { server := partSlice[1]
part_slice := strings.Split(url, "//") partSlice = strings.Split(server, "/")
server := part_slice[1] server = partSlice[0]
part_slice = strings.Split(server, "/") partSlice = strings.Split(server, ".")
server = part_slice[0]
part_slice = strings.Split(server, ".")
var tld, domain string var tld, domain string
// We just want the last two parts of the slice // We just want the last two parts of the slice
if len(part_slice) >= 2 { if len(partSlice) >= 2 {
part_slice = part_slice[len(part_slice)-2:] partSlice = partSlice[len(partSlice)-2:]
domain = part_slice[0] domain = partSlice[0]
tld = part_slice[1] tld = partSlice[1]
} else { } else {
// There aren't two parts ot the URL?! That's not right... // There aren't two parts ot the URL?! That's not right...
return "", "", fmt.Errorf("Invalid URL Given: %s\n", url) return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
@ -279,10 +288,9 @@ func (c *Crawler) ParseUrl(url string) (string, string, error) {
return domain, tld, nil return domain, tld, nil
} }
/* CheckUrl checks if we should follow the url or not // CheckURL checks if we should follow the url or not
* The main purpose is for cross-domain checks // The main purpose is for cross-domain checks
*/ func (c *Crawler) CheckURL(url string) bool {
func (c *Crawler) CheckUrl(url string) bool {
// Ignore anchor urls // Ignore anchor urls
if strings.IndexRune(url, '#') >= 0 { if strings.IndexRune(url, '#') >= 0 {
return false return false
@ -294,27 +302,31 @@ func (c *Crawler) CheckUrl(url string) bool {
if !c.xDomain { if !c.xDomain {
if strings.HasPrefix(url, "http") { if strings.HasPrefix(url, "http") {
// Make sure that this url is in the same domain // Make sure that this url is in the same domain
tst_dmn, tst_tld, err := c.ParseUrl(url) tstDmn, tstTld, err := c.ParseURL(url)
if err != nil { if err != nil {
// Error parsing the Domain/TLD out of the URL... // Error parsing the Domain/TLD out of the URL...
return false return false
} }
return (tst_dmn == c.domain && tst_tld == c.tld) return (tstDmn == c.domain && tstTld == c.tld)
} }
} }
return true return true
} }
func (c *Crawler) FormatUrl(url string) string { // FormatURL takes a url and, unless it starts with "http"
// appends it to the end of c.rootURL
func (c *Crawler) FormatURL(url string) string {
// If the URL doesn't start with http, then it should be relative // If the URL doesn't start with http, then it should be relative
if strings.Index(url, "http") != 0 { if strings.Index(url, "http") != 0 {
url = c.rootUrl + url url = c.rootURL + url
} }
return url return url
} }
// CreateDirIfNotExist Checks if directory 'dir' exists
// If it doesn't, it creates it.
func CreateDirIfNotExist(dir string) error { func CreateDirIfNotExist(dir string) error {
// Check if out_dir exists, if not, try to make it // Check if outDir exists, if not, try to make it
_, err := os.Stat(dir) _, err := os.Stat(dir)
if os.IsNotExist(err) { if os.IsNotExist(err) {
// Doesn't exist, try to create // Doesn't exist, try to create
@ -326,8 +338,13 @@ func CreateDirIfNotExist(dir string) error {
return nil return nil
} }
// WriteFile writes the data 'd' to 'filename'
func WriteFile(d string, filename string) error { func WriteFile(d string, filename string) error {
do := []byte(d) do := []byte(d)
//fmt.Printf("Writing %s\n", filename) //fmt.Printf("Writing %s\n", filename)
return ioutil.WriteFile(filename, do, 0664) return ioutil.WriteFile(filename, do, 0664)
}
// printUsage writes a brief usage summary to stdout.
// main requires the target URL as the first argument; the second,
// optional argument is the directory to write scraped files into.
func printUsage() {
	fmt.Println("Usage: fullscrape <url> [output-dir]")
}