gofmt and some error checking
This commit is contained in:
parent
197750425d
commit
fe66451056
0
.gitignore
vendored
Executable file → Normal file
0
.gitignore
vendored
Executable file → Normal file
137
fullscrape.go
137
fullscrape.go
@ -3,20 +3,28 @@ package main
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"golang.org/x/net/html"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
const PROGRAM_NAME = "fullscrape"
|
||||
const programName = "fullscrape"
|
||||
|
||||
func main() {
|
||||
req_url := os.Args[1] //"http://golang.org/"
|
||||
out_dir := os.Args[2]
|
||||
if len(os.Args) <= 1 {
|
||||
printUsage()
|
||||
os.Exit(1)
|
||||
}
|
||||
reqURL := os.Args[1] //"http://golang.org/"
|
||||
outDir := ""
|
||||
if len(os.Args) > 2 {
|
||||
outDir = os.Args[2]
|
||||
}
|
||||
depthFlag := -1
|
||||
norewriteFlag := false
|
||||
crossdomainFlag := false
|
||||
@ -58,28 +66,28 @@ func main() {
|
||||
}
|
||||
}
|
||||
|
||||
if err := CreateDirIfNotExist(out_dir); err != nil {
|
||||
fmt.Print("Unable to create initial directory %s\n", out_dir)
|
||||
if err := CreateDirIfNotExist(outDir); err != nil {
|
||||
fmt.Print("Unable to create initial directory %s\n", outDir)
|
||||
fmt.Print("Error: %s\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
c := new(Crawler)
|
||||
// Make sure we have the protocol
|
||||
if strings.Index(req_url, "http") != 0 {
|
||||
req_url = "http://" + req_url
|
||||
if strings.Index(reqURL, "http") != 0 {
|
||||
reqURL = "http://" + reqURL
|
||||
}
|
||||
if !strings.HasSuffix(req_url, "/") {
|
||||
req_url = req_url + "/"
|
||||
if !strings.HasSuffix(reqURL, "/") {
|
||||
reqURL = reqURL + "/"
|
||||
}
|
||||
c.rootUrl = req_url
|
||||
c.outDir = out_dir
|
||||
c.rootURL = reqURL
|
||||
c.outDir = outDir
|
||||
c.fixUrls = norewriteFlag
|
||||
c.xDomain = crossdomainFlag
|
||||
c.depth = depthFlag
|
||||
c.throttle = time.Duration(throttleFlag)
|
||||
// Parse out the Domain and TLD
|
||||
c.domain, c.tld, err = c.ParseUrl(c.rootUrl)
|
||||
c.domain, c.tld, err = c.ParseURL(c.rootURL)
|
||||
if err != nil {
|
||||
fmt.Print(err)
|
||||
os.Exit(1)
|
||||
@ -93,8 +101,9 @@ type unprocessed struct {
|
||||
url []string
|
||||
}
|
||||
|
||||
// Crawler crawls
|
||||
type Crawler struct {
|
||||
rootUrl string
|
||||
rootURL string
|
||||
outDir string
|
||||
fixUrls bool
|
||||
xDomain bool
|
||||
@ -104,18 +113,19 @@ type Crawler struct {
|
||||
tld string
|
||||
}
|
||||
|
||||
// Crawl tells the crawler to start crawlin'
|
||||
func (c *Crawler) Crawl() {
|
||||
if c.depth >= 0 {
|
||||
fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.depth, c.fixUrls, c.xDomain, c.throttle)
|
||||
fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
|
||||
} else {
|
||||
fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootUrl, c.fixUrls, c.xDomain, c.throttle)
|
||||
fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
|
||||
}
|
||||
|
||||
// Setup channel for inputs to be processed
|
||||
up := make(chan unprocessed, 0)
|
||||
|
||||
// Kick off processing and count how many pages are left to process
|
||||
go c.getPage(c.rootUrl, c.depth, up)
|
||||
go c.getPage(c.rootURL, c.depth, up)
|
||||
outstanding := 1
|
||||
|
||||
visited := make(map[string]bool)
|
||||
@ -172,8 +182,9 @@ func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
|
||||
r <- unprocessed{depth - 1, urls}
|
||||
}
|
||||
|
||||
// Fetch initiates a page get
|
||||
func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
urls := make([]string, 0)
|
||||
var urls []string
|
||||
// Ok, go get URL
|
||||
response, err := http.Get(url)
|
||||
if err != nil || response.StatusCode != 200 {
|
||||
@ -183,31 +194,31 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
response.Body.Close()
|
||||
|
||||
// Save the body to the appropriate directory
|
||||
save_file := strings.TrimPrefix(url, c.rootUrl)
|
||||
if save_file == "" {
|
||||
save_file = "index.html"
|
||||
saveFile := strings.TrimPrefix(url, c.rootURL)
|
||||
if saveFile == "" {
|
||||
saveFile = "index.html"
|
||||
} else {
|
||||
save_arr := strings.Split(save_file, "/")
|
||||
save_dir := strings.Join(save_arr[:len(save_arr)-1], "/")
|
||||
if strings.Index(save_dir, "/") != 0 {
|
||||
save_dir = "/" + save_dir
|
||||
saveArr := strings.Split(saveFile, "/")
|
||||
saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
|
||||
if strings.Index(saveDir, "/") != 0 {
|
||||
saveDir = "/" + saveDir
|
||||
}
|
||||
save_dir = c.outDir + save_dir
|
||||
saveDir = c.outDir + saveDir
|
||||
|
||||
if len(save_arr) > 0 {
|
||||
if err = CreateDirIfNotExist(save_dir); err != nil {
|
||||
fmt.Printf("PANIC: Unable to create directory %s\n", save_dir)
|
||||
if len(saveArr) > 0 {
|
||||
if err = CreateDirIfNotExist(saveDir); err != nil {
|
||||
fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
|
||||
fmt.Printf("Error: %s\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
WriteFile(string(body), c.outDir+"/"+save_file)
|
||||
WriteFile(string(body), c.outDir+"/"+saveFile)
|
||||
|
||||
// Read the body into a buffer
|
||||
bd_reader := bytes.NewReader(body)
|
||||
z := html.NewTokenizer(bd_reader)
|
||||
bdReader := bytes.NewReader(body)
|
||||
z := html.NewTokenizer(bdReader)
|
||||
tt := z.Next()
|
||||
// Is this an HTML file?
|
||||
if tt != html.DoctypeToken {
|
||||
@ -223,10 +234,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
if t.Data == "link" || t.Data == "a" {
|
||||
for _, a := range t.Attr {
|
||||
if a.Key == "href" {
|
||||
if c.CheckUrl(a.Val) {
|
||||
urls = append(urls, c.FormatUrl(a.Val))
|
||||
if c.CheckURL(a.Val) {
|
||||
urls = append(urls, c.FormatURL(a.Val))
|
||||
} else {
|
||||
fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
|
||||
fmt.Printf("CheckURL Failed For: %s\n", a.Val)
|
||||
}
|
||||
break
|
||||
}
|
||||
@ -234,10 +245,10 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
} else if t.Data == "img" || t.Data == "script" {
|
||||
for _, a := range t.Attr {
|
||||
if a.Key == "src" {
|
||||
if c.CheckUrl(a.Val) {
|
||||
urls = append(urls, c.FormatUrl(a.Val))
|
||||
if c.CheckURL(a.Val) {
|
||||
urls = append(urls, c.FormatURL(a.Val))
|
||||
} else {
|
||||
fmt.Printf("CheckUrl Failed For: %s\n", a.Val)
|
||||
fmt.Printf("CheckURL Failed For: %s\n", a.Val)
|
||||
}
|
||||
break
|
||||
}
|
||||
@ -256,22 +267,20 @@ func (c *Crawler) Fetch(url string) (string, []string, error) {
|
||||
return string(body), urls, nil
|
||||
}
|
||||
|
||||
/* ParseUrl parses a URL and returns its Domain and TLD
|
||||
* (An error is returned if it can't...)
|
||||
*/
|
||||
func (c *Crawler) ParseUrl(url string) (string, string, error) {
|
||||
part_slice := strings.Split(url, "//")
|
||||
server := part_slice[1]
|
||||
part_slice = strings.Split(server, "/")
|
||||
server = part_slice[0]
|
||||
part_slice = strings.Split(server, ".")
|
||||
// ParseURL parses a URL and returns its Domain and TLD
|
||||
func (c *Crawler) ParseURL(url string) (string, string, error) {
|
||||
partSlice := strings.Split(url, "//")
|
||||
server := partSlice[1]
|
||||
partSlice = strings.Split(server, "/")
|
||||
server = partSlice[0]
|
||||
partSlice = strings.Split(server, ".")
|
||||
var tld, domain string
|
||||
|
||||
// We just want the last two parts of the slice
|
||||
if len(part_slice) >= 2 {
|
||||
part_slice = part_slice[len(part_slice)-2:]
|
||||
domain = part_slice[0]
|
||||
tld = part_slice[1]
|
||||
if len(partSlice) >= 2 {
|
||||
partSlice = partSlice[len(partSlice)-2:]
|
||||
domain = partSlice[0]
|
||||
tld = partSlice[1]
|
||||
} else {
|
||||
// There aren't two parts to the URL?! That's not right...
|
||||
return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
|
||||
@ -279,10 +288,9 @@ func (c *Crawler) ParseUrl(url string) (string, string, error) {
|
||||
return domain, tld, nil
|
||||
}
|
||||
|
||||
/* CheckUrl checks if we should follow the url or not
|
||||
* The main purpose is for cross-domain checks
|
||||
*/
|
||||
func (c *Crawler) CheckUrl(url string) bool {
|
||||
// CheckURL checks if we should follow the url or not
|
||||
// The main purpose is for cross-domain checks
|
||||
func (c *Crawler) CheckURL(url string) bool {
|
||||
// Ignore anchor urls
|
||||
if strings.IndexRune(url, '#') >= 0 {
|
||||
return false
|
||||
@ -294,27 +302,31 @@ func (c *Crawler) CheckUrl(url string) bool {
|
||||
if !c.xDomain {
|
||||
if strings.HasPrefix(url, "http") {
|
||||
// Make sure that this url is in the same domain
|
||||
tst_dmn, tst_tld, err := c.ParseUrl(url)
|
||||
tstDmn, tstTld, err := c.ParseURL(url)
|
||||
if err != nil {
|
||||
// Error parsing the Domain/TLD out of the URL...
|
||||
return false
|
||||
}
|
||||
return (tst_dmn == c.domain && tst_tld == c.tld)
|
||||
return (tstDmn == c.domain && tstTld == c.tld)
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (c *Crawler) FormatUrl(url string) string {
|
||||
// FormatURL takes a url and, unless it starts with "http"
|
||||
// appends it to the end of c.rootURL
|
||||
func (c *Crawler) FormatURL(url string) string {
|
||||
// If the URL doesn't start with http, then it should be relative
|
||||
if strings.Index(url, "http") != 0 {
|
||||
url = c.rootUrl + url
|
||||
url = c.rootURL + url
|
||||
}
|
||||
return url
|
||||
}
|
||||
|
||||
// CreateDirIfNotExist Checks if directory 'dir' exists
|
||||
// If it doesn't, it creates it.
|
||||
func CreateDirIfNotExist(dir string) error {
|
||||
// Check if out_dir exists, if not, try to make it
|
||||
// Check if outDir exists, if not, try to make it
|
||||
_, err := os.Stat(dir)
|
||||
if os.IsNotExist(err) {
|
||||
// Doesn't exist, try to create
|
||||
@ -326,8 +338,13 @@ func CreateDirIfNotExist(dir string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// WriteFile writes the data 'd' to 'filename'
|
||||
func WriteFile(d string, filename string) error {
|
||||
do := []byte(d)
|
||||
//fmt.Printf("Writing %s\n", filename)
|
||||
return ioutil.WriteFile(filename, do, 0664)
|
||||
}
|
||||
|
||||
func printUsage() {
|
||||
fmt.Println("Usage: ...")
|
||||
}
|
Loading…
Reference in New Issue
Block a user