fullscrape/fullscrape.go

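// Command fullscrape crawls a site starting from a root URL and saves the
// pages, scripts, and images it finds into an output directory, optionally
// limited by depth and throttled between requests.
//
// Example invocation (the third argument is a single flag string):
//
//	fullscrape example.com ./out d=2x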
package main
import (
    "bytes"
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "strconv"
    "strings"
    "time"

    "golang.org/x/net/html"
)
const programName = "fullscrape"
func main() {
    if len(os.Args) <= 1 {
        printUsage()
        os.Exit(1)
    }
    reqURL := os.Args[1] //"http://golang.org/"
    outDir := ""
    if len(os.Args) > 2 {
        outDir = os.Args[2]
    }
    depthFlag := -1
    norewriteFlag := false
    crossdomainFlag := false
    throttleFlag := 1000
    var err error
    if len(os.Args) > 3 {
        tst := os.Args[3]
        depthArg := strings.IndexRune(tst, 'd')
        if depthArg >= 0 {
            // The depth value starts at depthArg+1, or at depthArg+2 if the
            // next character is '='; read every digit that follows so
            // multi-digit depths parse correctly.
            valStart := depthArg + 1
            if valStart < len(tst) && tst[valStart] == '=' {
                valStart++
            }
            valEnd := valStart
            for valEnd < len(tst) && tst[valEnd] >= '0' && tst[valEnd] <= '9' {
                valEnd++
            }
            depthFlag, err = strconv.Atoi(tst[valStart:valEnd])
            if err != nil {
                fmt.Printf("Invalid depth given (must be an integer): %s\n", tst)
                os.Exit(1)
            }
        }
        norewriteFlag = (strings.IndexRune(tst, 'n') >= 0)
        crossdomainFlag = (strings.IndexRune(tst, 'x') >= 0)
        throttleArg := strings.IndexRune(tst, 't')
        if throttleArg >= 0 {
            // The throttle value starts at throttleArg+1, or at throttleArg+2
            // if the next character is '='.
            valStart := throttleArg + 1
            if valStart < len(tst) && tst[valStart] == '=' {
                valStart++
            }
            valEnd := valStart
            for valEnd < len(tst) && tst[valEnd] >= '0' && tst[valEnd] <= '9' {
                valEnd++
            }
            throttleFlag, err = strconv.Atoi(tst[valStart:valEnd])
            if err != nil {
                fmt.Printf("Invalid throttle given (must be milliseconds as an integer): %s\n", tst)
                os.Exit(1)
            }
        }
    }
    if err := CreateDirIfNotExist(outDir); err != nil {
        fmt.Printf("Unable to create initial directory %s\n", outDir)
        fmt.Printf("Error: %s\n", err)
        os.Exit(1)
    }
    c := new(Crawler)
    // Make sure we have the protocol
    if strings.Index(reqURL, "http") != 0 {
        reqURL = "http://" + reqURL
    }
    if !strings.HasSuffix(reqURL, "/") {
        reqURL = reqURL + "/"
    }
    c.rootURL = reqURL
    c.outDir = outDir
    c.fixUrls = norewriteFlag
    c.xDomain = crossdomainFlag
    c.depth = depthFlag
    // throttleFlag is in milliseconds; Crawl multiplies by time.Millisecond when sleeping
    c.throttle = time.Duration(throttleFlag)
    // Parse out the Domain and TLD
    c.domain, c.tld, err = c.ParseURL(c.rootURL)
    if err != nil {
        fmt.Print(err)
        os.Exit(1)
    }
    c.Crawl()
}
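// unprocessed carries the links found on one fetched page, along with the
// remaining crawl depth at which they should be followed.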
type unprocessed struct {
    depth int
    url   []string
}
// Crawler holds the configuration and state for crawling a single site.
type Crawler struct {
    rootURL  string
    outDir   string
    fixUrls  bool
    xDomain  bool
    depth    int
    throttle time.Duration
    domain   string
    tld      string
}
// Crawl tells the crawler to start crawlin'
func (c *Crawler) Crawl() {
    if c.depth >= 0 {
        fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
    } else {
        fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
    }
    // Setup channel for inputs to be processed
    up := make(chan unprocessed, 0)
    // Kick off processing and count how many pages are left to process
    go c.getPage(c.rootURL, c.depth, up)
    outstanding := 1
    visited := make(map[string]bool)
    status := fmt.Sprintf("Files %d/%d", len(visited), outstanding+len(visited))
    for outstanding > 0 {
        done := len(visited) - outstanding
        if done < 0 {
            done = 0
        }
        // Backspace over the previous status before printing the new one
        fmt.Print(strings.Repeat("\b", len(status)))
        status = fmt.Sprintf("Files %d/%d", done, len(visited))
        fmt.Print(status)
        if c.throttle > 0 {
            time.Sleep(time.Millisecond * c.throttle)
        }
        // Pop a visit from the channel
        next := <-up
        outstanding--
        // If we're too deep, skip it
        if next.depth == 0 {
            continue
        }
        // Loop over all urls to visit from that page
        for _, link := range next.url {
            // Check that we haven't visited them before
            if visited[link] {
                continue
            }
            // All good to visit them
            outstanding++
            visited[link] = true
            go c.getPage(link, next.depth, up)
        }
    }
    //fmt.Print(strings.Repeat("\b", len(status)))
    status = fmt.Sprintf("Files %d/%d", len(visited), len(visited))
    fmt.Printf("%s\n", status)
}
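// getPage fetches a single page and sends the links found on it (with a
// decremented depth) back on the channel r.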
func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
    _, urls, err := c.Fetch(url)
    //body, urls, err := c.Fetch(url)
    fmt.Printf("Found: %s\n", url)
    if err != nil {
        fmt.Println(err)
    }
    //fmt.Printf("Pulled URLS: %s\n", urls)
    r <- unprocessed{depth - 1, urls}
}
// Fetch initiates a page get
func (c *Crawler) Fetch(url string) (string, []string, error) {
    var urls []string
    // Ok, go get URL
    response, err := http.Get(url)
    if err != nil {
        return "", nil, err
    }
    if response.StatusCode != 200 {
        // Treat a non-200 response as an error instead of silently returning nothing
        response.Body.Close()
        return "", nil, fmt.Errorf("fetching %s returned status %d", url, response.StatusCode)
    }
    body, err := ioutil.ReadAll(response.Body)
    response.Body.Close()
    // Save the body to the appropriate directory
    saveFile := strings.TrimPrefix(url, c.rootURL)
    if saveFile == "" {
        saveFile = "index.html"
    } else {
        saveArr := strings.Split(saveFile, "/")
        saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
        if strings.Index(saveDir, "/") != 0 {
            saveDir = "/" + saveDir
        }
        saveDir = c.outDir + saveDir
        if len(saveArr) > 0 {
            if err = CreateDirIfNotExist(saveDir); err != nil {
                fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
                fmt.Printf("Error: %s\n", err)
                os.Exit(1)
            }
        }
    }
    WriteFile(string(body), c.outDir+"/"+saveFile)
    // Read the body into a buffer
    bdReader := bytes.NewReader(body)
    z := html.NewTokenizer(bdReader)
    tt := z.Next()
    // Is this an HTML file?
    if tt != html.DoctypeToken {
        // Nope, so we're done here
        return string(body), urls, nil
    }
    for {
        tt := z.Next()
        switch {
        case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
            t := z.Token()
            if t.Data == "link" || t.Data == "a" {
                for _, a := range t.Attr {
                    if a.Key == "href" {
                        if c.CheckURL(a.Val) {
                            urls = append(urls, c.FormatURL(a.Val))
                        } else {
                            fmt.Printf("CheckURL Failed For: %s\n", a.Val)
                        }
                        break
                    }
                }
            } else if t.Data == "img" || t.Data == "script" {
                for _, a := range t.Attr {
                    if a.Key == "src" {
                        if c.CheckURL(a.Val) {
                            urls = append(urls, c.FormatURL(a.Val))
                        } else {
                            fmt.Printf("CheckURL Failed For: %s\n", a.Val)
                        }
                        break
                    }
                }
            }
        }
        if tt == html.ErrorToken {
            break
        }
    }
    if err != nil {
        return "", nil, err
    }
    return string(body), urls, nil
}
// ParseURL parses a URL and returns its Domain and TLD
func (c *Crawler) ParseURL(url string) (string, string, error) {
    partSlice := strings.Split(url, "//")
    server := partSlice[1]
    partSlice = strings.Split(server, "/")
    server = partSlice[0]
    partSlice = strings.Split(server, ".")
    var tld, domain string
    // We just want the last two parts of the slice
    if len(partSlice) >= 2 {
        partSlice = partSlice[len(partSlice)-2:]
        domain = partSlice[0]
        tld = partSlice[1]
    } else {
        // There aren't two parts to the URL?! That's not right...
        return "", "", fmt.Errorf("invalid URL given: %s", url)
    }
    return domain, tld, nil
}
// CheckURL checks if we should follow the url or not
// The main purpose is for cross-domain checks
func (c *Crawler) CheckURL(url string) bool {
    // Ignore anchor urls
    if strings.IndexRune(url, '#') >= 0 {
        return false
    }
    // Ignore "mailto" links
    if strings.HasPrefix(url, "mailto:") {
        return false
    }
    if !c.xDomain {
        if strings.HasPrefix(url, "http") {
            // Make sure that this url is in the same domain
            tstDmn, tstTld, err := c.ParseURL(url)
            if err != nil {
                // Error parsing the Domain/TLD out of the URL...
                return false
            }
            return (tstDmn == c.domain && tstTld == c.tld)
        }
    }
    return true
}
// FormatURL takes a url and, unless it starts with "http",
// appends it to the end of c.rootURL
func (c *Crawler) FormatURL(url string) string {
    // If the URL doesn't start with http, then it should be relative
    if strings.Index(url, "http") != 0 {
        url = c.rootURL + url
    }
    return url
}
// CreateDirIfNotExist checks if directory 'dir' exists.
// If it doesn't, it creates it.
func CreateDirIfNotExist(dir string) error {
    // Check if outDir exists, if not, try to make it
    _, err := os.Stat(dir)
    if os.IsNotExist(err) {
        // Doesn't exist, try to create
        if err = os.MkdirAll(dir, 0755); err != nil {
            return err
        }
    }
    // Directory exists, just return
    return nil
}
// WriteFile writes the data 'd' to 'filename'
func WriteFile(d string, filename string) error {
    do := []byte(d)
    //fmt.Printf("Writing %s\n", filename)
    return ioutil.WriteFile(filename, do, 0664)
}
func printUsage() {
    // Usage text reconstructed from the arguments parsed in main()
    fmt.Printf("Usage: %s <url> [output-dir] [flags]\n", programName)
    fmt.Println("Flags (third argument, e.g. \"d=2x\"): d=<n> crawl depth, n norewrite, x cross-domain, t=<ms> throttle in milliseconds")
}