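// Command fullscrape crawls a site starting from a root URL and saves the
// pages it finds under an output directory.
//
// Arguments, as parsed in main below: a root URL, an optional output
// directory, and an optional flag string in which 'd' sets the crawl depth,
// 'n' sets the no-rewrite flag, 'x' allows cross-domain links, and 't' sets
// the throttle between requests in milliseconds.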
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"time"

	"golang.org/x/net/html"
)

const programName = "fullscrape"

func main() {
	if len(os.Args) <= 1 {
		printUsage()
		os.Exit(1)
	}
	reqURL := os.Args[1] //"http://golang.org/"
	outDir := ""
	if len(os.Args) > 2 {
		outDir = os.Args[2]
	}
	depthFlag := -1
	norewriteFlag := false
	crossdomainFlag := false
	throttleFlag := 1000
	var err error

	if len(os.Args) > 3 {
		tst := os.Args[3]
		depthArg := strings.IndexRune(tst, 'd')
		if depthArg >= 0 {
			// The actual depth value should either be at depthArg+1
			// or, if that is '=', at depthArg+2
			if tst[depthArg+1] == '=' {
				depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+2])
			} else {
				depthFlag, err = strconv.Atoi(strings.Split(tst, "")[depthArg+1])
			}
			if err != nil {
				fmt.Printf("Invalid depth given (must be an integer): %s\n", tst)
				os.Exit(1)
			}
		}
		norewriteFlag = (strings.IndexRune(tst, 'n') >= 0)
		crossdomainFlag = (strings.IndexRune(tst, 'x') >= 0)
		throttleArg := strings.IndexRune(tst, 't')
		if throttleArg >= 0 {
			// The actual throttle value should either be at throttleArg+1
			// or, if that is '=', at throttleArg+2
			if tst[throttleArg+1] == '=' {
				// The throttle argument MUST have a space after it
				throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+2])
			} else {
				throttleFlag, err = strconv.Atoi(strings.Split(tst, "")[throttleArg+1])
			}
			if err != nil {
				fmt.Printf("Invalid throttle given (must be milliseconds as an integer): %s\n", tst)
				os.Exit(1)
			}
		}
	}

	if err := CreateDirIfNotExist(outDir); err != nil {
		fmt.Printf("Unable to create initial directory %s\n", outDir)
		fmt.Printf("Error: %s\n", err)
		os.Exit(1)
	}

	c := new(Crawler)
	// Make sure we have the protocol
	if strings.Index(reqURL, "http") != 0 {
		reqURL = "http://" + reqURL
	}
	if !strings.HasSuffix(reqURL, "/") {
		reqURL = reqURL + "/"
	}
	c.rootURL = reqURL
	c.outDir = outDir
	c.fixUrls = norewriteFlag
	c.xDomain = crossdomainFlag
	c.depth = depthFlag
	c.throttle = time.Duration(throttleFlag)
	// Parse out the Domain and TLD
	c.domain, c.tld, err = c.ParseURL(c.rootURL)
	if err != nil {
		fmt.Print(err)
		os.Exit(1)
	}

	c.Crawl()
}
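
// unprocessed carries a batch of links pulled from one page, together with
// the crawl depth remaining when they are visited.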
type unprocessed struct {
	depth int
	url   []string
}

// Crawler crawls
type Crawler struct {
	rootURL  string
	outDir   string
	fixUrls  bool
	xDomain  bool
	depth    int
	throttle time.Duration
	domain   string
	tld      string
}

// Crawl tells the crawler to start crawlin'
func (c *Crawler) Crawl() {
	if c.depth >= 0 {
		fmt.Printf("Processing %s with depth %d (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.depth, c.fixUrls, c.xDomain, c.throttle)
	} else {
		fmt.Printf("Processing %s (Norewrite: %t, XDomain: %t, Throttle: %d)\n", c.rootURL, c.fixUrls, c.xDomain, c.throttle)
	}

	// Setup channel for inputs to be processed
	up := make(chan unprocessed, 0)

	// Kick off processing and count how many pages are left to process
	go c.getPage(c.rootURL, c.depth, up)
	outstanding := 1

	visited := make(map[string]bool)

	status := fmt.Sprintf("Files %d/%d\n", len(visited), outstanding+len(visited))
	for outstanding > 0 {
		done := len(visited) - outstanding
		if done < 0 {
			done = 0
		}
		status = fmt.Sprintf("Files %d/%d\n", done, len(visited))
		fmt.Print(status)
		if c.throttle > 0 {
			time.Sleep(time.Millisecond * c.throttle)
		}
		// Pop a visit from the channel
		next := <-up
		outstanding--

		// If we're too deep, skip it
		if next.depth == 0 {
			continue
		}

		// Loop over all urls to visit from that page
		for _, link := range next.url {
			// Check that we haven't visited them before
			if visited[link] {
				continue
			}

			// All good to visit them
			outstanding++
			visited[link] = true
			go c.getPage(link, next.depth, up)
		}
	}
	status = fmt.Sprintf("Files %d/%d\n", len(visited), len(visited))
	fmt.Printf("%s\n", status)
}
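
// getPage fetches a single page and reports the URLs found on it, along with
// the remaining depth, back to the crawl loop on r.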
func (c *Crawler) getPage(url string, depth int, r chan unprocessed) {
	_, urls, err := c.Fetch(url)
	//body, urls, err := c.Fetch(url)
	fmt.Printf("Found: %s\n", url)
	if err != nil {
		fmt.Println(err)
	}

	//fmt.Printf("Pulled URLS: %s\n", urls)

	r <- unprocessed{depth - 1, urls}
}

// Fetch initiates a page get
func (c *Crawler) Fetch(url string) (string, []string, error) {
	var urls []string
	// Ok, go get URL
	response, err := http.Get(url)
	if err != nil || response.StatusCode != 200 {
		return "", nil, err
	}
	body, err := ioutil.ReadAll(response.Body)
	response.Body.Close()
	if err != nil {
		return "", nil, err
	}

	// Save the body to the appropriate directory
	saveFile := strings.TrimPrefix(url, c.rootURL)
	if saveFile == "" {
		saveFile = "index.html"
	} else {
		saveArr := strings.Split(saveFile, "/")
		saveDir := strings.Join(saveArr[:len(saveArr)-1], "/")
		if strings.Index(saveDir, "/") != 0 {
			saveDir = "/" + saveDir
		}
		saveDir = c.outDir + saveDir

		if len(saveArr) > 0 {
			if err = CreateDirIfNotExist(saveDir); err != nil {
				fmt.Printf("PANIC: Unable to create directory %s\n", saveDir)
				fmt.Printf("Error: %s\n", err)
				os.Exit(1)
			}
		}
	}

	WriteFile(string(body), c.outDir+"/"+saveFile)

	fmt.Println("Parsing " + c.outDir + saveFile)

	// Read the body into a buffer
	bdReader := bytes.NewReader(body)
	z := html.NewTokenizer(bdReader)
	tt := z.Next()
	// Is this an HTML file?
	switch tt {
	case html.DoctypeToken:
		urls = append(urls, c.ParseHTML(body)...)
	case html.TextToken:
		parsedURLs := c.ParseText(body)
		fmt.Println("Found urls in text file: ")
		// Find file directory
		urlLoc := saveFile[:strings.LastIndex(saveFile, "/")+1]
		for i := range parsedURLs {
			if parsedURLs[i][0] == '/' {
				parsedURLs[i] = c.FormatURL(parsedURLs[i][1:])
			} else if parsedURLs[i][0] == '.' {
				parsedURLs[i] = c.FormatURL(urlLoc + parsedURLs[i])
			}
		}
		urls = append(urls, parsedURLs...)
	}

	return string(body), urls, nil
}

// ParseHTML parses an html tokenizer and returns a list of urls
func (c *Crawler) ParseHTML(bd []byte) []string {
	var urls []string
	bdReader := bytes.NewReader(bd)
	z := html.NewTokenizer(bdReader)
	tt := z.Next()
	for {
		tt = z.Next()
		switch {
		case tt == html.StartTagToken || tt == html.SelfClosingTagToken:
			t := z.Token()
			if t.Data == "link" || t.Data == "a" {
				for _, a := range t.Attr {
					if a.Key == "href" {
						if c.CheckURL(a.Val) {
							urls = append(urls, c.FormatURL(a.Val))
						} else {
							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
						}
						break
					}
				}
			} else if t.Data == "img" || t.Data == "script" {
				for _, a := range t.Attr {
					if a.Key == "src" {
						if c.CheckURL(a.Val) {
							urls = append(urls, c.FormatURL(a.Val))
						} else {
							fmt.Printf("CheckURL Failed For: %s\n", a.Val)
						}
						break
					}
				}
			}
		}
		if tt == html.ErrorToken {
			break
		}
	}
	return urls
}

// ParseText parses a text file and returns a list of urls
func (c *Crawler) ParseText(bd []byte) []string {
	var cssURLs = regexp.MustCompile(`url\(([^\)]*)\)`)
	var urls []string
	matches := cssURLs.FindAllSubmatch(bd, -1)
	for _, v := range matches {
		urls = append(urls, string(v[1]))
	}
	for i := range urls {
		if urls[i][0] == '\'' || urls[i][0] == '"' {
			urls[i] = urls[i][1 : len(urls[i])-1]
		}
	}
	return urls
}

// ParseURL parses a URL and returns its Domain and TLD
func (c *Crawler) ParseURL(url string) (string, string, error) {
	partSlice := strings.Split(url, "//")
	server := partSlice[1]
	partSlice = strings.Split(server, "/")
	server = partSlice[0]
	partSlice = strings.Split(server, ".")
	var tld, domain string

	// We just want the last two parts of the slice
	if len(partSlice) >= 2 {
		partSlice = partSlice[len(partSlice)-2:]
		domain = partSlice[0]
		tld = partSlice[1]
	} else {
		// There aren't two parts of the URL?! That's not right...
		return "", "", fmt.Errorf("Invalid URL Given: %s\n", url)
	}
	return domain, tld, nil
}

// CheckURL checks if we should follow the url or not
// The main purpose is for cross-domain checks
func (c *Crawler) CheckURL(url string) bool {
	// Ignore anchor urls
	if strings.IndexRune(url, '#') >= 0 {
		return false
	}
	// Ignore "mailto" links
	if strings.HasPrefix(url, "mailto:") {
		return false
	}
	if !c.xDomain {
		if strings.HasPrefix(url, "http") {
			// Make sure that this url is in the same domain
			tstDmn, tstTld, err := c.ParseURL(url)
			if err != nil {
				// Error parsing the Domain/TLD out of the URL...
				return false
			}
			return (tstDmn == c.domain && tstTld == c.tld)
		}
	}
	return true
}

// FormatURL takes a url and, unless it starts with "http",
// appends it to the end of c.rootURL
func (c *Crawler) FormatURL(url string) string {
	// If the URL doesn't start with http, then it should be relative
	if strings.Index(url, "http") != 0 {
		url = c.rootURL + url
	}
	return url
}

// CreateDirIfNotExist checks if directory 'dir' exists.
// If it doesn't, it creates it.
func CreateDirIfNotExist(dir string) error {
	// Check if outDir exists, if not, try to make it
	_, err := os.Stat(dir)
	if os.IsNotExist(err) {
		// Doesn't exist, try to create
		if err = os.MkdirAll(dir, 0755); err != nil {
			return err
		}
	}
	// Directory exists, just return
	return nil
}

// WriteFile writes the data 'd' to 'filename'
func WriteFile(d string, filename string) error {
	do := []byte(d)
	//fmt.Printf("Writing %s\n", filename)
	return ioutil.WriteFile(filename, do, 0664)
}
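
// printUsage prints a short usage summary for the command.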
func printUsage() {
	fmt.Println("Usage: ...")
}