2016-07-28 20:13:20 -07:00
|
|
|
package spider
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2016-11-25 23:48:44 -08:00
|
|
|
"io"
|
2016-07-28 20:13:20 -07:00
|
|
|
"net/http"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
)
|
|
|
|
|
|
|
|
// URLs returns all links on a page
|
2016-11-25 23:48:44 -08:00
|
|
|
func URLs(page io.Reader) ([]string, error) {
|
|
|
|
doc, err := html.Parse(page)
|
2016-07-28 20:13:20 -07:00
|
|
|
if err != nil {
|
2016-11-25 23:48:44 -08:00
|
|
|
return nil, fmt.Errorf("parsing html: %v", err)
|
2016-07-28 20:13:20 -07:00
|
|
|
}
|
2016-11-25 23:48:44 -08:00
|
|
|
paths := []string{}
|
2016-07-28 20:13:20 -07:00
|
|
|
var f func(*html.Node)
|
|
|
|
f = func(n *html.Node) {
|
2016-11-25 23:34:07 -08:00
|
|
|
if n.Type == html.ElementNode {
|
2016-07-28 20:13:20 -07:00
|
|
|
for _, a := range n.Attr {
|
2016-11-25 23:34:07 -08:00
|
|
|
switch a.Key {
|
|
|
|
case "href", "src":
|
2016-11-25 23:48:44 -08:00
|
|
|
paths = append(paths, a.Val)
|
2016-07-28 20:13:20 -07:00
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
|
f(c)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
f(doc)
|
2016-11-25 23:48:44 -08:00
|
|
|
return paths, nil
|
2016-07-28 20:13:20 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Link records a single hyperlink discovered during a crawl: the page it
// was found on, the target it points at, and any error encountered while
// following it.
type Link struct {
	// From is the URL of the page on which the link was found.
	From string
	// To is the link's target URL.
	To string
	// Err holds any error encountered when fetching or parsing the target;
	// nil on success.
	Err error
}
|
|
|
|
|
|
|
|
func (l Link) String() string {
|
2016-11-25 23:48:44 -08:00
|
|
|
r := fmt.Sprintf("%s > %s", l.From, l.To)
|
|
|
|
if l.Err != nil {
|
|
|
|
r = fmt.Sprintf("%v (%v)", r, l.Err)
|
|
|
|
}
|
|
|
|
return r
|
2016-07-28 20:13:20 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Pages returns a stream of full urls starting at a given base page.
|
|
|
|
func Pages(base string) <-chan Link {
|
2016-11-25 23:48:44 -08:00
|
|
|
r := make(chan Link)
|
|
|
|
|
2016-07-28 20:13:20 -07:00
|
|
|
base = strings.TrimRight(base, "/")
|
2016-11-24 09:29:31 -08:00
|
|
|
visited := map[string]bool{base: true}
|
2016-11-25 23:48:44 -08:00
|
|
|
links := []Link{}
|
2016-07-28 20:13:20 -07:00
|
|
|
|
2016-11-25 23:48:44 -08:00
|
|
|
resp, err := http.Get(base)
|
|
|
|
if err != nil {
|
|
|
|
go func() {
|
|
|
|
r <- Link{To: base, From: "start", Err: err}
|
|
|
|
close(r)
|
|
|
|
}()
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
lks, err := URLs(resp.Body)
|
|
|
|
if err != nil {
|
|
|
|
go func() {
|
|
|
|
r <- Link{To: base, From: "start", Err: err}
|
|
|
|
close(r)
|
|
|
|
}()
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, l := range lks {
|
|
|
|
links = append(links, Link{From: base, To: l})
|
|
|
|
}
|
2016-07-28 20:13:20 -07:00
|
|
|
|
|
|
|
go func() {
|
|
|
|
for len(links) > 0 {
|
|
|
|
l := links[0]
|
|
|
|
links = links[1:]
|
2016-11-24 09:09:20 -08:00
|
|
|
if strings.HasPrefix(l.To, "/") {
|
2016-07-28 20:13:20 -07:00
|
|
|
l.To = base + l.To
|
|
|
|
}
|
2016-11-24 09:09:20 -08:00
|
|
|
if !strings.HasPrefix(l.To, base) {
|
|
|
|
continue
|
|
|
|
}
|
2016-07-28 20:13:20 -07:00
|
|
|
|
|
|
|
// drop fragments
|
|
|
|
h := strings.Index(l.To, "#")
|
|
|
|
if h >= 0 {
|
|
|
|
l.To = l.To[:h]
|
|
|
|
}
|
|
|
|
|
|
|
|
if _, ok := visited[l.To]; !ok {
|
|
|
|
r <- l
|
2016-11-25 23:48:44 -08:00
|
|
|
|
|
|
|
resp, err := http.Get(l.To)
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
lks, err := URLs(resp.Body)
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, lk := range lks {
|
|
|
|
links = append(links, Link{From: l.From, To: lk})
|
2016-07-28 20:13:20 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
visited[l.To] = true
|
|
|
|
}
|
|
|
|
close(r)
|
|
|
|
}()
|
|
|
|
return r
|
|
|
|
}
|