spider/spider.go

121 lines
2.0 KiB
Go
Raw Normal View History

2016-07-28 20:13:20 -07:00
package spider
import (
"fmt"
2016-11-25 23:48:44 -08:00
"io"
2016-07-28 20:13:20 -07:00
"net/http"
"strings"
"golang.org/x/net/html"
)
// URLs returns all links on a page
2016-11-25 23:48:44 -08:00
func URLs(page io.Reader) ([]string, error) {
doc, err := html.Parse(page)
2016-07-28 20:13:20 -07:00
if err != nil {
2016-11-25 23:48:44 -08:00
return nil, fmt.Errorf("parsing html: %v", err)
2016-07-28 20:13:20 -07:00
}
2016-11-25 23:48:44 -08:00
paths := []string{}
2016-07-28 20:13:20 -07:00
var f func(*html.Node)
f = func(n *html.Node) {
2016-11-25 23:34:07 -08:00
if n.Type == html.ElementNode {
2016-07-28 20:13:20 -07:00
for _, a := range n.Attr {
2016-11-25 23:34:07 -08:00
switch a.Key {
case "href", "src":
2016-11-25 23:48:44 -08:00
paths = append(paths, a.Val)
2016-07-28 20:13:20 -07:00
break
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
2016-11-25 23:48:44 -08:00
return paths, nil
2016-07-28 20:13:20 -07:00
}
// Link records a single hyperlink discovered during a crawl.
type Link struct {
	From string // URL of the page the link was found on
	To   string // URL the link points to
	Err  error  // non-nil if fetching or parsing To failed
}
func (l Link) String() string {
2016-11-25 23:48:44 -08:00
r := fmt.Sprintf("%s > %s", l.From, l.To)
if l.Err != nil {
r = fmt.Sprintf("%v (%v)", r, l.Err)
}
return r
2016-07-28 20:13:20 -07:00
}
// Pages returns a stream of full urls starting at a given base page.
func Pages(base string) <-chan Link {
2016-11-25 23:48:44 -08:00
r := make(chan Link)
2016-07-28 20:13:20 -07:00
base = strings.TrimRight(base, "/")
2016-11-24 09:29:31 -08:00
visited := map[string]bool{base: true}
2016-11-25 23:48:44 -08:00
links := []Link{}
2016-07-28 20:13:20 -07:00
2016-11-25 23:48:44 -08:00
resp, err := http.Get(base)
if err != nil {
go func() {
r <- Link{To: base, From: "start", Err: err}
close(r)
}()
return r
}
lks, err := URLs(resp.Body)
if err != nil {
go func() {
r <- Link{To: base, From: "start", Err: err}
close(r)
}()
return r
}
for _, l := range lks {
links = append(links, Link{From: base, To: l})
}
2016-07-28 20:13:20 -07:00
go func() {
for len(links) > 0 {
l := links[0]
links = links[1:]
2016-11-24 09:09:20 -08:00
if strings.HasPrefix(l.To, "/") {
2016-07-28 20:13:20 -07:00
l.To = base + l.To
}
2016-11-24 09:09:20 -08:00
if !strings.HasPrefix(l.To, base) {
continue
}
2016-07-28 20:13:20 -07:00
// drop fragments
h := strings.Index(l.To, "#")
if h >= 0 {
l.To = l.To[:h]
}
if _, ok := visited[l.To]; !ok {
r <- l
2016-11-25 23:48:44 -08:00
resp, err := http.Get(l.To)
if err != nil {
panic(err)
}
lks, err := URLs(resp.Body)
if err != nil {
panic(err)
}
for _, lk := range lks {
links = append(links, Link{From: l.From, To: lk})
2016-07-28 20:13:20 -07:00
}
}
visited[l.To] = true
}
close(r)
}()
return r
}