spider/spider.go

90 lines
1.6 KiB
Go

package spider
import (
"fmt"
"net/http"
"os"
"strings"
"golang.org/x/net/html"
)
// URLs fetches the page at url and returns one Link for every <a> element
// that carries an href attribute, in document order. Each Link has From set
// to url and To set to the raw href value; hrefs are not resolved, filtered
// or de-duplicated here. Only the first href attribute of an element is used.
//
// The result is an empty, non-nil slice when the page contains no links.
//
// If the request or the HTML parse fails, the error is written to standard
// error and the process exits with status 1. The HTTP status code is not
// inspected, so error pages are parsed like any other response.
func URLs(url string) []Link {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
	doc, err := html.Parse(resp.Body)
	// Close explicitly rather than with defer: deferred calls do not run on
	// os.Exit, and an unclosed body leaks a connection per crawled page.
	// The close error is not actionable once the body has been read, so it
	// is deliberately ignored.
	_ = resp.Body.Close()
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}

	paths := []Link{}
	// Depth-first, pre-order walk of the parsed tree collecting anchors.
	var visit func(*html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					paths = append(paths, Link{From: url, To: a.Val})
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}
	visit(doc)
	return paths
}
type Link struct {
From string
To string
}
func (l Link) String() string {
return fmt.Sprintf("%s > %s", l.From, l.To)
}
// Pages crawls the site rooted at base and returns a channel carrying every
// link it follows, with To rewritten to a full URL.
//
// Trailing slashes are trimmed from base. The crawl is breadth-first and is
// seeded with Link{From: "start", To: base}, so the base page is the first
// value sent. An href is followed only if it starts with base or is a
// site-relative path starting with a single "/" (which is prefixed with
// base); all other hrefs, including protocol-relative "//host/..." ones, are
// skipped. Fragments are dropped before de-duplication, and each resulting
// URL is sent on the channel and fetched at most once.
//
// The crawl runs in its own goroutine and the channel is closed once the
// queue is empty. The channel is unbuffered, so the goroutine blocks until
// each link is received; callers should drain the channel. Pages are fetched
// with URLs, which terminates the process on a fetch or parse error.
func Pages(base string) <-chan Link {
	base = strings.TrimRight(base, "/")
	r := make(chan Link)
	go func() {
		defer close(r)

		visited := map[string]bool{}
		// Seeding with the base page alone means it is fetched exactly once,
		// by the loop below, like every other page.
		queue := []Link{{From: "start", To: base}}

		for len(queue) > 0 {
			l := queue[0]
			queue = queue[1:]

			switch {
			case strings.HasPrefix(l.To, base):
				// Already a full URL on this site.
			case strings.HasPrefix(l.To, "//"):
				// Protocol-relative reference to another host; gluing it onto
				// base would produce a bogus same-site URL.
				continue
			case strings.HasPrefix(l.To, "/"):
				l.To = base + l.To
			default:
				// External, relative, fragment-only, mailto:, etc.
				continue
			}

			// Drop fragments so "#section" variants count as the same page.
			if h := strings.Index(l.To, "#"); h >= 0 {
				l.To = l.To[:h]
			}

			if visited[l.To] {
				continue
			}
			visited[l.To] = true

			r <- l
			queue = append(queue, URLs(l.To)...)
		}
	}()
	return r
}