package spider

import (
	"fmt"
	"net/http"
	"os"
	"strings"

	"golang.org/x/net/html"
)

// URLs returns all links found on the page at url.
func URLs(url string) []Link {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
	paths := []Link{}
	// f walks the parse tree depth-first, recording the href of every
	// anchor element it encounters.
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					paths = append(paths, Link{From: url, To: a.Val})
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return paths
}

// Link records a hyperlink from one page to another.
type Link struct {
	From string
	To   string
}

func (l Link) String() string {
	return fmt.Sprintf("%s > %s", l.From, l.To)
}

// Pages returns a stream of full URLs reachable from a given base page.
func Pages(base string) <-chan Link {
	base = strings.TrimRight(base, "/")
	visited := map[string]bool{base: true}
	links := URLs(base)
	r := make(chan Link)
	go func() {
		// Breadth-first crawl: links acts as the work queue.
		for len(links) > 0 {
			l := links[0]
			links = links[1:]
			// Resolve root-relative links against the base.
			if strings.HasPrefix(l.To, "/") {
				l.To = base + l.To
			}
			// Skip links that leave the site.
			if !strings.HasPrefix(l.To, base) {
				continue
			}
			// Drop fragments so anchors on the same page aren't revisited.
			if h := strings.Index(l.To, "#"); h >= 0 {
				l.To = l.To[:h]
			}
			if !visited[l.To] {
				r <- l
				links = append(links, URLs(l.To)...)
			}
			visited[l.To] = true
		}
		close(r)
	}()
	return r
}
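
// ExamplePages is a minimal usage sketch, not part of the crawler itself:
// the start URL below is a placeholder, and a real caller would import this
// package and range over the returned channel the same way. Because Pages
// closes the channel when the crawl finishes, the loop terminates on its own.
func ExamplePages() {
	for link := range Pages("https://example.com") {
		fmt.Println(link) // prints "from > to" via Link's String method
	}
}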