package spider

import (
	"fmt"
	"io"
	"net/http"
	"strings"

	"golang.org/x/net/html"
)

// URLs returns the values of every href and src attribute found in the
// HTML document read from page.
func URLs(page io.Reader) ([]string, error) {
	doc, err := html.Parse(page)
	if err != nil {
		return nil, fmt.Errorf("parsing html: %w", err)
	}
	paths := []string{}
	// Walk the parse tree depth-first, collecting link attributes.
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode {
			for _, a := range n.Attr {
				switch a.Key {
				case "href", "src":
					paths = append(paths, a.Val)
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return paths, nil
}

// Link records one edge of the crawl: the page a URL was found on, the URL
// it points to, and any error encountered while fetching it.
type Link struct {
	From string
	To   string
	Err  error
}

func (l Link) String() string {
	r := fmt.Sprintf("%s > %s", l.From, l.To)
	if l.Err != nil {
		r = fmt.Sprintf("%v (%v)", r, l.Err)
	}
	return r
}

// Pages returns a stream of links discovered by crawling outward from a
// given base page, restricted to URLs under base. The channel is closed
// when the crawl queue is exhausted.
func Pages(base string) <-chan Link {
	r := make(chan Link)
	base = strings.TrimRight(base, "/")
	visited := map[string]bool{base: true}
	links := []Link{}

	// fail reports a startup error on the stream and closes it.
	fail := func(err error) {
		go func() {
			r <- Link{From: "start", To: base, Err: err}
			close(r)
		}()
	}

	resp, err := http.Get(base)
	if err != nil {
		fail(err)
		return r
	}
	lks, err := URLs(resp.Body)
	resp.Body.Close()
	if err != nil {
		fail(err)
		return r
	}
	for _, l := range lks {
		links = append(links, Link{From: base, To: l})
	}

	go func() {
		for len(links) > 0 {
			// Pop the next link off the queue (breadth-first).
			l := links[0]
			links = links[1:]

			// Resolve root-relative paths against base.
			if strings.HasPrefix(l.To, "/") {
				l.To = base + l.To
			}
			// Stay on the starting site.
			if !strings.HasPrefix(l.To, base) {
				continue
			}
			// Drop fragments so anchors on one page dedupe together.
			if h := strings.Index(l.To, "#"); h >= 0 {
				l.To = l.To[:h]
			}

			if !visited[l.To] {
				// Fetch the page; report failures on the stream
				// via Link.Err instead of crashing the crawl.
				resp, err := http.Get(l.To)
				if err != nil {
					l.Err = err
				} else {
					lks, perr := URLs(resp.Body)
					resp.Body.Close()
					if perr != nil {
						l.Err = perr
					} else {
						// Links found on this page originate from it.
						for _, lk := range lks {
							links = append(links, Link{From: l.To, To: lk})
						}
					}
				}
				r <- l
			}
			visited[l.To] = true
		}
		close(r)
	}()
	return r
}
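
// URLs can be exercised without a network round trip; a minimal sketch,
// assuming the package is imported as "spider" and using strings.NewReader
// to stand in for a response body:
//
//	doc := strings.NewReader(`<p><a href="/about">About</a><img src="/logo.png"></p>`)
//	paths, err := spider.URLs(doc)
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(paths) // [/about /logo.png]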
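
// A usage sketch for Pages (the import path and start URL below are
// assumptions for illustration, not part of this package):
//
//	package main
//
//	import (
//		"fmt"
//
//		"example.com/spider" // hypothetical import path
//	)
//
//	func main() {
//		// Ranging over the channel drains the crawl; the loop ends
//		// when Pages closes the stream. Each Link prints via its
//		// String method as "from > to (err)".
//		for link := range spider.Pages("https://example.com") {
//			fmt.Println(link)
//		}
//	}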