use io.Reader when parsing urls

This commit is contained in:
Stephen McQuay 2016-11-25 23:48:44 -08:00
parent b2df3da864
commit ccc08a5da1
No known key found for this signature in database
GPG Key ID: 1ABF428F71BAFC3D
3 changed files with 66 additions and 19 deletions

View File

@@ -2,6 +2,8 @@ package main
import ( import (
"fmt" "fmt"
"io"
"io/ioutil"
"net/http" "net/http"
"os" "os"
@@ -20,9 +22,14 @@ func main() {
for p := range spider.Pages(os.Args[1]) { for p := range spider.Pages(os.Args[1]) {
resp, err := http.Get(p.To) resp, err := http.Get(p.To)
if err != nil { if err != nil {
p.Err = err
failures = append(failures, p) failures = append(failures, p)
continue
} }
io.Copy(ioutil.Discard, resp.Body)
resp.Body.Close()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
p.Err = fmt.Errorf("http status; got %s, want %s", http.StatusText(resp.StatusCode), http.StatusText(http.StatusOK))
failures = append(failures, p) failures = append(failures, p)
} }
} }

View File

@@ -2,6 +2,7 @@ package main
import ( import (
"fmt" "fmt"
"net/http"
"os" "os"
"mcquay.me/spider" "mcquay.me/spider"
@@ -14,7 +15,15 @@ func main() {
fmt.Fprintf(os.Stderr, "%s\n", usage) fmt.Fprintf(os.Stderr, "%s\n", usage)
os.Exit(1) os.Exit(1)
} }
for _, l := range spider.URLs(os.Args[1]) { resp, err := http.Get(os.Args[1])
if err != nil {
panic(err)
}
links, err := spider.URLs(resp.Body)
if err != nil {
panic(err)
}
for _, l := range links {
fmt.Println(l) fmt.Println(l)
} }
} }

View File

@@ -2,33 +2,27 @@ package spider
import ( import (
"fmt" "fmt"
"io"
"net/http" "net/http"
"os"
"strings" "strings"
"golang.org/x/net/html" "golang.org/x/net/html"
) )
// URLs returns all links on a page // URLs returns all links on a page
func URLs(url string) []Link { func URLs(page io.Reader) ([]string, error) {
resp, err := http.Get(url) doc, err := html.Parse(page)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err) return nil, fmt.Errorf("parsing html: %v", err)
os.Exit(1)
} }
doc, err := html.Parse(resp.Body) paths := []string{}
if err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
paths := []Link{}
var f func(*html.Node) var f func(*html.Node)
f = func(n *html.Node) { f = func(n *html.Node) {
if n.Type == html.ElementNode { if n.Type == html.ElementNode {
for _, a := range n.Attr { for _, a := range n.Attr {
switch a.Key { switch a.Key {
case "href", "src": case "href", "src":
paths = append(paths, Link{From: url, To: a.Val}) paths = append(paths, a.Val)
break break
} }
} }
@@ -38,25 +32,52 @@ func URLs(url string) []Link {
} }
} }
f(doc) f(doc)
return paths return paths, nil
} }
type Link struct { type Link struct {
From string From string
To string To string
Err error
} }
func (l Link) String() string { func (l Link) String() string {
return fmt.Sprintf("%s > %s", l.From, l.To) r := fmt.Sprintf("%s > %s", l.From, l.To)
if l.Err != nil {
r = fmt.Sprintf("%v (%v)", r, l.Err)
}
return r
} }
// Pages returns a stream of full urls starting at a given base page. // Pages returns a stream of full urls starting at a given base page.
func Pages(base string) <-chan Link { func Pages(base string) <-chan Link {
r := make(chan Link)
base = strings.TrimRight(base, "/") base = strings.TrimRight(base, "/")
visited := map[string]bool{base: true} visited := map[string]bool{base: true}
links := URLs(base) links := []Link{}
r := make(chan Link) resp, err := http.Get(base)
if err != nil {
go func() {
r <- Link{To: base, From: "start", Err: err}
close(r)
}()
return r
}
lks, err := URLs(resp.Body)
if err != nil {
go func() {
r <- Link{To: base, From: "start", Err: err}
close(r)
}()
return r
}
for _, l := range lks {
links = append(links, Link{From: base, To: l})
}
go func() { go func() {
for len(links) > 0 { for len(links) > 0 {
@@ -77,8 +98,18 @@ func Pages(base string) <-chan Link {
if _, ok := visited[l.To]; !ok { if _, ok := visited[l.To]; !ok {
r <- l r <- l
for _, lk := range URLs(l.To) {
links = append(links, lk) resp, err := http.Get(l.To)
if err != nil {
panic(err)
}
lks, err := URLs(resp.Body)
if err != nil {
panic(err)
}
for _, lk := range lks {
links = append(links, Link{From: l.From, To: lk})
} }
} }
visited[l.To] = true visited[l.To] = true