commit eea0aee587705b88eed550fecc7c428d290f0111
Author: Stephen McQuay (smcquay)
Date:   Thu Jul 28 20:13:20 2016 -0700

    init

diff --git a/cmd/crawl/main.go b/cmd/crawl/main.go
new file mode 100644
index 0000000..8de42bd
--- /dev/null
+++ b/cmd/crawl/main.go
@@ -0,0 +1,38 @@
+package main
+
+import (
+	"fmt"
+	"net/http"
+	"os"
+
+	"mcquay.me/spider"
+)
+
+const usage = "crawl <url>"
+
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintf(os.Stderr, "%s\n", usage)
+		os.Exit(1)
+	}
+
+	failures := []spider.Link{}
+	for p := range spider.Pages(os.Args[1]) {
+		resp, err := http.Get(p.To)
+		if err != nil {
+			failures = append(failures, p)
+			continue
+		}
+		resp.Body.Close()
+		if resp.StatusCode != http.StatusOK {
+			failures = append(failures, p)
+		}
+	}
+
+	if len(failures) > 0 {
+		for _, f := range failures {
+			fmt.Fprintf(os.Stderr, "%+v\n", f)
+		}
+		os.Exit(1)
+	}
+}
diff --git a/cmd/lnks/main.go b/cmd/lnks/main.go
new file mode 100644
index 0000000..01eaf48
--- /dev/null
+++ b/cmd/lnks/main.go
@@ -0,0 +1,20 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"mcquay.me/spider"
+)
+
+const usage = "lnks <url>"
+
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintf(os.Stderr, "%s\n", usage)
+		os.Exit(1)
+	}
+	for _, l := range spider.URLs(os.Args[1]) {
+		fmt.Println(l)
+	}
+}
diff --git a/licence.txt b/licence.txt
new file mode 100644
index 0000000..1b44b3d
--- /dev/null
+++ b/licence.txt
@@ -0,0 +1,27 @@
+Copyright (c) 2016, stephen mcquay
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of vaind nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..b44414a
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,8 @@
+# spider
+
+`spider` is a little tool for me to learn how to crawl webpages using the Go
+stdlib.
+
+## installation
+
+    $ go get mcquay.me/spider/...
diff --git a/spider.go b/spider.go
new file mode 100644
index 0000000..365d517
--- /dev/null
+++ b/spider.go
@@ -0,0 +1,91 @@
+package spider
+
+import (
+	"fmt"
+	"net/http"
+	"os"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// URLs returns all links on a page.
+func URLs(url string) []Link {
+	resp, err := http.Get(url)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "%v\n", err)
+		os.Exit(1)
+	}
+	defer resp.Body.Close()
+	doc, err := html.Parse(resp.Body)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "%v\n", err)
+		os.Exit(1)
+	}
+	paths := []Link{}
+	var f func(*html.Node)
+	f = func(n *html.Node) {
+		if n.Type == html.ElementNode && n.Data == "a" {
+			for _, a := range n.Attr {
+				if a.Key == "href" {
+					paths = append(paths, Link{From: url, To: a.Val})
+					break
+				}
+			}
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			f(c)
+		}
+	}
+	f(doc)
+	return paths
+}
+
+// Link records a hyperlink from one page (From) to another (To).
+type Link struct {
+	From string
+	To   string
+}
+
+func (l Link) String() string {
+	return fmt.Sprintf("%s > %s", l.From, l.To)
+}
+
+// Pages returns a stream of full urls starting at a given base page.
+func Pages(base string) <-chan Link {
+	base = strings.TrimRight(base, "/")
+	visited := map[string]bool{}
+	links := URLs(base)
+	links = append(links, Link{From: "start", To: base})
+
+	r := make(chan Link)
+
+	go func() {
+		for len(links) > 0 {
+			l := links[0]
+			links = links[1:]
+			// only follow links that stay on the site: absolute urls
+			// under base, or root-relative paths
+			if !(strings.HasPrefix(l.To, base) || strings.HasPrefix(l.To, "/")) {
+				continue
+			}
+			if !strings.HasPrefix(l.To, base) && strings.HasPrefix(l.To, "/") {
+				l.To = base + l.To
+			}
+
+			// drop fragments
+			h := strings.Index(l.To, "#")
+			if h >= 0 {
+				l.To = l.To[:h]
+			}
+
+			if _, ok := visited[l.To]; !ok {
+				r <- l
+				links = append(links, URLs(l.To)...)
+			}
+			visited[l.To] = true
+		}
+		close(r)
+	}()
+	return r
+}
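Note: the commit only exercises the library through the two commands in cmd/. A minimal sketch of driving the package directly follows; it assumes nothing beyond the spider.Pages and Link API introduced above (the program name sitemap and the page counter are illustrative, not part of the repository):

package main

import (
	"fmt"
	"os"

	"mcquay.me/spider"
)

// Print every link the crawler discovers, then a count of pages visited.
// Pages streams Links over a channel, so output appears while the crawl runs.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "sitemap <url>")
		os.Exit(1)
	}
	n := 0
	for l := range spider.Pages(os.Args[1]) {
		fmt.Println(l) // Link.String renders "from > to"
		n++
	}
	fmt.Printf("visited %d pages\n", n)
}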