init
commit eea0aee587
cmd/crawl/main.go (new file)
package main

import (
	"fmt"
	"net/http"
	"os"

	"mcquay.me/spider"
)

const usage = "crawl <url>"

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintf(os.Stderr, "%s\n", usage)
		os.Exit(1)
	}

	failures := []spider.Link{}
	for p := range spider.Pages(os.Args[1]) {
		resp, err := http.Get(p.To)
		if err != nil {
			failures = append(failures, p)
			// resp is nil when err != nil; skip the status check
			// to avoid a nil pointer dereference.
			continue
		}
		resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			failures = append(failures, p)
		}
	}

	if len(failures) > 0 {
		for _, f := range failures {
			fmt.Fprintf(os.Stderr, "%+v\n", f)
		}
		os.Exit(1)
	}
}
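Usage note (not part of the commit): crawl walks every page reachable from the given url, prints each failing link to stderr (Link's String method renders "from > to", and fmt uses it for %+v), and exits 1. An illustrative run with a placeholder url and a made-up broken link:

    $ crawl https://example.com
    https://example.com > https://example.com/missing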
cmd/lnks/main.go (new file)
package main

import (
	"fmt"
	"os"

	"mcquay.me/spider"
)

const usage = "lnks <url>"

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintf(os.Stderr, "%s\n", usage)
		os.Exit(1)
	}
	for _, l := range spider.URLs(os.Args[1]) {
		fmt.Println(l)
	}
}
licence.txt (new file)
Copyright (c) 2016, stephen mcquay

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
* Neither the name of vaind nor the names of its contributors
  may be used to endorse or promote products derived from this software
  without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
readme.md (new file)
# spider

`spider` is a little tool for me to learn how to crawl webpages using the Go
stdlib.

## installation

    $ go get mcquay.me/spider/...
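## usage

Assuming `go get` placed the `crawl` and `lnks` binaries on your `PATH` (the
names follow the `cmd/` subdirectories), and with a placeholder url:

    $ lnks https://example.com    # print every link on one page
    $ crawl https://example.com   # walk the site, report broken links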
spider.go (new file)
package spider

import (
	"fmt"
	"net/http"
	"os"
	"strings"

	"golang.org/x/net/html"
)

// URLs returns all links on a page.
func URLs(url string) []Link {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
	defer resp.Body.Close()
	doc, err := html.Parse(resp.Body)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
	paths := []Link{}
	var f func(*html.Node)
	f = func(n *html.Node) {
		// collect the href of each anchor element
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					paths = append(paths, Link{From: url, To: a.Val})
					break
				}
			}
		}
		// recurse into all children
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return paths
}

// Link is a directed edge from one page to another.
type Link struct {
	From string
	To   string
}

func (l Link) String() string {
	return fmt.Sprintf("%s > %s", l.From, l.To)
}

// Pages returns a stream of full urls starting at a given base page.
func Pages(base string) <-chan Link {
	base = strings.TrimRight(base, "/")
	visited := map[string]bool{}
	links := URLs(base)
	links = append(links, Link{From: "start", To: base})

	r := make(chan Link)

	go func() {
		for len(links) > 0 {
			l := links[0]
			links = links[1:]
			// skip links that point off-site
			if !(strings.HasPrefix(l.To, base) || strings.HasPrefix(l.To, "/")) {
				continue
			}
			// expand root-relative paths into full urls
			if !strings.HasPrefix(l.To, base) && strings.HasPrefix(l.To, "/") {
				l.To = base + l.To
			}

			// drop fragments
			h := strings.Index(l.To, "#")
			if h >= 0 {
				l.To = l.To[:h]
			}

			if _, ok := visited[l.To]; !ok {
				r <- l
				links = append(links, URLs(l.To)...)
			}
			visited[l.To] = true
		}
		close(r)
	}()
	return r
}
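As a usage sketch (not part of the commit): a minimal program consuming the Pages stream as a library, with a placeholder url:

package main

import (
	"fmt"

	"mcquay.me/spider"
)

func main() {
	// Pages streams each same-site page as it is discovered;
	// Link's String method renders each one as "from > to".
	for l := range spider.Pages("https://example.com") {
		fmt.Println(l)
	}
}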