init
This commit is contained in:
commit
eea0aee587
36
cmd/crawl/main.go
Normal file
36
cmd/crawl/main.go
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"mcquay.me/spider"
|
||||||
|
)
|
||||||
|
|
||||||
|
const usage = "crawl <url>"
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
if len(os.Args) < 2 {
|
||||||
|
fmt.Fprintf(os.Stderr, "%s\n", usage)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
failures := []spider.Link{}
|
||||||
|
for p := range spider.Pages(os.Args[1]) {
|
||||||
|
resp, err := http.Get(p.To)
|
||||||
|
if err != nil {
|
||||||
|
failures = append(failures, p)
|
||||||
|
}
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
failures = append(failures, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(failures) > 0 {
|
||||||
|
for _, f := range failures {
|
||||||
|
fmt.Fprintf(os.Stderr, "%+v\n", f)
|
||||||
|
}
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
}
|
20
cmd/lnks/main.go
Normal file
20
cmd/lnks/main.go
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"mcquay.me/spider"
|
||||||
|
)
|
||||||
|
|
||||||
|
const usage = "lnks <url>"
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
if len(os.Args) < 2 {
|
||||||
|
fmt.Fprintf(os.Stderr, "%s\n", usage)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
for _, l := range spider.URLs(os.Args[1]) {
|
||||||
|
fmt.Println(l)
|
||||||
|
}
|
||||||
|
}
|
27
licence.txt
Normal file
27
licence.txt
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
Copyright (c) 2016, stephen mcquay
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of spider nor the names of its contributors
|
||||||
|
may be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
8
readme.md
Normal file
8
readme.md
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
# spider
|
||||||
|
|
||||||
|
`spider` is a little tool for me to learn how to crawl webpages using the Go
|
||||||
|
stdlib.
|
||||||
|
|
||||||
|
## installation
|
||||||
|
|
||||||
|
$ go get mcquay.me/spider/...
|
89
spider.go
Normal file
89
spider.go
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
package spider
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
)
|
||||||
|
|
||||||
|
// URLs returns all links on a page
|
||||||
|
func URLs(url string) []Link {
|
||||||
|
resp, err := http.Get(url)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "%v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
doc, err := html.Parse(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "%v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
paths := []Link{}
|
||||||
|
var f func(*html.Node)
|
||||||
|
f = func(n *html.Node) {
|
||||||
|
if n.Type == html.ElementNode && n.Data == "a" {
|
||||||
|
for _, a := range n.Attr {
|
||||||
|
if a.Key == "href" {
|
||||||
|
paths = append(paths, Link{From: url, To: a.Val})
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||||
|
f(c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
f(doc)
|
||||||
|
return paths
|
||||||
|
}
|
||||||
|
|
||||||
|
// Link describes a single hyperlink: where it was found and where it points.
type Link struct {
	// From is the URL of the page on which the link was found.
	From string
	// To is the link's target.
	To string
}

// String formats the link as "<From> > <To>".
func (l Link) String() string {
	return fmt.Sprint(l.From, " > ", l.To)
}
|
||||||
|
|
||||||
|
// Pages returns a stream of full urls starting at a given base page.
|
||||||
|
func Pages(base string) <-chan Link {
|
||||||
|
base = strings.TrimRight(base, "/")
|
||||||
|
visited := map[string]bool{}
|
||||||
|
links := URLs(base)
|
||||||
|
links = append(links, Link{From: "start", To: base})
|
||||||
|
|
||||||
|
r := make(chan Link)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
for len(links) > 0 {
|
||||||
|
l := links[0]
|
||||||
|
links = links[1:]
|
||||||
|
if !(strings.HasPrefix(l.To, base) || strings.HasPrefix(l.To, "/")) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !strings.HasPrefix(l.To, base) && strings.HasPrefix(l.To, "/") {
|
||||||
|
l.To = base + l.To
|
||||||
|
}
|
||||||
|
|
||||||
|
// drop fragments
|
||||||
|
h := strings.Index(l.To, "#")
|
||||||
|
if h >= 0 {
|
||||||
|
l.To = l.To[:h]
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := visited[l.To]; !ok {
|
||||||
|
r <- l
|
||||||
|
for _, lk := range URLs(l.To) {
|
||||||
|
links = append(links, lk)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
visited[l.To] = true
|
||||||
|
}
|
||||||
|
close(r)
|
||||||
|
}()
|
||||||
|
return r
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user