Commit 4c82422d authored by ale

make Scope checking more modular

parent efe98903
@@ -124,7 +124,11 @@ func main() {
}
seeds := crawl.MustParseURLs(flag.Args())
scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
scope := []crawl.Scope{
	crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
	crawl.NewDepthScope(*depth),
	crawl.NewSeedScope(seeds),
}
w := warc.NewWriter(outf)
defer w.Close()
......
@@ -38,7 +38,11 @@ func main() {
flag.Parse()
seeds := crawl.MustParseURLs(flag.Args())
scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
scope := []crawl.Scope{
	crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
	crawl.NewDepthScope(*depth),
	crawl.NewSeedScope(seeds),
}
crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
if err != nil {
......
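With the scope expressed as a plain []crawl.Scope, each command can now tailor its checks instead of going through a single monolithic constructor. As a minimal sketch (the seed URL is made up; the constructors are the ones introduced by this commit), a caller that wants no depth limit can simply leave NewDepthScope out:

seeds := crawl.MustParseURLs([]string{"http://example.com/docs/"})
scope := []crawl.Scope{
	crawl.NewSchemeScope([]string{"http", "https"}),
	crawl.NewSeedScope(seeds),
}
// The checks are ANDed by the crawler (see the Enqueue change below),
// so omitting NewDepthScope just removes the depth restriction.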
@@ -8,7 +8,6 @@ import (
"log"
"net/http"
"net/url"
"strings"
"sync"
"time"
@@ -58,10 +57,6 @@ type URLInfo struct {
Error error
}
type Scope interface {
	Check(*url.URL, int) bool
}

type Fetcher interface {
	Fetch(string) (*http.Response, error)
}
@@ -86,7 +81,7 @@ func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Respons
type Crawler struct {
db *gobDB
seeds []*url.URL
scope Scope
scopes []Scope
fetcher Fetcher
handler Handler
@@ -111,9 +106,11 @@ func (c *Crawler) Enqueue(u *url.URL, depth int) {
// Normalize the URL.
urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
// See if it's in scope.
if !c.scope.Check(u, depth) {
return
// See if it's in scope. Checks are ANDed.
for _, sc := range c.scopes {
if !sc.Check(u, depth) {
return
}
}
c.enqueueMx.Lock()
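The loop above replaces the single scope.Check call: a URL is enqueued only if every configured Scope accepts it, so each scope can veto a URL independently. A rough illustration of the same semantics outside the crawler (a snippet in package crawl; the URL, depth and limits are made up, not part of this commit):

u, err := url.Parse("https://example.com/docs/page.html")
if err != nil {
	log.Fatal(err)
}
scopes := []Scope{
	NewSchemeScope([]string{"http", "https"}),
	NewDepthScope(5),
	NewSeedScope(MustParseURLs([]string{"http://example.com/docs/"})),
}
inScope := true
for _, sc := range scopes {
	// One rejection is enough, exactly as in Enqueue.
	inScope = inScope && sc.Check(u, 2)
}
// inScope is true at depth 2; at depth 5 or more, NewDepthScope(5) would reject the URL.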
@@ -202,46 +199,6 @@ func (c *Crawler) urlHandler(queue <-chan QueuePair) {
}
}
type seedScope struct {
	seeds    []*url.URL
	schemes  map[string]struct{}
	maxDepth int
}

func (s *seedScope) Check(u *url.URL, depth int) bool {
	// Ignore non-allowed schemes.
	if _, ok := s.schemes[u.Scheme]; !ok {
		return false
	}
	// Do not crawl beyond maxDepth.
	if depth > s.maxDepth {
		return false
	}
	// Check each seed prefix.
	for _, seed := range s.seeds {
		if u.Host == seed.Host && strings.HasPrefix(u.Path, seed.Path) {
			return true
		}
	}
	return false
}

// NewSeedScope returns a Scope that will only allow crawling the seed
// domains, and not beyond the specified maximum link depth.
func NewSeedScope(seeds []*url.URL, maxDepth int, allowedSchemes []string) Scope {
	scope := &seedScope{
		seeds:    seeds,
		maxDepth: maxDepth,
		schemes:  make(map[string]struct{}),
	}
	for _, s := range allowedSchemes {
		scope.schemes[s] = struct{}{}
	}
	return scope
}
func MustParseURLs(urls []string) []*url.URL {
// Parse the seed URLs.
var parsed []*url.URL
@@ -256,7 +213,7 @@ func MustParseURLs(urls []string) {
}
// NewCrawler creates a new Crawler object with the specified behavior.
func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
// Open the crawl database.
db, err := newGobDB(path)
if err != nil {
@@ -267,7 +224,7 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
fetcher: f,
handler: &standardPageHandler{h},
seeds: seeds,
scope: scope,
scopes: scopes,
}
return c, nil
}
@@ -321,8 +278,6 @@ func (wrap *standardPageHandler) Handle(c *Crawler, u string, depth int, resp *h
}
info.Error = err
//log.Printf("[CRAWL] %+v", info)
c.UpdateURL(info)
return nil
}
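Because the exported Scope interface (moved into the new scope.go file below) is down to a single Check method, additional policies can be plugged into the []Scope slice without touching the crawler itself. Purely as an illustration, and not part of this commit, a scope that excludes a fixed set of hosts could look like this:

// hostExcludeScope is a hypothetical Scope that rejects URLs on blocked hosts.
type hostExcludeScope struct {
	blocked map[string]struct{}
}

func (s *hostExcludeScope) Check(uri *url.URL, depth int) bool {
	_, found := s.blocked[uri.Host]
	return !found
}

// NewHostExcludeScope builds the hypothetical scope above; the result would
// simply be appended to the []Scope slice passed to NewCrawler.
func NewHostExcludeScope(hosts []string) Scope {
	m := make(map[string]struct{})
	for _, h := range hosts {
		m[h] = struct{}{}
	}
	return &hostExcludeScope{m}
}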
package crawl

import (
	"fmt"
	"net/url"
	"strings"
)

type Scope interface {
	Check(*url.URL, int) bool
}

type maxDepthScope struct {
	maxDepth int
}

func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
	return depth < s.maxDepth
}

// NewDepthScope returns a Scope that will limit crawls to a
// maximum link depth with respect to the crawl seeds.
func NewDepthScope(maxDepth int) Scope {
	return &maxDepthScope{maxDepth}
}

type schemeScope struct {
	allowedSchemes map[string]struct{}
}

func (s *schemeScope) Check(uri *url.URL, depth int) bool {
	_, ok := s.allowedSchemes[uri.Scheme]
	return ok
}

// NewSchemeScope limits the crawl to the specified URL schemes.
func NewSchemeScope(schemes []string) Scope {
	m := make(map[string]struct{})
	for _, s := range schemes {
		m[s] = struct{}{}
	}
	return &schemeScope{m}
}

// A URLPrefixMap makes it easy to check for URL prefixes (even for
// very large lists). The URL scheme is ignored, along with any
// leading "www." prefix.
type URLPrefixMap map[string]struct{}

// normalizeUrlPrefix turns a URL into the "host/path" key format used by
// the map: the leading "www." and any trailing slash are stripped.
func normalizeUrlPrefix(uri *url.URL) string {
	return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/")
}

func (m URLPrefixMap) Add(uri *url.URL) {
	m[normalizeUrlPrefix(uri)] = struct{}{}
}

func (m URLPrefixMap) Contains(uri *url.URL) bool {
	// Walk the path one segment at a time, checking progressively longer
	// "host/path" prefixes against the map.
	s := strings.TrimPrefix(uri.Host, "www.")
	for _, p := range strings.Split(uri.Path, "/") {
		if p == "" {
			continue
		}
		s = fmt.Sprintf("%s/%s", s, p)
		if _, ok := m[s]; ok {
			return true
		}
	}
	return false
}

type urlPrefixScope struct {
	prefixes URLPrefixMap
}

func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
	return s.prefixes.Contains(uri)
}

// NewURLPrefixScope returns a Scope that limits the crawl to a set of
// allowed URL prefixes.
func NewURLPrefixScope(prefixes URLPrefixMap) Scope {
	return &urlPrefixScope{prefixes}
}

// NewSeedScope returns a Scope that will only allow crawling the seed
// prefixes.
func NewSeedScope(seeds []*url.URL) Scope {
	pfx := make(URLPrefixMap)
	for _, s := range seeds {
		pfx.Add(s)
	}
	return NewURLPrefixScope(pfx)
}
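A quick sketch of how the new seed scope behaves, written as a hypothetical test in package crawl (URLs are made up; a "testing" import is assumed): matching ignores the scheme and a leading "www.", and Contains walks the path one segment at a time, so anything under a seed prefix is accepted.

func TestSeedScopeSketch(t *testing.T) {
	scope := NewSeedScope(MustParseURLs([]string{"http://example.com/docs/"}))

	in := MustParseURLs([]string{"https://www.example.com/docs/page.html"})[0]
	out := MustParseURLs([]string{"http://example.com/blog/"})[0]

	if !scope.Check(in, 0) {
		t.Errorf("%s should be in scope", in)
	}
	if scope.Check(out, 0) {
		t.Errorf("%s should not be in scope", out)
	}
}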