Commit 4c82422d authored by ale

make Scope checking more modular

parent efe98903
@@ -124,7 +124,11 @@ func main() {
 	}
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}
 
 	w := warc.NewWriter(outf)
 	defer w.Close()
@@ -38,7 +38,11 @@ func main() {
 	flag.Parse()
 
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}
 
 	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
 	if err != nil {
@@ -8,7 +8,6 @@ import (
 	"log"
 	"net/http"
 	"net/url"
-	"strings"
 	"sync"
 	"time"
 
@@ -58,10 +57,6 @@ type URLInfo struct {
 	Error error
 }
 
-type Scope interface {
-	Check(*url.URL, int) bool
-}
-
 type Fetcher interface {
 	Fetch(string) (*http.Response, error)
 }
@@ -86,7 +81,7 @@ func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Response, err error) error {
 
 type Crawler struct {
 	db      *gobDB
 	seeds   []*url.URL
-	scope   Scope
+	scopes  []Scope
 	fetcher Fetcher
 	handler Handler
@@ -111,10 +106,12 @@ func (c *Crawler) Enqueue(u *url.URL, depth int) {
 	// Normalize the URL.
 	urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
 
-	// See if it's in scope.
-	if !c.scope.Check(u, depth) {
-		return
+	// See if it's in scope. Checks are ANDed.
+	for _, sc := range c.scopes {
+		if !sc.Check(u, depth) {
+			return
+		}
 	}
 
 	c.enqueueMx.Lock()
 	defer c.enqueueMx.Unlock()
@@ -202,46 +199,6 @@ func (c *Crawler) urlHandler(queue <-chan QueuePair) {
 	}
 }
 
-type seedScope struct {
-	seeds    []*url.URL
-	schemes  map[string]struct{}
-	maxDepth int
-}
-
-func (s *seedScope) Check(u *url.URL, depth int) bool {
-	// Ignore non-allowed schemes.
-	if _, ok := s.schemes[u.Scheme]; !ok {
-		return false
-	}
-
-	// Do not crawl beyond maxDepth.
-	if depth > s.maxDepth {
-		return false
-	}
-
-	// Check each seed prefix.
-	for _, seed := range s.seeds {
-		if u.Host == seed.Host && strings.HasPrefix(u.Path, seed.Path) {
-			return true
-		}
-	}
-	return false
-}
-
-// NewSeedScope returns a Scope that will only allow crawling the seed
-// domains, and not beyond the specified maximum link depth.
-func NewSeedScope(seeds []*url.URL, maxDepth int, allowedSchemes []string) Scope {
-	scope := &seedScope{
-		seeds:    seeds,
-		maxDepth: maxDepth,
-		schemes:  make(map[string]struct{}),
-	}
-	for _, s := range allowedSchemes {
-		scope.schemes[s] = struct{}{}
-	}
-	return scope
-}
-
 func MustParseURLs(urls []string) []*url.URL {
 	// Parse the seed URLs.
 	var parsed []*url.URL
@@ -256,7 +213,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -267,7 +224,7 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
 		fetcher: f,
 		handler: &standardPageHandler{h},
 		seeds:   seeds,
-		scope:   scope,
+		scopes:  scopes,
 	}
 	return c, nil
 }
@@ -321,8 +278,6 @@ func (wrap *standardPageHandler) Handle(c *Crawler, u string, depth int, resp *http.Response, err error) error {
 	}
 	info.Error = err
 
-	//log.Printf("[CRAWL] %+v", info)
-
 	c.UpdateURL(info)
 	return nil
 }
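
Because Enqueue now ANDs every Scope in the list, additional restrictions can be layered on without touching the crawler itself. As a purely illustrative sketch (not part of this commit), a host blocklist only needs to implement the Check(*url.URL, int) method; the type and constructor names below are hypothetical:

package crawl

import "net/url"

// hostBlocklistScope is a hypothetical custom Scope that rejects URLs
// whose host appears in a blocklist.
type hostBlocklistScope struct {
	blocked map[string]struct{}
}

func (s *hostBlocklistScope) Check(uri *url.URL, depth int) bool {
	_, found := s.blocked[uri.Host]
	return !found
}

// NewHostBlocklistScope is a hypothetical constructor; appending its
// result to the []Scope passed to NewCrawler would activate it.
func NewHostBlocklistScope(hosts []string) Scope {
	m := make(map[string]struct{}, len(hosts))
	for _, h := range hosts {
		m[h] = struct{}{}
	}
	return &hostBlocklistScope{m}
}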
scope.go 0 → 100644
package crawl

import (
	"fmt"
	"net/url"
	"strings"
)

type Scope interface {
	Check(*url.URL, int) bool
}

type maxDepthScope struct {
	maxDepth int
}

func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
	return depth < s.maxDepth
}

// NewDepthScope returns a Scope that will limit crawls to a
// maximum link depth with respect to the crawl seeds.
func NewDepthScope(maxDepth int) Scope {
	return &maxDepthScope{maxDepth}
}

type schemeScope struct {
	allowedSchemes map[string]struct{}
}

func (s *schemeScope) Check(uri *url.URL, depth int) bool {
	_, ok := s.allowedSchemes[uri.Scheme]
	return ok
}

// NewSchemeScope limits the crawl to the specified URL schemes.
func NewSchemeScope(schemes []string) Scope {
	m := make(map[string]struct{})
	for _, s := range schemes {
		m[s] = struct{}{}
	}
	return &schemeScope{m}
}

// A URLPrefixMap makes it easy to check for URL prefixes (even for
// very large lists). The URL scheme is ignored, along with an
// eventual "www." prefix.
type URLPrefixMap map[string]struct{}

func normalizeUrlPrefix(uri *url.URL) string {
	return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/")
}

func (m URLPrefixMap) Add(uri *url.URL) {
	m[normalizeUrlPrefix(uri)] = struct{}{}
}

func (m URLPrefixMap) Contains(uri *url.URL) bool {
	s := strings.TrimPrefix(uri.Host, "www.")
	for _, p := range strings.Split(uri.Path, "/") {
		if p == "" {
			continue
		}
		s = fmt.Sprintf("%s/%s", s, p)
		if _, ok := m[s]; ok {
			return true
		}
	}
	return false
}

type urlPrefixScope struct {
	prefixes URLPrefixMap
}

func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
	return s.prefixes.Contains(uri)
}

// NewURLPrefixScope returns a Scope that limits the crawl to a set of
// allowed URL prefixes.
func NewURLPrefixScope(prefixes URLPrefixMap) Scope {
	return &urlPrefixScope{prefixes}
}

// NewSeedScope returns a Scope that will only allow crawling the seed
// prefixes.
func NewSeedScope(seeds []*url.URL) Scope {
	pfx := make(URLPrefixMap)
	for _, s := range seeds {
		pfx.Add(s)
	}
	return NewURLPrefixScope(pfx)
}
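
Taken together, the seed scope now matches URL prefixes (ignoring the scheme and a leading "www."), while scheme and depth limits live in separate scopes. A minimal usage sketch follows; the import path is assumed, since the module path is not shown in this commit:

package main

import (
	"fmt"
	"net/url"

	"example.org/crawl" // hypothetical import path, substitute the real one
)

func main() {
	seeds := crawl.MustParseURLs([]string{"http://www.example.com/docs/"})

	// Checks are ANDed by the crawler, mirroring the loop in Enqueue.
	scopes := []crawl.Scope{
		crawl.NewSchemeScope([]string{"http", "https"}),
		crawl.NewDepthScope(2),
		crawl.NewSeedScope(seeds),
	}

	u, _ := url.Parse("https://example.com/docs/intro.html")
	inScope := true
	for _, sc := range scopes {
		if !sc.Check(u, 1) {
			inScope = false
			break
		}
	}
	// Prints true: the scheme is allowed, depth 1 < 2, and the seed
	// prefix "example.com/docs" matches despite the different scheme
	// and the missing "www.".
	fmt.Println(inScope)
}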