scope.go 3.76 KB
Newer Older
ale's avatar
ale committed
1 2 3 4 5
package crawl

import (
	"fmt"
	"net/url"
ale's avatar
ale committed
6
	"regexp"
ale's avatar
ale committed
7 8 9
	"strings"
)

ale's avatar
ale committed
10
// Scope defines the crawling scope.
ale's avatar
ale committed
11
type Scope interface {
ale's avatar
ale committed
12
	// Check a URL to see if it's in scope for crawling.
13
	Check(Outlink, int) bool
ale's avatar
ale committed
14 15 16 17 18 19
}

type maxDepthScope struct {
	maxDepth int
}

20
func (s *maxDepthScope) Check(_ Outlink, depth int) bool {
ale's avatar
ale committed
21 22 23 24 25 26 27 28 29 30 31 32 33
	return depth < s.maxDepth
}

// NewDepthScope builds a Scope that restricts the crawl to links
// whose depth, relative to the crawl seeds, is below maxDepth.
func NewDepthScope(maxDepth int) Scope {
	return &maxDepthScope{maxDepth: maxDepth}
}

type schemeScope struct {
	allowedSchemes map[string]struct{}
}

34 35
func (s *schemeScope) Check(link Outlink, depth int) bool {
	_, ok := s.allowedSchemes[link.URL.Scheme]
ale's avatar
ale committed
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
	return ok
}

// NewSchemeScope returns a Scope that only accepts URLs whose scheme
// is listed in schemes.
func NewSchemeScope(schemes []string) Scope {
	allowed := make(map[string]struct{})
	for i := range schemes {
		allowed[schemes[i]] = struct{}{}
	}
	return &schemeScope{allowedSchemes: allowed}
}

// A URLPrefixMap makes it easy to check for URL prefixes (even for
// very large lists). Keys have the form "host/seg1/seg2": the URL
// scheme is ignored, the host is lowercased and stripped of an
// eventual "www." prefix, and empty path segments (leading, trailing
// or repeated slashes) are dropped.
type URLPrefixMap map[string]struct{}

// normalizeURLPrefix returns the map key for uri. It applies the same
// host and path-segment normalization that Contains uses for lookups,
// so that every prefix stored by Add can be found again.
func normalizeURLPrefix(uri *url.URL) string {
	// Host names are case-insensitive, so keys always use lowercase.
	key := strings.TrimPrefix(strings.ToLower(uri.Host), "www.")
	for _, seg := range strings.Split(uri.Path, "/") {
		if seg == "" {
			// Skip the empty pieces produced by leading, trailing
			// and repeated slashes, exactly as Contains does.
			continue
		}
		key += "/" + seg
	}
	return key
}

// Add inserts the normalized form of uri into the prefix map.
func (m URLPrefixMap) Add(uri *url.URL) {
	m[normalizeURLPrefix(uri)] = struct{}{}
}

// Contains returns true if the given URL matches the prefix map, that
// is if its normalized host, or the host followed by any leading run
// of its path segments, is a key of the map. Matching is done on
// whole segments: the prefix "host/docs" matches "/docs/api" but not
// "/docsx".
func (m URLPrefixMap) Contains(uri *url.URL) bool {
	// Same host normalization as normalizeURLPrefix.
	s := strings.TrimPrefix(strings.ToLower(uri.Host), "www.")
	if _, ok := m[s]; ok {
		return true
	}
	// Grow the candidate key one path segment at a time, from the
	// shortest prefix to the full path.
	for _, p := range strings.Split(uri.Path, "/") {
		if p == "" {
			continue
		}
		s = fmt.Sprintf("%s/%s", s, p)
		if _, ok := m[s]; ok {
			return true
		}
	}
	return false
}

type urlPrefixScope struct {
	prefixes URLPrefixMap
}

84 85
func (s *urlPrefixScope) Check(link Outlink, depth int) bool {
	return s.prefixes.Contains(link.URL)
ale's avatar
ale committed
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
}

// NewURLPrefixScope builds a Scope that only accepts URLs matching
// one of the given prefixes.
func NewURLPrefixScope(prefixes URLPrefixMap) Scope {
	return &urlPrefixScope{prefixes: prefixes}
}

// NewSeedScope builds a Scope that only accepts URLs matching one of
// the seed URLs, each of which is used as an allowed prefix.
func NewSeedScope(seeds []*url.URL) Scope {
	allowed := URLPrefixMap{}
	for i := range seeds {
		allowed.Add(seeds[i])
	}
	return NewURLPrefixScope(allowed)
}
ale's avatar
ale committed
103 104 105 106 107

type regexpIgnoreScope struct {
	ignores []*regexp.Regexp
}

108 109
func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
	uriStr := link.URL.String()
ale's avatar
ale committed
110 111 112 113 114 115 116 117
	for _, i := range s.ignores {
		if i.MatchString(uriStr) {
			return false
		}
	}
	return true
}

ale's avatar
ale committed
118 119
// NewRegexpIgnoreScope returns a Scope that filters out URLs
// according to a list of regular expressions.
ale's avatar
ale committed
120 121 122 123 124 125 126 127 128 129 130 131
func NewRegexpIgnoreScope(ignores []string) Scope {
	if ignores == nil {
		ignores = defaultIgnorePatterns
	}
	r := regexpIgnoreScope{
		ignores: make([]*regexp.Regexp, 0, len(ignores)),
	}
	for _, i := range ignores {
		r.ignores = append(r.ignores, regexp.MustCompile(i))
	}
	return &r
}
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178

// NewIncludeRelatedScope returns a Scope that accepts the links
// tagged with TagRelated.
func NewIncludeRelatedScope() Scope {
	return new(includeRelatedScope)
}

// includeRelatedScope is a stateless Scope keyed on the link tag.
type includeRelatedScope struct{}

// Check accepts a link only when its tag is TagRelated. The depth is
// not taken into account.
func (*includeRelatedScope) Check(link Outlink, _ int) bool {
	return link.Tag == TagRelated
}

// AND combines the given scopes into one that accepts a link only
// when all of them do.
func AND(elems ...Scope) Scope {
	return &andScope{elems}
}

// andScope is the conjunction of a list of scopes.
type andScope struct {
	elems []Scope
}

// Check queries the scopes in order and rejects the link as soon as
// one of them does. With no scopes at all, every link is accepted.
func (s *andScope) Check(link Outlink, depth int) bool {
	for i := range s.elems {
		if accepted := s.elems[i].Check(link, depth); !accepted {
			return false
		}
	}
	return true
}

// OR combines the given scopes into one that accepts a link when at
// least one of them does.
func OR(elems ...Scope) Scope {
	return &orScope{elems}
}

// orScope is the disjunction of a list of scopes.
type orScope struct {
	elems []Scope
}

// Check queries the scopes in order and accepts the link as soon as
// one of them does. With no scopes at all, every link is rejected.
func (s *orScope) Check(link Outlink, depth int) bool {
	for i := range s.elems {
		if accepted := s.elems[i].Check(link, depth); accepted {
			return true
		}
	}
	return false
}