Skip to content
Snippets Groups Projects
Commit 66ce654d authored by ale's avatar ale
Browse files

Add --exclude and --exclude-from-file options

Allow users to add to the exclude regexp lists easily.
parent a5d20a9a
No related branches found
No related tags found
No related merge requests found
......@@ -3,6 +3,7 @@
package main
import (
"bufio"
"bytes"
"flag"
"fmt"
......@@ -12,6 +13,7 @@ import (
"net/http"
"os"
"os/signal"
"regexp"
"runtime/pprof"
"strconv"
"strings"
......@@ -33,10 +35,52 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
excludes []*regexp.Regexp
)
// init registers the -exclude and -exclude-from-file command-line flags.
// Both are custom flag.Value implementations, so each occurrence of a flag
// on the command line results in a call to the corresponding Set method.
func init() {
	flag.Var(new(excludesFlag), "exclude", "exclude regex URL patterns")
	flag.Var(new(excludesFileFlag), "exclude-from-file", "load exclude regex URL patterns from a file")
}
// excludesFlag is a stateless flag.Value for the -exclude option: every
// accepted value is compiled and appended to the package-level excludes
// slice rather than being stored on the flag itself.
type excludesFlag struct{}

// String implements flag.Value. It always returns the empty string.
func (*excludesFlag) String() string {
	return ""
}

// Set implements flag.Value. It compiles pattern as a regular expression
// and, on success, appends it to excludes. A compilation error is returned
// unchanged and leaves excludes untouched.
func (*excludesFlag) Set(pattern string) error {
	compiled, err := regexp.Compile(pattern)
	if err == nil {
		excludes = append(excludes, compiled)
	}
	return err
}
// excludesFileFlag is a stateless flag.Value for the -exclude-from-file
// option: patterns read from the named file are compiled and appended to the
// package-level excludes slice.
type excludesFileFlag struct{}

// String implements flag.Value. It always returns the empty string.
func (f *excludesFileFlag) String() string { return "" }

// Set implements flag.Value. It opens the file named s and compiles every
// non-empty line as a regular expression, appending each one to excludes in
// file order.
//
// Empty lines are skipped: the empty regular expression matches every input,
// so a stray blank line would otherwise turn into a match-everything pattern.
// Skipped lines still count towards the line numbers used in error messages.
//
// It returns the error from os.Open if the file cannot be opened, an error
// naming the file and line number if a line is not a valid regular
// expression, and an error naming the file if reading fails (including a
// line longer than bufio.Scanner's token limit). Patterns compiled before an
// error occurred remain in excludes.
func (f *excludesFileFlag) Set(s string) error {
	ff, err := os.Open(s)
	if err != nil {
		return err
	}
	defer ff.Close() // nolint

	var lineNum int
	scanner := bufio.NewScanner(ff)
	for scanner.Scan() {
		lineNum++
		line := scanner.Text()
		if line == "" {
			continue
		}
		rx, err := regexp.Compile(line)
		if err != nil {
			return fmt.Errorf("%s, line %d: %v", s, lineNum, err)
		}
		excludes = append(excludes, rx)
	}
	// Scan returns false both at EOF and on failure; only Err tells them
	// apart, so without this check a partial read would look like success.
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("%s: %v", s, err)
	}
	return nil
}
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
links, err := analysis.GetLinks(resp)
if err != nil {
......@@ -221,7 +265,7 @@ func main() {
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
crawl.NewRegexpIgnoreScope(excludes),
)
if !*excludeRelated {
scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
......
......@@ -115,19 +115,21 @@ func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
return true
}
// compileDefaultIgnorePatterns compiles every entry of defaultIgnorePatterns,
// in order, and returns the results in a newly allocated slice. It panics
// (through regexp.MustCompile) if a default pattern is not a valid regular
// expression.
func compileDefaultIgnorePatterns() []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, len(defaultIgnorePatterns))
	for idx, pattern := range defaultIgnorePatterns {
		compiled[idx] = regexp.MustCompile(pattern)
	}
	return compiled
}
// NewRegexpIgnoreScope returns a Scope that filters out URLs
// according to a list of regular expressions.
func NewRegexpIgnoreScope(ignores []string) Scope {
if ignores == nil {
ignores = defaultIgnorePatterns
}
r := regexpIgnoreScope{
ignores: make([]*regexp.Regexp, 0, len(ignores)),
}
for _, i := range ignores {
r.ignores = append(r.ignores, regexp.MustCompile(i))
func NewRegexpIgnoreScope(ignores []*regexp.Regexp) Scope {
ignores = append(compileDefaultIgnorePatterns(), ignores...)
return &regexpIgnoreScope{
ignores: ignores,
}
return &r
}
// NewIncludeRelatedScope always includes resources with TagRelated.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment