Commit 66ce654d authored by ale

Add --exclude and --exclude-from-file options

Allow users to add to the exclude regexp lists easily.
parent a5d20a9a
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
package main package main
import ( import (
"bufio"
"bytes" "bytes"
"flag" "flag"
"fmt" "fmt"
...@@ -12,6 +13,7 @@ import ( ...@@ -12,6 +13,7 @@ import (
"net/http" "net/http"
"os" "os"
"os/signal" "os/signal"
"regexp"
"runtime/pprof" "runtime/pprof"
"strconv" "strconv"
"strings" "strings"
...@@ -33,10 +35,52 @@ var ( ...@@ -33,10 +35,52 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope") excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile") excludes []*regexp.Regexp
) )
// init registers the URL exclusion flags on the default command-line
// flag set. The flag values carry no state of their own; their Set
// methods append compiled patterns to the package-level excludes slice.
func init() {
	var (
		patternFlag excludesFlag
		fileFlag    excludesFileFlag
	)
	flag.Var(&patternFlag, "exclude", "exclude regex URL patterns")
	flag.Var(&fileFlag, "exclude-from-file", "load exclude regex URL patterns from a file")
}
// excludesFlag is a flag.Value whose Set method compiles its argument
// as a regular expression and adds it to the excludes list.
type excludesFlag struct{}

// String returns the empty string; the flag has no textual default.
func (f *excludesFlag) String() string {
	return ""
}

// Set compiles s as a regular expression. On success the compiled
// pattern is appended to excludes and nil is returned; otherwise the
// compilation error is returned and excludes is left unchanged.
func (f *excludesFlag) Set(s string) error {
	pattern, compileErr := regexp.Compile(s)
	if compileErr == nil {
		excludes = append(excludes, pattern)
	}
	return compileErr
}
// excludesFileFlag is a flag.Value whose Set method loads exclusion
// patterns from a file, one regular expression per line.
type excludesFileFlag struct{}

// String returns the empty string; the flag has no textual default.
func (f *excludesFileFlag) String() string { return "" }

// Set opens the file named s and appends every pattern it contains to
// excludes. Blank (whitespace-only) lines are skipped. It returns an
// error if the file cannot be opened, if a line is not a valid regular
// expression (the message carries the file name and 1-based line
// number), or if reading the file fails part-way through.
func (f *excludesFileFlag) Set(s string) error {
	ff, err := os.Open(s)
	if err != nil {
		return err
	}
	defer ff.Close() // nolint

	var lineNum int
	scanner := bufio.NewScanner(ff)
	for scanner.Scan() {
		lineNum++
		line := scanner.Text()
		// The empty regular expression matches any input, so a stray
		// blank line would presumably exclude every URL; skip it.
		if strings.TrimSpace(line) == "" {
			continue
		}
		rx, err := regexp.Compile(line)
		if err != nil {
			return fmt.Errorf("%s, line %d: %v", s, lineNum, err)
		}
		excludes = append(excludes, rx)
	}
	// Scan returns false both at EOF and on failure (I/O error, or a
	// line longer than the scanner buffer); without this check a
	// truncated pattern list would be accepted silently.
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("%s: %v", s, err)
	}
	return nil
}
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error { func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
links, err := analysis.GetLinks(resp) links, err := analysis.GetLinks(resp)
if err != nil { if err != nil {
...@@ -221,7 +265,7 @@ func main() { ...@@ -221,7 +265,7 @@ func main() {
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")), crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth), crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds), crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil), crawl.NewRegexpIgnoreScope(excludes),
) )
if !*excludeRelated { if !*excludeRelated {
scope = crawl.OR(scope, crawl.NewIncludeRelatedScope()) scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
......
...@@ -115,19 +115,21 @@ func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool { ...@@ -115,19 +115,21 @@ func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
return true return true
} }
// compileDefaultIgnorePatterns compiles each entry of
// defaultIgnorePatterns, preserving order, and returns the resulting
// slice. It panics (via regexp.MustCompile) if any entry is not a
// valid regular expression.
func compileDefaultIgnorePatterns() []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, len(defaultIgnorePatterns))
	for i := range defaultIgnorePatterns {
		compiled[i] = regexp.MustCompile(defaultIgnorePatterns[i])
	}
	return compiled
}
// NewRegexpIgnoreScope returns a Scope that filters out URLs // NewRegexpIgnoreScope returns a Scope that filters out URLs
// according to a list of regular expressions. // according to a list of regular expressions.
func NewRegexpIgnoreScope(ignores []string) Scope { func NewRegexpIgnoreScope(ignores []*regexp.Regexp) Scope {
if ignores == nil { ignores = append(compileDefaultIgnorePatterns(), ignores...)
ignores = defaultIgnorePatterns return &regexpIgnoreScope{
} ignores: ignores,
r := regexpIgnoreScope{
ignores: make([]*regexp.Regexp, 0, len(ignores)),
}
for _, i := range ignores {
r.ignores = append(r.ignores, regexp.MustCompile(i))
} }
return &r
} }
// NewIncludeRelatedScope always includes resources with TagRelated. // NewIncludeRelatedScope always includes resources with TagRelated.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment