From 66ce654d5be9c26ba69cc75ac12ff6662410c69d Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 2 Sep 2018 11:16:49 +0100
Subject: [PATCH] Add --exclude and --exclude-from-file options

Allow users to easily add their own patterns to the exclude regexp list,
either directly on the command line or from a file.
---
 cmd/crawl/crawl.go | 48 ++++++++++++++++++++++++++++++++++++++++++++--
 scope.go           | 22 +++++++++++----------
 2 files changed, 58 insertions(+), 12 deletions(-)

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index bbbd65b..d0ff268 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -3,6 +3,7 @@
 package main
 
 import (
+	"bufio"
 	"bytes"
 	"flag"
 	"fmt"
@@ -12,6 +13,7 @@ import (
 	"net/http"
 	"os"
 	"os/signal"
+	"regexp"
 	"runtime/pprof"
 	"strconv"
 	"strings"
@@ -33,10 +35,52 @@ var (
 	validSchemes   = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
 	outputFile     = flag.String("output", "crawl.warc.gz", "output WARC file")
+	cpuprofile     = flag.String("cpuprofile", "", "create cpu profile")
 
-	cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
+	excludes []*regexp.Regexp
 )
 
+func init() {
+	flag.Var(&excludesFlag{}, "exclude", "exclude regex URL patterns")
+	flag.Var(&excludesFileFlag{}, "exclude-from-file", "load exclude regex URL patterns from a file")
+}
+
+type excludesFlag struct{}
+
+func (f *excludesFlag) String() string { return "" }
+
+func (f *excludesFlag) Set(s string) error {
+	rx, err := regexp.Compile(s)
+	if err != nil {
+		return err
+	}
+	excludes = append(excludes, rx)
+	return nil
+}
+
+type excludesFileFlag struct{}
+
+func (f *excludesFileFlag) String() string { return "" }
+
+func (f *excludesFileFlag) Set(s string) error {
+	ff, err := os.Open(s)
+	if err != nil {
+		return err
+	}
+	defer ff.Close() // nolint
+	var lineNum int
+	scanner := bufio.NewScanner(ff)
+	for scanner.Scan() {
+		lineNum++
+		rx, err := regexp.Compile(scanner.Text())
+		if err != nil {
+			return fmt.Errorf("%s, line %d: %v", s, lineNum, err)
+		}
+		excludes = append(excludes, rx)
+	}
+	return nil
+}
+
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
@@ -221,7 +265,7 @@ func main() {
 		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
 		crawl.NewDepthScope(*depth),
 		crawl.NewSeedScope(seeds),
-		crawl.NewRegexpIgnoreScope(nil),
+		crawl.NewRegexpIgnoreScope(excludes),
 	)
 	if !*excludeRelated {
 		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
diff --git a/scope.go b/scope.go
index b2e90ea..bda1035 100644
--- a/scope.go
+++ b/scope.go
@@ -115,19 +115,21 @@ func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
 	return true
 }
 
+func compileDefaultIgnorePatterns() []*regexp.Regexp {
+	out := make([]*regexp.Regexp, 0, len(defaultIgnorePatterns))
+	for _, p := range defaultIgnorePatterns {
+		out = append(out, regexp.MustCompile(p))
+	}
+	return out
+}
+
 // NewRegexpIgnoreScope returns a Scope that filters out URLs
 // according to a list of regular expressions.
-func NewRegexpIgnoreScope(ignores []string) Scope {
-	if ignores == nil {
-		ignores = defaultIgnorePatterns
-	}
-	r := regexpIgnoreScope{
-		ignores: make([]*regexp.Regexp, 0, len(ignores)),
-	}
-	for _, i := range ignores {
-		r.ignores = append(r.ignores, regexp.MustCompile(i))
+func NewRegexpIgnoreScope(ignores []*regexp.Regexp) Scope {
+	ignores = append(compileDefaultIgnorePatterns(), ignores...)
+	return &regexpIgnoreScope{
+		ignores: ignores,
 	}
-	return &r
 }
 
 // NewIncludeRelatedScope always includes resources with TagRelated.
-- 
GitLab