diff --git a/README.md b/README.md index 34360fada7fd457b074b752f89bb566d9ddc2b94..39b0cea026e0424d08fb19364d904666bd07672b 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,8 @@ The crawling scope is controlled with a set of overlapping checks: prefix is implicitly ignored) * maximum crawling depth can be controlled with the *--depth* option * resources related to a page (CSS, JS, etc) will always be fetched, - even if on external domains, if the *--include-related* option is - specified + even if on external domains, unless the *--exclude-related* option + is specified If the program is interrupted, running it again with the same command line from the same directory will cause it to resume crawling from diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go index e7e8582661cb3b855bdc5fd88aa7d09f66828f08..0e5fc15b68406a45af9a13f8c86bd9c6f7aad572 100644 --- a/cmd/crawl/crawl.go +++ b/cmd/crawl/crawl.go @@ -26,13 +26,13 @@ import ( ) var ( - dbPath = flag.String("state", "crawldb", "crawl state database path") - keepDb = flag.Bool("keep", false, "keep the state database when done") - concurrency = flag.Int("c", 10, "concurrent workers") - depth = flag.Int("depth", 10, "maximum link depth") - validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") - alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)") - outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") + dbPath = flag.String("state", "crawldb", "crawl state database path") + keepDb = flag.Bool("keep", false, "keep the state database when done") + concurrency = flag.Int("c", 10, "concurrent workers") + depth = flag.Int("depth", 100, "maximum link depth") + validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") + excludeRelated = flag.Bool("exclude-related", false, "fetch related resources (css, images, etc) only if their URL is in scope") + outputFile = 
flag.String("output", "crawl.warc.gz", "output WARC file") cpuprofile = flag.String("cpuprofile", "", "create cpu profile") ) @@ -213,7 +213,7 @@ func main() { crawl.NewSeedScope(seeds), crawl.NewRegexpIgnoreScope(nil), ) - if *alwaysIncludeRelated { + if !*excludeRelated { scope = crawl.OR(scope, crawl.NewIncludeRelatedScope()) }