From df800e154f3265f43a3758ac5071caba026ae585 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Tue, 19 Dec 2017 10:16:08 +0000
Subject: [PATCH] Provide better defaults for command-line options

Use defaults that are more suitable for real-world site archiving:
the maximum link depth is raised from 10 to 100, and related
resources (CSS, JS, images) are now fetched by default even when
off-site, replacing the opt-in --include-related flag with an
opt-out --exclude-related flag.
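
The previous behavior can still be requested by passing the new
flags explicitly (a sketch, assuming the usual seed-URL invocation
of cmd/crawl; example.com stands in for a real site):

    crawl --depth=10 --exclude-related http://example.com/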
---
 README.md          |  4 ++--
 cmd/crawl/crawl.go | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 34360fa..39b0cea 100644
--- a/README.md
+++ b/README.md
@@ -34,8 +34,8 @@ The crawling scope is controlled with a set of overlapping checks:
   prefix is implicitly ignored)
 * maximum crawling depth can be controlled with the *--depth* option
 * resources related to a page (CSS, JS, etc) will always be fetched,
-  even if on external domains, if the *--include-related* option is
-  specified
+  even if on external domains, unless the *--exclude-related* option
+  is specified
 
 If the program is interrupted, running it again with the same command
 line from the same directory will cause it to resume crawling from
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index e7e8582..0e5fc15 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -26,13 +26,13 @@ import (
 )
 
 var (
-	dbPath               = flag.String("state", "crawldb", "crawl state database path")
-	keepDb               = flag.Bool("keep", false, "keep the state database when done")
-	concurrency          = flag.Int("c", 10, "concurrent workers")
-	depth                = flag.Int("depth", 10, "maximum link depth")
-	validSchemes         = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
-	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
-	outputFile           = flag.String("output", "crawl.warc.gz", "output WARC file")
+	dbPath         = flag.String("state", "crawldb", "crawl state database path")
+	keepDb         = flag.Bool("keep", false, "keep the state database when done")
+	concurrency    = flag.Int("c", 10, "concurrent workers")
+	depth          = flag.Int("depth", 100, "maximum link depth")
+	validSchemes   = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
+	outputFile     = flag.String("output", "crawl.warc.gz", "output WARC file")
 
 	cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
 )
@@ -213,7 +213,7 @@ func main() {
 		crawl.NewSeedScope(seeds),
 		crawl.NewRegexpIgnoreScope(nil),
 	)
-	if *alwaysIncludeRelated {
+	if !*excludeRelated {
 		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
 	}
 
-- 
GitLab