Commit df800e15 authored by ale

Provide better defaults for command-line options

Defaults that are more suitable for real-world site archiving.
parent b06b0cd4
@@ -34,8 +34,8 @@ The crawling scope is controlled with a set of overlapping checks:
   prefix is implicitly ignored)
 * maximum crawling depth can be controlled with the *--depth* option
 * resources related to a page (CSS, JS, etc) will always be fetched,
-  even if on external domains, if the *--include-related* option is
-  specified
+  even if on external domains, unless the *--exclude-related* option
+  is specified
 
 If the program is interrupted, running it again with the same command
 line from the same directory will cause it to resume crawling from
...
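For reference, a minimal sketch of the new flag semantics described above, using only the standard library flag package. The flag names and defaults mirror the diff below; the printed summary is purely illustrative.

package main

import (
	"flag"
	"fmt"
)

var (
	// Defaults as changed by this commit: deeper crawls by default,
	// and related resources included unless explicitly excluded.
	depth          = flag.Int("depth", 100, "maximum link depth")
	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
)

func main() {
	flag.Parse()

	// Related resources stay in scope unless --exclude-related is passed,
	// the inverse of the old opt-in --include-related flag.
	includeRelated := !*excludeRelated
	fmt.Printf("depth=%d include-related=%v\n", *depth, includeRelated)
}

Run without arguments this prints depth=100 include-related=true; passing --exclude-related flips the second value, matching the new include-by-default behavior.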
@@ -26,13 +26,13 @@ import (
 )
 
 var (
 	dbPath = flag.String("state", "crawldb", "crawl state database path")
 	keepDb = flag.Bool("keep", false, "keep the state database when done")
 	concurrency = flag.Int("c", 10, "concurrent workers")
-	depth = flag.Int("depth", 10, "maximum link depth")
+	depth = flag.Int("depth", 100, "maximum link depth")
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
-	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
+	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
 	outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
 	cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
 )
@@ -213,7 +213,7 @@ func main() {
 		crawl.NewSeedScope(seeds),
 		crawl.NewRegexpIgnoreScope(nil),
 	)
-	if *alwaysIncludeRelated {
+	if !*excludeRelated {
 		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
 	}
...