Commit df800e15 authored by ale

Provide better defaults for command-line options

Defaults that are more suitable to real-world site archiving.
parent b06b0cd4
Pipeline #733 passed with stage
in 14 seconds
......@@ -34,8 +34,8 @@ The crawling scope is controlled with a set of overlapping checks:
prefix is implicitly ignored)
* maximum crawling depth can be controlled with the *--depth* option
* resources related to a page (CSS, JS, etc) will always be fetched,
-even if on external domains, if the *--include-related* option is
-specified
+even if on external domains, unless the *--exclude-related* option
+is specified
If the program is interrupted, running it again with the same command
line from the same directory will cause it to resume crawling from
......
......@@ -26,13 +26,13 @@ import (
)
var (
dbPath = flag.String("state", "crawldb", "crawl state database path")
keepDb = flag.Bool("keep", false, "keep the state database when done")
concurrency = flag.Int("c", 10, "concurrent workers")
depth = flag.Int("depth", 10, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
dbPath = flag.String("state", "crawldb", "crawl state database path")
keepDb = flag.Bool("keep", false, "keep the state database when done")
concurrency = flag.Int("c", 10, "concurrent workers")
depth = flag.Int("depth", 100, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
)
......@@ -213,7 +213,7 @@ func main() {
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
)
if *alwaysIncludeRelated {
if !*excludeRelated {
scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment