diff --git a/README.md b/README.md
index b4d28e5753765b64b9003548545717ba85f7c50f..60daa6d1836b59b5bbe1ef247e26b2ad73c8e059 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,13 @@ created, e.g.:
 
     $ crawl --output=out-%s.warc.gz http://example.com/
 
+The crawler will rate-limit its requests to avoid overloading the
+target servers. You can select the desired rate of requests per second
+with the *--qps* option. It is a floating point number, so you can use
+values < 1 to space requests further apart than one second. Note that
+rate limiting is currently applied separately for each hostname,
+*before* DNS resolution.
+
 ## Limitations
 
 Like most crawlers, this one has a number of limitations:
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 2ebba9847754615c55ffb06e5bb113eb557db99e..1a32705e1b2041bcb41e24a16bb435120cd6880d 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -22,6 +22,7 @@ import (
 	"syscall"
 	"time"
 
+	"git.autistici.org/ai3/jobqueue/queue"
 	"git.autistici.org/ale/crawl"
 	"git.autistici.org/ale/crawl/analysis"
 	"git.autistici.org/ale/crawl/warc"
@@ -36,6 +37,7 @@ var (
 	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
 	outputFile     = flag.String("output", "crawl.warc.gz", "output WARC file or pattern (patterns must include a \"%s\" literal token)")
 	warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
+	hostQPS        = flag.Float64("qps", 3, "per-hostname qps limit")
 	cpuprofile     = flag.String("cpuprofile", "", "create cpu profile")
 
 	excludes []*regexp.Regexp
@@ -296,12 +298,20 @@ func main() {
 		log.Fatal(err)
 	}
 
+	var rl queue.RatelimiterFunc
+	if *hostQPS > 0 {
+		rl = func(_ []byte) queue.Ratelimiter {
+			return queue.NewSimpleRatelimiter(*hostQPS)
+		}
+	}
+
 	crawler, err := crawl.NewCrawler(
 		*dbPath,
 		seeds,
 		scope,
 		crawl.FetcherFunc(fetch),
 		crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+		rl,
 	)
 	if err != nil {
 		log.Fatal(err)
diff --git a/crawler.go b/crawler.go
index c7d1f6d909af99ad03033eafa8c60223a457190d..389e8098602df674cdb2fcc29d836f6d860d9e40 100644
--- a/crawler.go
+++ b/crawler.go
@@ -288,7 +288,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler, rl queue.RatelimiterFunc) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -296,7 +296,13 @@
 	}
 
 	// Create the queue.
-	q, err := queue.NewQueue(db.DB, queue.WithRetryInterval(ErrorRetryDelay))
+	opts := []queue.Option{
+		queue.WithRetryInterval(ErrorRetryDelay),
+	}
+	if rl != nil {
+		opts = append(opts, queue.WithRatelimiter(rl))
+	}
+	q, err := queue.NewQueue(db.DB, opts...)
 	if err != nil {
 		return nil, err
 	}
diff --git a/crawler_test.go b/crawler_test.go
index 7b5c92c4d8f10f510608ba3e595d36f80cde1101..35c4ce7782b735cf7e87b59c323541151d6af732 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
 		return nil
 	})
 
-	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
+	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))), nil)
 	if err != nil {
 		t.Fatal("NewCrawler", err)
 	}
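
Not part of the patch itself: the sketch below shows how a library caller might adapt to the new NewCrawler signature, written as if it lived in the crawl package alongside crawler_test.go. It uses only identifiers that appear in the diff above (queue.RatelimiterFunc, queue.Ratelimiter, queue.NewSimpleRatelimiter, MustParseURLs, FetcherFunc, HandleRetries, FilterErrors, FollowRedirects); the helper name, seed URL and QPS value are invented for illustration.

```go
// Sketch only: this file name, helper name, seed URL and QPS value are
// made up for illustration; everything else comes from the diff above.
package crawl

import (
	"net/http"

	"git.autistici.org/ai3/jobqueue/queue"
)

// newRateLimitedCrawler builds a Crawler whose queue is limited to at most
// qps requests per second, using the new NewCrawler signature.
func newRateLimitedCrawler(dir string, scope Scope, h Handler, qps float64) (*Crawler, error) {
	seeds := MustParseURLs([]string{"http://example.com/"})

	var rl queue.RatelimiterFunc
	if qps > 0 {
		// e.g. qps = 0.5 spaces requests more than one second apart,
		// as the README change describes.
		rl = func(_ []byte) queue.Ratelimiter {
			return queue.NewSimpleRatelimiter(qps)
		}
	}

	// Passing nil instead of rl disables rate limiting entirely,
	// which is what the updated crawler_test.go does.
	return NewCrawler(dir+"/crawl.db", seeds, scope,
		FetcherFunc(http.Get),
		HandleRetries(FilterErrors(FollowRedirects(h))),
		rl)
}
```

The closure ignores its []byte key argument and simply returns a fresh SimpleRatelimiter each time it is invoked, which is how the command-line tool obtains the per-hostname behaviour described in the README change.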