From b6901eee6ea1d368cc61d5b042dcdbf22bf38d8d Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Mon, 31 Dec 2018 09:57:22 +0000
Subject: [PATCH] Enable proper rate limiting functionality

Use the --qps option to control per-hostname rate limits. The limit is
currently a single global value, applied independently to each hostname
before DNS resolution (due to the current structure of the crawler,
which performs DNS resolution in the worker), but it should definitely
be better than nothing.
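
For example, to make at most one request every two seconds to each
hostname (hypothetical invocation; --qps is the only flag this patch
adds):

    $ crawl --qps=0.5 http://example.com/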
---
 README.md          |  7 +++++++
 cmd/crawl/crawl.go | 10 ++++++++++
 crawler.go         | 10 ++++++++--
 crawler_test.go    |  2 +-
 4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b4d28e5..60daa6d 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,13 @@ created, e.g.:
 
     $ crawl --output=out-%s.warc.gz http://example.com/
 
+The crawler will rate-limit its requests to avoid overloading the
+target servers. You can select the desired rate of requests per second
+with the *--qps* option. It is a floating point number, so you can use
+values < 1 to space requests further apart than one second. Note that
+rate limiting is currently applied separately for each hostname,
+*before* DNS resolution.
+
 ## Limitations
 
 Like most crawlers, this one has a number of limitations:
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 2ebba98..1a32705 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -22,6 +22,7 @@ import (
 	"syscall"
 	"time"
 
+	"git.autistici.org/ai3/jobqueue/queue"
 	"git.autistici.org/ale/crawl"
 	"git.autistici.org/ale/crawl/analysis"
 	"git.autistici.org/ale/crawl/warc"
@@ -36,6 +37,7 @@ var (
 	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
 	outputFile     = flag.String("output", "crawl.warc.gz", "output WARC file or pattern (patterns must include a \"%s\" literal token)")
 	warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
+	hostQPS        = flag.Float64("qps", 3, "per-hostname qps limit")
 	cpuprofile     = flag.String("cpuprofile", "", "create cpu profile")
 
 	excludes []*regexp.Regexp
@@ -296,12 +298,20 @@ func main() {
 		log.Fatal(err)
 	}
+	var rl queue.RatelimiterFunc
+	if *hostQPS > 0 {
+		rl = func(_ []byte) queue.Ratelimiter {
+			return queue.NewSimpleRatelimiter(*hostQPS)
+		}
+	}
+
 	crawler, err := crawl.NewCrawler(
 		*dbPath,
 		seeds,
 		scope,
 		crawl.FetcherFunc(fetch),
 		crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+		rl,
 	)
 	if err != nil {
 		log.Fatal(err)
 	}
diff --git a/crawler.go b/crawler.go
index c7d1f6d..389e809 100644
--- a/crawler.go
+++ b/crawler.go
@@ -288,7 +288,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler, rl queue.RatelimiterFunc) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -296,7 +296,13 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
 	}
 
 	// Create the queue.
-	q, err := queue.NewQueue(db.DB, queue.WithRetryInterval(ErrorRetryDelay))
+	opts := []queue.Option{
+		queue.WithRetryInterval(ErrorRetryDelay),
+	}
+	if rl != nil {
+		opts = append(opts, queue.WithRatelimiter(rl))
+	}
+	q, err := queue.NewQueue(db.DB, opts...)
 	if err != nil {
 		return nil, err
 	}
diff --git a/crawler_test.go b/crawler_test.go
index 7b5c92c..35c4ce7 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
 		return nil
 	})
 
-	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
+	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))), nil)
 	if err != nil {
 		t.Fatal("NewCrawler", err)
 	}
--
GitLab
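
Note for callers of the library: NewCrawler now takes a
queue.RatelimiterFunc as an extra trailing argument; passing nil
disables rate limiting, as the updated test does. The sketch below
shows one way a caller could supply its own limiter. It assumes only
the queue API visible in this patch (RatelimiterFunc, Ratelimiter,
NewSimpleRatelimiter) and that the key handed to the function is the
hostname, as the commit message implies; the package name and the
"fragile.example.com" host are hypothetical.

    package limiter

    import "git.autistici.org/ai3/jobqueue/queue"

    // PerHostLimiter returns a queue.RatelimiterFunc that applies a
    // slower rate to one known-fragile host and the default rate to
    // every other hostname.
    func PerHostLimiter(defaultQPS float64) queue.RatelimiterFunc {
    	return func(host []byte) queue.Ratelimiter {
    		// Hypothetical slow host: one request every five seconds.
    		if string(host) == "fragile.example.com" {
    			return queue.NewSimpleRatelimiter(0.2)
    		}
    		return queue.NewSimpleRatelimiter(defaultQPS)
    	}
    }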