From b6901eee6ea1d368cc61d5b042dcdbf22bf38d8d Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Mon, 31 Dec 2018 09:57:22 +0000
Subject: [PATCH] Enable proper rate limiting functionality

Use the --qps option to control per-hostname rate limits. The limit is
currently a single global value, applied independently to each hostname
before DNS resolution (due to the current structure of the crawler,
which performs DNS resolution in the worker), but it should definitely
be better than nothing.
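
For example, to make at most one request every two seconds to each
hostname (hypothetical invocation; --qps is the only flag this patch
adds):

    $ crawl --qps=0.5 http://example.com/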
---
 README.md          |  7 +++++++
 cmd/crawl/crawl.go | 10 ++++++++++
 crawler.go         | 10 ++++++++--
 crawler_test.go    |  2 +-
 4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b4d28e5..60daa6d 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,13 @@ created, e.g.:
 
     $ crawl --output=out-%s.warc.gz http://example.com/
 
+The crawler will rate-limit its requests to avoid overloading the
+target servers. You can select the desired rate of requests per second
+with the *--qps* option. It is a floating point number, so you can use
+values < 1 to space requests further apart than one second. Note that
+rate limiting is currently applied separately for each hostname,
+*before* DNS resolution.
+
 ## Limitations
 
 Like most crawlers, this one has a number of limitations:
diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 2ebba98..1a32705 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -22,6 +22,7 @@ import (
 	"syscall"
 	"time"
 
+	"git.autistici.org/ai3/jobqueue/queue"
 	"git.autistici.org/ale/crawl"
 	"git.autistici.org/ale/crawl/analysis"
 	"git.autistici.org/ale/crawl/warc"
@@ -36,6 +37,7 @@ var (
 	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
 	outputFile     = flag.String("output", "crawl.warc.gz", "output WARC file or pattern (patterns must include a \"%s\" literal token)")
 	warcFileSizeMB = flag.Int("output-max-size", 100, "maximum output WARC file size (in MB) when using patterns")
+	hostQPS        = flag.Float64("qps", 3, "per-hostname qps limit")
 	cpuprofile     = flag.String("cpuprofile", "", "create cpu profile")
 
 	excludes []*regexp.Regexp
@@ -296,12 +298,20 @@ func main() {
 		log.Fatal(err)
 	}
+	var rl queue.RatelimiterFunc
+	if *hostQPS > 0 {
+		rl = func(_ []byte) queue.Ratelimiter {
+			return queue.NewSimpleRatelimiter(*hostQPS)
+		}
+	}
+
 	crawler, err := crawl.NewCrawler(
 		*dbPath,
 		seeds,
 		scope,
 		crawl.FetcherFunc(fetch),
 		crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
+		rl,
 	)
 	if err != nil {
 		log.Fatal(err)
 	}
diff --git a/crawler.go b/crawler.go
index c7d1f6d..389e809 100644
--- a/crawler.go
+++ b/crawler.go
@@ -288,7 +288,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler, rl queue.RatelimiterFunc) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -296,7 +296,13 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
 	}
 
 	// Create the queue.
-	q, err := queue.NewQueue(db.DB, queue.WithRetryInterval(ErrorRetryDelay))
+	opts := []queue.Option{
+		queue.WithRetryInterval(ErrorRetryDelay),
+	}
+	if rl != nil {
+		opts = append(opts, queue.WithRatelimiter(rl))
+	}
+	q, err := queue.NewQueue(db.DB, opts...)
 	if err != nil {
 		return nil, err
 	}
diff --git a/crawler_test.go b/crawler_test.go
index 7b5c92c..35c4ce7 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
 		return nil
 	})
 
-	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
+	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))), nil)
 	if err != nil {
 		t.Fatal("NewCrawler", err)
 	}
--
GitLab
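
Note for callers of the library: NewCrawler now takes a
queue.RatelimiterFunc as an extra trailing argument; passing nil
disables rate limiting, as the updated test does. The sketch below
shows one way a caller could supply its own limiter. It assumes only
the queue API visible in this patch (RatelimiterFunc, Ratelimiter,
NewSimpleRatelimiter) and that the key handed to the function is the
hostname, as the commit message implies; the package name and the
"fragile.example.com" host are hypothetical.

    package limiter

    import "git.autistici.org/ai3/jobqueue/queue"

    // PerHostLimiter returns a queue.RatelimiterFunc that applies a
    // slower rate to one known-fragile host and the default rate to
    // every other hostname.
    func PerHostLimiter(defaultQPS float64) queue.RatelimiterFunc {
    	return func(host []byte) queue.Ratelimiter {
    		// Hypothetical slow host: one request every five seconds.
    		if string(host) == "fragile.example.com" {
    			return queue.NewSimpleRatelimiter(0.2)
    		}
    		return queue.NewSimpleRatelimiter(defaultQPS)
    	}
    }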