Commit 2e1b00d8 authored by ale

Enable proper rate limiting functionality

Use the --qps option to control per-hostname rate limits. The limit
value is currently global, but it is applied independently to each
hostname, before DNS resolution (because the crawler currently
performs DNS resolution in the worker). It is a coarse mechanism, but
definitely better than nothing.
parent 4def7464
Pipeline #1989 failed in 17 seconds
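The hook introduced by the diff below, queue.RatelimiterFunc, receives the queue key (presumably the unresolved hostname, given the per-hostname behavior described above) and returns a queue.Ratelimiter, so a caller could in principle pick a different limit per host rather than the single global value used here. A minimal sketch under that assumption, reusing only the names visible in this diff; the package, helper function and hostname are hypothetical:

```go
package ratelimit // hypothetical package, for illustration only

import "git.autistici.org/ai3/jobqueue/queue"

// PerHostRatelimiter returns a queue.RatelimiterFunc that applies defaultQPS
// to every hostname but picks a slower limit for one (hypothetical) host.
func PerHostRatelimiter(defaultQPS float64) queue.RatelimiterFunc {
	return func(key []byte) queue.Ratelimiter {
		if string(key) == "slow.example.com" { // hypothetical hostname
			return queue.NewSimpleRatelimiter(0.5)
		}
		return queue.NewSimpleRatelimiter(defaultQPS)
	}
}
```

The change in this commit deliberately ignores the key and returns the same NewSimpleRatelimiter(*hostQPS) for every host.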
......@@ -56,6 +56,13 @@ avoid calendars, admin panels of common CMS applications, and other
well-known pitfalls. This list is sourced from the
[ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot) project.

The crawler will rate-limit its requests to avoid overloading the
target servers. You can select the desired rate of requests per second
with the *--qps* option. It takes a floating-point value, so values
below 1 space requests more than one second apart. Note that rate
limiting is currently applied separately to each hostname, *before*
DNS resolution.

## Limitations
Like most crawlers, this one has a number of limitations:
......
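To make the fractional values concrete: a QPS limit maps to a minimum interval of 1/qps seconds between consecutive requests to the same hostname. A small standalone sketch (not part of this change) illustrating the arithmetic:

```go
package main

import (
	"fmt"
	"time"
)

// qpsToInterval converts a requests-per-second limit into the minimum delay
// between two consecutive requests to the same hostname; values below 1
// therefore yield intervals longer than one second.
func qpsToInterval(qps float64) time.Duration {
	return time.Duration(float64(time.Second) / qps)
}

func main() {
	fmt.Println(qpsToInterval(3))   // ~333ms between requests (the default)
	fmt.Println(qpsToInterval(0.5)) // 2s between requests
}
```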
......@@ -22,6 +22,7 @@ import (
"syscall"
"time"
"git.autistici.org/ai3/jobqueue/queue"
"git.autistici.org/ale/crawl"
"git.autistici.org/ale/crawl/analysis"
"git.autistici.org/ale/crawl/warc"
......@@ -35,6 +36,7 @@ var (
	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
	excludeRelated = flag.Bool("exclude-related", false, "include related resources (css, images, etc) only if their URL is in scope")
	outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
	hostQPS = flag.Float64("qps", 3, "per-hostname qps limit")
	cpuprofile = flag.String("cpuprofile", "", "create cpu profile")

	excludes []*regexp.Regexp
......@@ -283,12 +285,20 @@ func main() {
		log.Fatal(err)
	}

	var rl queue.RatelimiterFunc
	if *hostQPS > 0 {
		rl = func(_ []byte) queue.Ratelimiter {
			return queue.NewSimpleRatelimiter(*hostQPS)
		}
	}

	crawler, err := crawl.NewCrawler(
		*dbPath,
		seeds,
		scope,
		crawl.FetcherFunc(fetch),
		crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver))),
		rl,
	)
	if err != nil {
		log.Fatal(err)
......
......@@ -288,7 +288,7 @@ func MustParseURLs(urls []string) []*url.URL {
}

// NewCrawler creates a new Crawler object with the specified behavior.
func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler, rl queue.RatelimiterFunc) (*Crawler, error) {
	// Open the crawl database.
	db, err := newGobDB(path)
	if err != nil {
......@@ -296,7 +296,13 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
	}

	// Create the queue.
	q, err := queue.NewQueue(db.DB, queue.WithRetryInterval(ErrorRetryDelay))
	opts := []queue.Option{
		queue.WithRetryInterval(ErrorRetryDelay),
	}
	if rl != nil {
		opts = append(opts, queue.WithRatelimiter(rl))
	}
	q, err := queue.NewQueue(db.DB, opts...)
	if err != nil {
		return nil, err
	}
......
......@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
		return nil
	})

	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))), nil)
	if err != nil {
		t.Fatal("NewCrawler", err)
	}
......