diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index d68ac5e0a34f64ca05dd23ca3298171941ef9b90..de45494df16400321901c666b347b44041b307ce 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -24,6 +24,7 @@ import (
 
 var (
 	dbPath       = flag.String("state", "crawldb", "crawl state database path")
+	keepDB       = flag.Bool("keep", false, "keep the state database when done")
 	concurrency  = flag.Int("c", 10, "concurrent workers")
 	depth        = flag.Int("depth", 10, "maximum link depth")
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
@@ -207,9 +208,19 @@ func main() {
 
 	saver := NewSaveHandler(w)
 
-	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
+	crawler, err := crawl.NewCrawler(*dbPath, seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
 	if err != nil {
 		log.Fatal(err)
 	}
 	crawler.Run(*concurrency)
+
+	// Close the state database before it is (optionally) removed from disk.
+	crawler.Close()
+	if !*keepDB {
+		// The crawl has already finished, so a failed cleanup is reported
+		// rather than treated as fatal.
+		if err := os.RemoveAll(*dbPath); err != nil {
+			log.Printf("could not remove state database %q: %v", *dbPath, err)
+		}
+	}
 }
diff --git a/crawler.go b/crawler.go
index c337d971a0d1cedbfc7a8964a9a29e092d9faac2..d162330660f5fd43448805499dede451360f81fe 100644
--- a/crawler.go
+++ b/crawler.go
@@ -319,6 +319,14 @@ func (c *Crawler) Run(concurrency int) {
 	wg.Wait()
 }
 
+// Close closes the database handle held by the crawler (c.db).
+//
+// NOTE(review): any value returned by c.db.Close is discarded here; confirm
+// whether it reports an error that callers should see.
+func (c *Crawler) Close() {
+	c.db.Close()
+}
+
 type redirectHandler struct {
 	h Handler
 }