From 64eb5fb23f64f209e3d813e017097044a111151f Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 20 Jan 2019 08:15:22 +0000
Subject: [PATCH] Refactor Handlers in terms of a Publisher interface

Introduce an interface to decouple the Enqueue functionality from the
Crawler implementation.
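
Handlers now depend only on the ability to enqueue new outlinks rather
than on the concrete Crawler type, which makes them easier to wrap,
compose and exercise in tests.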
---
 cmd/crawl/crawl.go |  8 ++++----
 cmd/links/links.go |  4 ++--
 crawler.go         | 28 +++++++++++++++++-----------
 crawler_test.go    |  4 ++--
 4 files changed, 25 insertions(+), 19 deletions(-)
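
Note (not part of the applied change): with the Publisher interface in
place, a handler can be exercised against a small stub instead of a
full Crawler. The sketch below is illustrative only; stubPublisher is a
hypothetical test double, assumed to live in the crawl package next to
the types touched by this patch.

    // stubPublisher records enqueued outlinks; it satisfies the new
    // Publisher interface and can stand in for the Crawler in handler
    // unit tests.
    type stubPublisher struct {
        outlinks []Outlink
    }

    func (s *stubPublisher) Enqueue(o Outlink, depth int) error {
        s.outlinks = append(s.outlinks, o)
        return nil
    }

    // Any Handler (or HandlerFunc) can then be driven directly,
    // where h is a Handler and resp a previously fetched response:
    //
    //     var p stubPublisher
    //     err := h.Handle(&p, u, depth, resp, nil)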

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 2ebba98..54bb505 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -82,7 +82,7 @@ func (f *excludesFileFlag) Set(s string) error {
 	return nil
 }
 
-func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
+func extractLinks(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
 		// This is not a fatal error, just a bad web page.
@@ -90,7 +90,7 @@ func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _
 	}
 
 	for _, link := range links {
-		if err := c.Enqueue(link, depth+1); err != nil {
+		if err := p.Enqueue(link, depth+1); err != nil {
 			return err
 		}
 	}
@@ -127,7 +127,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
 	return w.Close()
 }
 
-func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
+func (h *warcSaveHandler) Handle(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error {
 	// Read the response body (so we can save it to the WARC
 	// output) and replace it with a buffer.
 	data, derr := ioutil.ReadAll(resp.Body)
@@ -157,7 +157,7 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 
 	h.numWritten++
 
-	return extractLinks(c, u, depth, resp, nil)
+	return extractLinks(p, u, depth, resp, nil)
 }
 
 func newWarcSaveHandler(w *warc.Writer) (crawl.Handler, error) {
diff --git a/cmd/links/links.go b/cmd/links/links.go
index bf91f3f..2263414 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -20,7 +20,7 @@ var (
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 )
 
-func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
+func extractLinks(p crawl.Publisher, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
 		// Not a fatal error, just a bad web page.
@@ -28,7 +28,7 @@ func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _
 	}
 
 	for _, link := range links {
-		if err := c.Enqueue(link, depth+1); err != nil {
+		if err := p.Enqueue(link, depth+1); err != nil {
 			return err
 		}
 	}
diff --git a/crawler.go b/crawler.go
index e7bbf3c..b48646e 100644
--- a/crawler.go
+++ b/crawler.go
@@ -112,21 +112,27 @@ func (f FetcherFunc) Fetch(u string) (*http.Response, error) {
 // unless the handler returns the special error ErrRetryRequest.
 type Handler interface {
 	// Handle the response from a URL.
-	Handle(*Crawler, string, int, *http.Response, error) error
+	Handle(Publisher, string, int, *http.Response, error) error
 }
 
 // HandlerFunc wraps a function into the Handler interface.
-type HandlerFunc func(*Crawler, string, int, *http.Response, error) error
+type HandlerFunc func(Publisher, string, int, *http.Response, error) error
 
 // Handle the response from a URL.
-func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Response, err error) error {
-	return f(db, u, depth, resp, err)
+func (f HandlerFunc) Handle(p Publisher, u string, depth int, resp *http.Response, err error) error {
+	return f(p, u, depth, resp, err)
 }
 
 // ErrRetryRequest is returned by a Handler when the request should be
 // retried after some time.
 var ErrRetryRequest = errors.New("retry_request")
 
+// Publisher is the interface that wraps the Enqueue method, which
+// adds new potential URLs (outlinks) to the crawl queue.
+type Publisher interface {
+	Enqueue(Outlink, int) error
+}
+
 // The Crawler object contains the crawler state.
 type Crawler struct {
 	db      *gobDB
@@ -341,8 +347,8 @@ func (c *Crawler) Close() {
 // and adds them to the queue for crawling. It will call the wrapped
 // handler on all requests regardless.
 func FollowRedirects(wrap Handler) Handler {
-	return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
-		if herr := wrap.Handle(c, u, depth, resp, err); herr != nil {
+	return HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error {
+		if herr := wrap.Handle(p, u, depth, resp, err); herr != nil {
 			return herr
 		}
 
@@ -356,7 +362,7 @@ func FollowRedirects(wrap Handler) Handler {
 			if uerr != nil {
 				log.Printf("error parsing Location header: %v", uerr)
 			} else {
-				return c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
+				return p.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
 			}
 		}
 		return nil
@@ -367,14 +373,14 @@ func FollowRedirects(wrap Handler) Handler {
 // "successful" HTTP status code (anything < 400). When using this
 // wrapper, subsequent Handle calls will always have err set to nil.
 func FilterErrors(wrap Handler) Handler {
-	return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+	return HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error {
 		if err != nil {
 			return nil
 		}
 		if resp.StatusCode >= 400 {
 			return nil
 		}
-		return wrap.Handle(c, u, depth, resp, nil)
+		return wrap.Handle(p, u, depth, resp, nil)
 	})
 }
 
@@ -382,11 +388,11 @@ func FilterErrors(wrap Handler) Handler {
 // temporary errors (all transport-level errors are considered
 // temporary, as well as any HTTP status code >= 500).
 func HandleRetries(wrap Handler) Handler {
-	return HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+	return HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error {
 		if err != nil || resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500 {
 			return ErrRetryRequest
 		}
-		return wrap.Handle(c, u, depth, resp, nil)
+		return wrap.Handle(p, u, depth, resp, nil)
 	})
 }
 
diff --git a/crawler_test.go b/crawler_test.go
index 7b5c92c..0ad469b 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -33,11 +33,11 @@ func TestCrawler(t *testing.T) {
 	)
 
 	var crawledPages int
-	h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
+	h := HandlerFunc(func(p Publisher, u string, depth int, resp *http.Response, err error) error {
 		crawledPages++
 		next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
 		log.Printf("%s -> %s", u, next)
-		c.Enqueue(Outlink{
+		p.Enqueue(Outlink{
 			URL: mustParseURL(next),
 			Tag: TagPrimary,
 		}, depth+1)
-- 
GitLab