From 70c12b7a5de3fe635f4f49aa7e249f5d6141d2af Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Fri, 31 Aug 2018 09:57:06 +0100
Subject: [PATCH] Improve error handling, part two

Errors returned by handlers are now fatal, so that an error writing
the WARC output will cause the crawl to abort. Errors that only
affect a single page (a failed fetch, an unparseable document) are
handled inside the handlers themselves and are no longer returned.
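
As a rough sketch of the new contract (saveHandler and its output
writer are hypothetical and not part of this change; the crawl,
net/http, io and io/ioutil imports are assumed), a handler would now
swallow per-page errors and only return errors that should stop the
whole crawl:

    type saveHandler struct {
        out io.Writer
    }

    func (h *saveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, ferr error) error {
        if ferr != nil {
            // Fetch errors only affect this URL: not fatal.
            return nil
        }
        data, err := ioutil.ReadAll(resp.Body)
        if err != nil {
            // A truncated or unreadable body is also a
            // per-page problem; keep crawling.
            return nil
        }
        if _, werr := h.out.Write(data); werr != nil {
            // Failing to write output is fatal: urlHandler
            // wraps Handle() in Must() and aborts the crawl.
            return werr
        }
        return nil
    }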
---
 cmd/crawl/crawl.go | 12 +++++++-----
 cmd/links/links.go |  5 +++--
 crawler.go         | 14 ++++++++++----
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index 3d1120c..587b64a 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -40,7 +40,8 @@ var (
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// This is not a fatal error, just a bad web page.
+		return nil
 	}
 
 	for _, link := range links {
@@ -82,7 +83,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
 
 func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}
 
 	// Read the response body (so we can save it to the WARC
@@ -104,9 +105,10 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 
 	// Dump the response.
 	statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
-	respPayload := bytes.Join([][]byte{
-		[]byte(statusLine), hdr2str(resp.Header), data},
-		[]byte{'\r', '\n'})
+	respPayload := bytes.Join(
+		[][]byte{[]byte(statusLine), hdr2str(resp.Header), data},
+		[]byte{'\r', '\n'},
+	)
 	if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respPayload); werr != nil {
 		return werr
 	}
diff --git a/cmd/links/links.go b/cmd/links/links.go
index 9cd741f..5f76a6a 100644
--- a/cmd/links/links.go
+++ b/cmd/links/links.go
@@ -22,12 +22,13 @@ var (
 
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}
 
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// Not a fatal error, just a bad web page.
+		return nil
 	}
 
 	for _, link := range links {
diff --git a/crawler.go b/crawler.go
index f6670c1..d91d5b4 100644
--- a/crawler.go
+++ b/crawler.go
@@ -20,6 +20,8 @@ import (
 	lutil "github.com/syndtr/goleveldb/leveldb/util"
 )
 
+var errorRetryDelay = 180 * time.Second
+
 type gobDB struct {
 	*leveldb.DB
 }
@@ -95,7 +97,7 @@ type URLInfo struct {
 	URL        string
 	StatusCode int
 	CrawledAt  time.Time
-	Error      error
+	Error      string
 }
 
 // A Fetcher retrieves contents from remote URLs.
@@ -229,9 +231,12 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 			info.StatusCode = httpResp.StatusCode
 		}
 
-		// Invoke the handler (even if the fetcher errored out).
-		info.Error = c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
+		// Invoke the handler (even if the fetcher errored
+		// out). Handler errors are fatal: they will cause
+		// the crawl to abort.
+		Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))
 
+		// Write the result in our database.
 		wb := new(leveldb.Batch)
 		if httpErr == nil {
 			respBody.Close() // nolint
@@ -239,8 +244,9 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 			// Remove the URL from the queue if the fetcher was successful.
 			c.queue.Release(wb, p)
 		} else {
+			info.Error = httpErr.Error()
 			log.Printf("error retrieving %s: %v", p.URL, httpErr)
-			Must(c.queue.Retry(wb, p, 300*time.Second))
+			Must(c.queue.Retry(wb, p, errorRetryDelay))
 		}
 
 		Must(c.db.PutObjBatch(wb, urlkey, &info))
-- 
GitLab