Commit 70c12b7a authored by ale

Improve error handling, part two

Handler errors are fatal, so that an error writing the WARC output
will cause the crawl to abort.
parent 98e2528f
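
The abort mechanism relied on below is crawl's existing Must wrapper, already used around queue and database writes. Its definition is not part of this diff; judging from the call sites, it is presumably a conventional fail-fast helper along these lines (a sketch, not the repository's verbatim code):

    package crawl

    import "log"

    // Must is a sketch of the fail-fast wrapper implied by the call sites
    // in this diff: any non-nil error terminates the process, aborting the
    // crawl.
    func Must(err error) {
    	if err != nil {
    		log.Fatalf("fatal error: %v", err)
    	}
    }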
@@ -40,7 +40,8 @@ var (
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// This is not a fatal error, just a bad web page.
+		return nil
 	}

 	for _, link := range links {
@@ -82,7 +83,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
 func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}

 	// Read the response body (so we can save it to the WARC
@@ -104,9 +105,10 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	// Dump the response.
 	statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
-	respPayload := bytes.Join([][]byte{
-		[]byte(statusLine), hdr2str(resp.Header), data},
-		[]byte{'\r', '\n'})
+	respPayload := bytes.Join(
+		[][]byte{[]byte(statusLine), hdr2str(resp.Header), data},
+		[]byte{'\r', '\n'},
+	)
 	if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respPayload); werr != nil {
 		return werr
 	}
...
@@ -22,12 +22,13 @@ var (
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}

 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// Not a fatal error, just a bad web page.
+		return nil
 	}

 	for _, link := range links {
...
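Both copies of extractLinks now follow the same split: a page that fails link analysis is simply skipped, while errors from the crawler's own machinery keep propagating and, with this commit, abort the run. A compact illustration of that convention, with getLinks and enqueue as hypothetical stand-ins for analysis.GetLinks and the crawler's queueing call:

    package main

    import "net/http"

    // Hypothetical stand-ins for analysis.GetLinks and the crawler's
    // own enqueue operation.
    func getLinks(resp *http.Response) ([]string, error) { return nil, nil }
    func enqueue(link string) error                      { return nil }

    func handlePage(resp *http.Response) error {
    	links, err := getLinks(resp)
    	if err != nil {
    		return nil // bad HTML on one page: skip it, keep crawling
    	}
    	for _, link := range links {
    		if err := enqueue(link); err != nil {
    			return err // crawler-side failure: propagates, aborts via Must
    		}
    	}
    	return nil
    }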
@@ -20,6 +20,8 @@ import (
 	lutil "github.com/syndtr/goleveldb/leveldb/util"
 )

+var errorRetryDelay = 180 * time.Second
+
 type gobDB struct {
 	*leveldb.DB
 }
@@ -95,7 +97,7 @@ type URLInfo struct {
 	URL        string
 	StatusCode int
 	CrawledAt  time.Time
-	Error      error
+	Error      string
 }

 // A Fetcher retrieves contents from remote URLs.
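
The Error field's change from error to string is worth a note: URLInfo is persisted through gobDB, and encoding/gob refuses to serialize an interface-typed field holding an unregistered concrete type such as the one returned by errors.New. The commit message doesn't state this motivation, but the behavior is easy to demonstrate:

    package main

    import (
    	"bytes"
    	"encoding/gob"
    	"errors"
    	"fmt"
    )

    type oldInfo struct{ Error error }  // pre-commit field type
    type newInfo struct{ Error string } // post-commit field type

    func main() {
    	var buf bytes.Buffer
    	err := gob.NewEncoder(&buf).Encode(oldInfo{Error: errors.New("boom")})
    	fmt.Println(err) // gob: type not registered for interface: *errors.errorString

    	buf.Reset()
    	err = gob.NewEncoder(&buf).Encode(newInfo{Error: "boom"})
    	fmt.Println(err) // <nil>
    }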
@@ -229,9 +231,12 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 		info.StatusCode = httpResp.StatusCode
 	}

-	// Invoke the handler (even if the fetcher errored out).
-	info.Error = c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
+	// Invoke the handler (even if the fetcher errored
+	// out). Errors in handling requests are fatal, crawl
+	// will be aborted.
+	Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))

+	// Write the result in our database.
 	wb := new(leveldb.Batch)
 	if httpErr == nil {
 		respBody.Close() // nolint
@@ -239,8 +244,9 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 		// Remove the URL from the queue if the fetcher was successful.
 		c.queue.Release(wb, p)
 	} else {
+		info.Error = httpErr.Error()
 		log.Printf("error retrieving %s: %v", p.URL, httpErr)
-		Must(c.queue.Retry(wb, p, 300*time.Second))
+		Must(c.queue.Retry(wb, p, errorRetryDelay))
 	}

 	Must(c.db.PutObjBatch(wb, urlkey, &info))
...