Commit 70c12b7a authored by ale

Improve error handling, part two

Handler errors are fatal, so that an error writing the WARC output
will cause the crawl to abort.
parent 98e2528f
......@@ -40,7 +40,8 @@ var (
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
links, err := analysis.GetLinks(resp)
if err != nil {
return err
// This is not a fatal error, just a bad web page.
return nil
}
for _, link := range links {
......@@ -82,7 +83,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
if err != nil {
return err
return nil
}
// Read the response body (so we can save it to the WARC
......@@ -104,9 +105,10 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
// Dump the response.
statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
respPayload := bytes.Join([][]byte{
[]byte(statusLine), hdr2str(resp.Header), data},
[]byte{'\r', '\n'})
respPayload := bytes.Join(
[][]byte{[]byte(statusLine), hdr2str(resp.Header), data},
[]byte{'\r', '\n'},
)
if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respPayload); werr != nil {
return werr
}
......
......@@ -22,12 +22,13 @@ var (
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
if err != nil {
return err
return nil
}
links, err := analysis.GetLinks(resp)
if err != nil {
return err
// Not a fatal error, just a bad web page.
return nil
}
for _, link := range links {
......
......@@ -20,6 +20,8 @@ import (
lutil "github.com/syndtr/goleveldb/leveldb/util"
)
// errorRetryDelay is how long a URL that failed to fetch waits in the
// queue before being retried (passed to queue.Retry in urlHandler).
var errorRetryDelay = 180 * time.Second
// gobDB embeds *leveldb.DB to extend it with object-storage helpers.
// NOTE(review): presumably the helpers (e.g. PutObjBatch, used by
// urlHandler) gob-encode values — their definitions are outside this
// hunk, confirm against the full file.
type gobDB struct {
	*leveldb.DB
}
......@@ -95,7 +97,7 @@ type URLInfo struct {
URL string
StatusCode int
CrawledAt time.Time
Error error
Error string
}
// A Fetcher retrieves contents from remote URLs.
......@@ -229,9 +231,12 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
info.StatusCode = httpResp.StatusCode
}
// Invoke the handler (even if the fetcher errored out).
info.Error = c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
// Invoke the handler (even if the fetcher errored
// out). Errors in handling requests are fatal, crawl
// will be aborted.
Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))
// Write the result in our database.
wb := new(leveldb.Batch)
if httpErr == nil {
respBody.Close() // nolint
......@@ -239,8 +244,9 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
// Remove the URL from the queue if the fetcher was successful.
c.queue.Release(wb, p)
} else {
info.Error = httpErr.Error()
log.Printf("error retrieving %s: %v", p.URL, httpErr)
Must(c.queue.Retry(wb, p, 300*time.Second))
Must(c.queue.Retry(wb, p, errorRetryDelay))
}
Must(c.db.PutObjBatch(wb, urlkey, &info))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment