Commit 70c12b7a authored by ale

Improve error handling, part two

Handler errors are fatal, so that an error writing the WARC output
will cause the crawl to abort.
parent 98e2528f
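The abort behaviour hinges on the crawl package's Must helper, which the crawler hunks below call around Handle, Retry and PutObjBatch; its definition is not part of this commit. A minimal sketch of such a fail-fast helper, assuming it simply terminates the process on a non-nil error:

package main

import (
	"errors"
	"log"
)

// Must is a sketch of a fail-fast helper: any non-nil error is treated as
// fatal and terminates the program. The real helper in the crawl package may
// differ in detail.
func Must(err error) {
	if err != nil {
		log.Fatalf("crawl: fatal error: %v", err)
	}
}

func main() {
	Must(nil)                             // no-op on success
	Must(errors.New("WARC write failed")) // aborts the crawl here
}

With a helper like this, the WARC handler only needs to return the write error and the whole crawl stops, instead of the error being silently recorded per URL as before.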
@@ -40,7 +40,8 @@ var (
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, _ error) error {
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// This is not a fatal error, just a bad web page.
+		return nil
 	}
 
 	for _, link := range links {
@@ -82,7 +83,7 @@ func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
 
 func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}
 
 	// Read the response body (so we can save it to the WARC
@@ -104,9 +105,10 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 
 	// Dump the response.
 	statusLine := fmt.Sprintf("HTTP/1.1 %s", resp.Status)
-	respPayload := bytes.Join([][]byte{
-		[]byte(statusLine), hdr2str(resp.Header), data},
-		[]byte{'\r', '\n'})
+	respPayload := bytes.Join(
+		[][]byte{[]byte(statusLine), hdr2str(resp.Header), data},
+		[]byte{'\r', '\n'},
+	)
 	if werr := h.writeWARCRecord("response", resp.Request.URL.String(), respPayload); werr != nil {
 		return werr
 	}
@@ -22,12 +22,13 @@ var (
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
 	if err != nil {
-		return err
+		return nil
 	}
 
 	links, err := analysis.GetLinks(resp)
 	if err != nil {
-		return err
+		// Not a fatal error, just a bad web page.
+		return nil
 	}
 
 	for _, link := range links {
@@ -20,6 +20,8 @@ import (
 	lutil "github.com/syndtr/goleveldb/leveldb/util"
 )
 
+var errorRetryDelay = 180 * time.Second
+
 type gobDB struct {
 	*leveldb.DB
 }
@@ -95,7 +97,7 @@ type URLInfo struct {
 	URL        string
 	StatusCode int
 	CrawledAt  time.Time
-	Error      error
+	Error      string
 }
 
 // A Fetcher retrieves contents from remote URLs.
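URLInfo.Error changes from error to string, most likely because URLInfo is persisted through the gob-backed LevelDB wrapper (gobDB) shown above: encoding/gob cannot encode an error interface field whose concrete type is unregistered, while a plain string round-trips without trouble. A small self-contained illustration; the withErr and withString types are hypothetical stand-ins, not taken from this repository:

package main

import (
	"bytes"
	"encoding/gob"
	"errors"
	"fmt"
)

// Hypothetical reduced versions of URLInfo, used only to illustrate the
// serialization difference between an error field and a string field.
type withErr struct{ Error error }
type withString struct{ Error string }

func main() {
	var buf bytes.Buffer

	// Fails: gob refuses the unregistered concrete type (*errors.errorString)
	// hiding behind the error interface.
	err := gob.NewEncoder(&buf).Encode(withErr{Error: errors.New("boom")})
	fmt.Println("interface field:", err)

	// Works: a plain string needs no type registration.
	buf.Reset()
	err = gob.NewEncoder(&buf).Encode(withString{Error: "boom"})
	fmt.Println("string field:", err)
}

Storing the message with httpErr.Error(), as the later hunk does, keeps the persisted record gob-friendly.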
@@ -229,9 +231,12 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 		info.StatusCode = httpResp.StatusCode
 	}
 
-	// Invoke the handler (even if the fetcher errored out).
-	info.Error = c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr)
+	// Invoke the handler (even if the fetcher errored
+	// out). Errors in handling requests are fatal, crawl
+	// will be aborted.
+	Must(c.handler.Handle(c, p.URL, p.Depth, httpResp, httpErr))
 
+	// Write the result in our database.
 	wb := new(leveldb.Batch)
 	if httpErr == nil {
 		respBody.Close() // nolint
@@ -239,8 +244,9 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
 		// Remove the URL from the queue if the fetcher was successful.
 		c.queue.Release(wb, p)
 	} else {
+		info.Error = httpErr.Error()
 		log.Printf("error retrieving %s: %v", p.URL, httpErr)
-		Must(c.queue.Retry(wb, p, 300*time.Second))
+		Must(c.queue.Retry(wb, p, errorRetryDelay))
 	}
 
 	Must(c.db.PutObjBatch(wb, urlkey, &info))