Commit c7ab8701 authored by ale
Browse files

Ignore URL decode errors

This is an internal inconsistency that should be investigated.
parent 833da3f3
Pipeline #17429 passed with stages
in 1 minute and 33 seconds
......@@ -154,8 +154,8 @@ func normalizeURL(u *url.URL) *url.URL {
purell.FlagRemoveFragment|purell.FlagSortQuery)
u2, err := url.Parse(urlStr)
if err != nil {
// We *really* do not expect an error here.
panic(err)
// Ignore errors here.
return nil
}
return u2
}
......@@ -178,6 +178,13 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
// Normalize the URL. We are going to replace link.URL in-place, to
// ensure that scope checks are applied to the normalized URL.
link.URL = normalizeURL(link.URL)
if link.URL == nil {
// We couldn't parse a URL that we have extracted
// ourselves from the documents. This is an internal
// inconsistency, but by ignoring the error we avoid
// failing the entire crawl.
return nil
}
// See if it's in scope.
if !c.scope.Check(link, depth) {
......
module git.autistici.org/ale/crawl
go 1.15
require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment