Skip to content
Snippets Groups Projects
Commit c7ab8701 authored by ale
Browse files

Ignore URL decode errors

This is an internal inconsistency that should be investigated.
parent 833da3f3
No related branches found
No related tags found
No related merge requests found
......@@ -154,8 +154,8 @@ func normalizeURL(u *url.URL) *url.URL {
purell.FlagRemoveFragment|purell.FlagSortQuery)
u2, err := url.Parse(urlStr)
if err != nil {
- // We *really* do not expect an error here.
- panic(err)
+ // Ignore errors here.
+ return nil
}
return u2
}
......@@ -178,6 +178,13 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
// Normalize the URL. We are going to replace link.URL in-place, to
// ensure that scope checks are applied to the normalized URL.
link.URL = normalizeURL(link.URL)
+ if link.URL == nil {
+ // We couldn't parse a URL that we have extracted
+ // ourselves from the documents. This is an internal
+ // inconsistency, but by ignoring the error we avoid
+ // failing the entire crawl.
+ return nil
+ }
// See if it's in scope.
if !c.scope.Check(link, depth) {
......
module git.autistici.org/ale/crawl
go 1.15
require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment