From c7ab870184449929fc69f9a5da36de38cae1b5ba Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 19 Jun 2021 16:47:14 +0100
Subject: [PATCH] Ignore URL decode errors

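A URL that fails to parse again after normalization is an internal
inconsistency that should be investigated. Until then, skip such URLs
instead of panicking, so that a single bad link does not fail the
entire crawl.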
---
 crawler.go | 11 +++++++++--
 go.mod     |  2 ++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/crawler.go b/crawler.go
index 49d124b..b2ad3d9 100644
--- a/crawler.go
+++ b/crawler.go
@@ -154,8 +154,8 @@ func normalizeURL(u *url.URL) *url.URL {
 			purell.FlagRemoveFragment|purell.FlagSortQuery)
 	u2, err := url.Parse(urlStr)
 	if err != nil {
-		// We *really* do not expect an error here.
-		panic(err)
+		// Ignore errors here.
+		return nil
 	}
 	return u2
 }
@@ -178,6 +178,13 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
 	// Normalize the URL. We are going to replace link.URL in-place, to
 	// ensure that scope checks are applied to the normalized URL.
 	link.URL = normalizeURL(link.URL)
+	if link.URL == nil {
+		// We couldn't parse a URL that we have extracted
+		// ourselves from the documents. This is an internal
+		// inconsistency, but by ignoring the error we avoid
+		// failing the entire crawl.
+		return nil
+	}
 
 	// See if it's in scope.
 	if !c.scope.Check(link, depth) {
diff --git a/go.mod b/go.mod
index 488dbdb..5ca4ba4 100644
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,7 @@
 module git.autistici.org/ale/crawl
 
+go 1.15
+
 require (
 	github.com/PuerkitoBio/goquery v1.5.0
 	github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597
-- 
GitLab