Commit 2f3ca2f8 authored by ale

Normalize URLs before checking if they are in scope

parent 52eba2bb
Pipeline #1937 passed with stage
in 22 seconds
...@@ -150,23 +150,36 @@ type Crawler struct { ...@@ -150,23 +150,36 @@ type Crawler struct {
enqueueMx sync.Mutex enqueueMx sync.Mutex
} }
// normalizeURL runs u through purell.NormalizeURL with the safe flag set
// plus dot-segment removal, duplicate-slash removal, fragment removal and
// query sorting, then parses the resulting string back into a *url.URL.
//
// It panics if the normalized string cannot be parsed, which is not
// expected to happen for output produced by purell.
func normalizeURL(u *url.URL) *url.URL {
	flags := purell.FlagsSafe |
		purell.FlagRemoveDotSegments |
		purell.FlagRemoveDuplicateSlashes |
		purell.FlagRemoveFragment |
		purell.FlagSortQuery

	normalized, err := url.Parse(purell.NormalizeURL(u, flags))
	if err != nil {
		// A parse failure on purell's own output is treated as a fatal
		// condition rather than something to recover from.
		panic(err)
	}
	return normalized
}
// Enqueue a (possibly new) URL for processing. // Enqueue a (possibly new) URL for processing.
func (c *Crawler) Enqueue(link Outlink, depth int) error { func (c *Crawler) Enqueue(link Outlink, depth int) error {
// Normalize the URL. We are going to replace link.URL in-place, to
// ensure that scope checks are applied to the normalized URL.
link.URL = normalizeURL(link.URL)
// See if it's in scope. // See if it's in scope.
if !c.scope.Check(link, depth) { if !c.scope.Check(link, depth) {
return nil return nil
} }
// Normalize the URL.
urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagSortQuery)
// Protect the read-modify-update below with a mutex. // Protect the read-modify-update below with a mutex.
c.enqueueMx.Lock() c.enqueueMx.Lock()
defer c.enqueueMx.Unlock() defer c.enqueueMx.Unlock()
// Check if we've already seen it. // Check if we've already seen it.
var info URLInfo var info URLInfo
ukey := []byte(fmt.Sprintf("url/%s", urlStr)) ukey := []byte(fmt.Sprintf("url/%s", link.URL.String()))
if err := c.db.GetObj(ukey, &info); err == nil { if err := c.db.GetObj(ukey, &info); err == nil {
return nil return nil
} }
...@@ -175,7 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error { ...@@ -175,7 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
// make sure that subsequent calls to Enqueue with the same // make sure that subsequent calls to Enqueue with the same
// URL will fail. // URL will fail.
wb := new(leveldb.Batch) wb := new(leveldb.Batch)
if err := c.queue.Add(wb, urlStr, depth, time.Now()); err != nil { if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil {
return err return err
} }
if err := c.db.PutObjBatch(wb, ukey, &info); err != nil { if err := c.db.PutObjBatch(wb, ukey, &info); err != nil {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment