Skip to content
Snippets Groups Projects
Commit 2f3ca2f8 authored by ale's avatar ale
Browse files

Normalize URLs before checking if they are in scope

parent 52eba2bb
Branches
No related tags found
No related merge requests found
...@@ -150,23 +150,36 @@ type Crawler struct { ...@@ -150,23 +150,36 @@ type Crawler struct {
enqueueMx sync.Mutex enqueueMx sync.Mutex
} }
// normalizeURL returns the canonical form of u as a freshly parsed URL.
//
// Canonicalization is delegated to purell, using its "safe" flag set
// together with dot-segment removal, duplicate-slash removal, fragment
// removal and query sorting. The normalized string is parsed again so
// that callers get a *url.URL rather than a string.
func normalizeURL(u *url.URL) *url.URL {
	flags := purell.FlagsSafe |
		purell.FlagRemoveDotSegments |
		purell.FlagRemoveDuplicateSlashes |
		purell.FlagRemoveFragment |
		purell.FlagSortQuery

	normalized, err := url.Parse(purell.NormalizeURL(u, flags))
	if err != nil {
		// The string was just produced from an already-parsed URL, so a
		// parse failure here is treated as a broken invariant.
		panic(err)
	}
	return normalized
}
// Enqueue a (possibly new) URL for processing. // Enqueue a (possibly new) URL for processing.
func (c *Crawler) Enqueue(link Outlink, depth int) error { func (c *Crawler) Enqueue(link Outlink, depth int) error {
// Normalize the URL. We are going to replace link.URL in-place, to
// ensure that scope checks are applied to the normalized URL.
link.URL = normalizeURL(link.URL)
// See if it's in scope. // See if it's in scope.
if !c.scope.Check(link, depth) { if !c.scope.Check(link, depth) {
return nil return nil
} }
// Normalize the URL.
urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagSortQuery)
// Protect the read-modify-update below with a mutex. // Protect the read-modify-update below with a mutex.
c.enqueueMx.Lock() c.enqueueMx.Lock()
defer c.enqueueMx.Unlock() defer c.enqueueMx.Unlock()
// Check if we've already seen it. // Check if we've already seen it.
var info URLInfo var info URLInfo
ukey := []byte(fmt.Sprintf("url/%s", urlStr)) ukey := []byte(fmt.Sprintf("url/%s", link.URL.String()))
if err := c.db.GetObj(ukey, &info); err == nil { if err := c.db.GetObj(ukey, &info); err == nil {
return nil return nil
} }
...@@ -175,7 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error { ...@@ -175,7 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
// make sure that subsequent calls to Enqueue with the same // make sure that subsequent calls to Enqueue with the same
// URL will fail. // URL will fail.
wb := new(leveldb.Batch) wb := new(leveldb.Batch)
if err := c.queue.Add(wb, urlStr, depth, time.Now()); err != nil { if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil {
return err return err
} }
if err := c.db.PutObjBatch(wb, ukey, &info); err != nil { if err := c.db.PutObjBatch(wb, ukey, &info); err != nil {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment