Skip to content
Snippets Groups Projects
Commit cce28f44 authored by ale's avatar ale
Browse files

Replace URLInfo with a simple URL presence check

The whole URLInfo structure, while neat, is unused except for the
purpose of verifying if we have already seen a specific URL.

The presence check is also now limited to Enqueue().
parent c5ec7eb8
No related branches found
No related tags found
No related merge requests found
...@@ -92,14 +92,6 @@ const ( ...@@ -92,14 +92,6 @@ const (
TagRelated TagRelated
) )
// URLInfo stores information about a crawled URL.
type URLInfo struct {
	URL        string    // normalized URL that was crawled
	StatusCode int       // HTTP status of the fetch; stays 0 when the fetch itself errored
	CrawledAt  time.Time // time at which the URL was fetched
	Error      string    // NOTE(review): presumably the fetch error message; not set in the visible code — confirm
}
// A Fetcher retrieves contents from remote URLs. // A Fetcher retrieves contents from remote URLs.
type Fetcher interface { type Fetcher interface {
// Fetch retrieves a URL and returns the response. // Fetch retrieves a URL and returns the response.
...@@ -162,6 +154,19 @@ func normalizeURL(u *url.URL) *url.URL { ...@@ -162,6 +154,19 @@ func normalizeURL(u *url.URL) *url.URL {
return u2 return u2
} }
func seenKey(u *url.URL) []byte {
return []byte(fmt.Sprintf("_seen/%s", u.String()))
}
// hasSeen reports whether u has already been recorded in the crawl
// database. Any lookup error — including "not found" — is treated as
// "not seen"; only key presence counts.
func (c *Crawler) hasSeen(u *url.URL) bool {
	if _, err := c.db.Get(seenKey(u), nil); err != nil {
		return false
	}
	return true
}
// setSeen queues a "seen" marker for u on the write batch wb, so that
// hasSeen returns true for the same URL once the batch is committed.
// The stored value is empty: only the key's presence matters.
func (c *Crawler) setSeen(wb *leveldb.Batch, u *url.URL) {
	wb.Put(seenKey(u), []byte{})
}
// Enqueue a (possibly new) URL for processing. // Enqueue a (possibly new) URL for processing.
func (c *Crawler) Enqueue(link Outlink, depth int) error { func (c *Crawler) Enqueue(link Outlink, depth int) error {
// Normalize the URL. We are going to replace link.URL in-place, to // Normalize the URL. We are going to replace link.URL in-place, to
...@@ -178,22 +183,18 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error { ...@@ -178,22 +183,18 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
defer c.enqueueMx.Unlock() defer c.enqueueMx.Unlock()
// Check if we've already seen it. // Check if we've already seen it.
var info URLInfo if c.hasSeen(link.URL) {
ukey := []byte(fmt.Sprintf("url/%s", link.URL.String()))
if err := c.db.GetObj(ukey, &info); err == nil {
return nil return nil
} }
// Store the URL in the queue, and store an empty URLInfo to // Store the URL in the queue, and mark it as seen to make
// make sure that subsequent calls to Enqueue with the same // sure that subsequent calls to Enqueue with the same URL
// URL will fail. // will fail.
wb := new(leveldb.Batch) wb := new(leveldb.Batch)
if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil { if err := c.queue.Add(wb, link.URL.String(), depth, time.Now()); err != nil {
return err return err
} }
if err := c.db.PutObjBatch(wb, ukey, &info); err != nil { c.setSeen(wb, link.URL)
return err
}
return c.db.Write(wb, nil) return c.db.Write(wb, nil)
} }
...@@ -230,14 +231,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { ...@@ -230,14 +231,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
return return
} }
// Retrieve the URLInfo object from the crawl db.
// Ignore errors, we can work with an empty object.
urlkey := []byte(fmt.Sprintf("url/%s", p.URL))
var info URLInfo
c.db.GetObj(urlkey, &info) // nolint
info.CrawledAt = time.Now()
info.URL = p.URL
// Fetch the URL and handle it. Make sure to Close the // Fetch the URL and handle it. Make sure to Close the
// response body (even if it gets replaced in the // response body (even if it gets replaced in the
// Response object). // Response object).
...@@ -246,7 +239,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { ...@@ -246,7 +239,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
var respBody io.ReadCloser var respBody io.ReadCloser
if httpErr == nil { if httpErr == nil {
respBody = httpResp.Body respBody = httpResp.Body
info.StatusCode = httpResp.StatusCode
} }
// Invoke the handler (even if the fetcher errored // Invoke the handler (even if the fetcher errored
...@@ -268,7 +260,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) { ...@@ -268,7 +260,6 @@ func (c *Crawler) urlHandler(queue <-chan queuePair) {
} }
// Write the result in our database. // Write the result in our database.
Must(c.db.PutObjBatch(wb, urlkey, &info))
Must(c.db.Write(wb, nil)) Must(c.db.Write(wb, nil))
} }
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment