Commit 4cd67e72 authored by ale

Add tags (primary/related) to links

This change allows more complex scope boundaries, including loosening
edges a bit to include related resources of HTML pages (which makes
for more complete archives if desired).
parent 77211d4f
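In short, every outbound link extracted from a page is now tagged as either primary (an ordinary <a href> to another page) or related (stylesheets, images, scripts), and scope checks receive the tagged Outlink. A rough sketch of the intended effect, built only from the API introduced below (the hosts example.com and cdn.example.net are made up):

package main

import (
	"fmt"
	"net/url"

	"git.autistici.org/ale/crawl"
)

func main() {
	seeds := crawl.MustParseURLs([]string{"https://example.com/"})

	// Stay on the seed site for ordinary page-to-page (primary) links...
	scope := crawl.AND(
		crawl.NewSchemeScope([]string{"http", "https"}),
		crawl.NewSeedScope(seeds),
	)
	// ...but always admit resources tagged as related (CSS, images,
	// scripts), even when they live on another host.
	scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())

	css, _ := url.Parse("https://cdn.example.net/style.css")
	fmt.Println(scope.Check(crawl.Outlink{URL: css, Tag: crawl.TagRelated}, 1)) // true
	fmt.Println(scope.Check(crawl.Outlink{URL: css, Tag: crawl.TagPrimary}, 1)) // false: outside the seed scope
}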
@@ -6,31 +6,39 @@ import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
"git.autistici.org/ale/crawl"
)
var (
urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`)
urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)
linkMatches = []struct {
tag string
attr string
tag string
attr string
linkTag int
}{
{"a", "href"},
{"link", "href"},
{"img", "src"},
{"script", "src"},
{"a", "href", crawl.TagPrimary},
{"link", "href", crawl.TagRelated},
{"img", "src", crawl.TagRelated},
{"script", "src", crawl.TagRelated},
}
)
// The unparsed version of an Outlink.
type rawOutlink struct {
URL string
Tag int
}
// GetLinks returns all the links found in a document. Currently only
// parses HTML pages and CSS stylesheets.
func GetLinks(resp *http.Response) ([]*url.URL, error) {
var outlinks []string
func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
var outlinks []rawOutlink
ctype := resp.Header.Get("Content-Type")
if strings.HasPrefix(ctype, "text/html") {
@@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
outlinks = append(outlinks, val)
outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
} else if strings.HasPrefix(ctype, "text/css") {
@@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
// expression to extract "url()" links from CSS.
if data, err := ioutil.ReadAll(resp.Body); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, val[1])
outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
}
}
// Parse outbound links relative to the request URI, and
// return unique results.
var result []*url.URL
links := make(map[string]*url.URL)
for _, val := range outlinks {
if linkurl, err := resp.Request.URL.Parse(val); err == nil {
links[linkurl.String()] = linkurl
var result []crawl.Outlink
links := make(map[string]crawl.Outlink)
for _, l := range outlinks {
if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil {
links[linkurl.String()] = crawl.Outlink{
URL: linkurl,
Tag: l.Tag,
}
}
}
for _, u := range links {
result = append(result, u)
for _, l := range links {
result = append(result, l)
}
return result, nil
}
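Aside (not part of the commit): with the leading alternation turned into a non-capturing group, submatch 1 of the new urlcssRx is the URL inside url(...), which is what the CSS branch appends as a TagRelated outlink. A quick standalone check of the pattern:

package main

import (
	"fmt"
	"regexp"
)

// Same pattern as the new urlcssRx above.
var urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)

func main() {
	css := "@import url(\"reset.css\");\n.banner { background: url(/img/bg.png); }"
	for _, m := range urlcssRx.FindAllStringSubmatch(css, -1) {
		fmt.Println(m[1]) // prints "reset.css", then "/img/bg.png"
	}
}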
@@ -23,12 +23,13 @@ import (
)
var (
dbPath = flag.String("state", "crawldb", "crawl state database path")
keepDb = flag.Bool("keep", false, "keep the state database when done")
concurrency = flag.Int("c", 10, "concurrent workers")
depth = flag.Int("depth", 10, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
dbPath = flag.String("state", "crawldb", "crawl state database path")
keepDb = flag.Bool("keep", false, "keep the state database when done")
concurrency = flag.Int("c", 10, "concurrent workers")
depth = flag.Int("depth", 10, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
)
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -196,11 +197,14 @@ func main() {
}
seeds := crawl.MustParseURLs(flag.Args())
scope := []crawl.Scope{
scope := crawl.AND(
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
)
if *alwaysIncludeRelated {
scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
}
w := warc.NewWriter(outf)
......
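With the new flag wired into the scope as above, a run that also archives the page resources of in-scope pages might be invoked roughly like this (hypothetical command line, assuming the binary built from this command is named crawl; the seed URL is made up):

crawl -include-related -depth 5 -output example.warc.gz https://example.com/

Without -include-related the behaviour is as before: related resources are only fetched when they also fall within the normal scheme/seed/depth scope.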
@@ -75,6 +75,20 @@ func (i *gobIterator) Value(obj interface{}) error {
return gob.NewDecoder(bytes.NewBuffer(i.Iterator.Value())).Decode(obj)
}
// Outlink is a tagged outbound link.
type Outlink struct {
URL *url.URL
Tag int
}
const (
// TagPrimary is a primary reference (another web page).
TagPrimary = iota
// TagRelated is a secondary resource, related to a page.
TagRelated
)
// URLInfo stores information about a crawled URL.
type URLInfo struct {
URL string
@@ -118,7 +132,7 @@ type Crawler struct {
db *gobDB
queue *queue
seeds []*url.URL
scopes []Scope
scope Scope
fetcher Fetcher
handler Handler
@@ -126,17 +140,15 @@
}
// Enqueue a (possibly new) URL for processing.
func (c *Crawler) Enqueue(u *url.URL, depth int) {
// Normalize the URL.
urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
// See if it's in scope. Checks are ANDed.
for _, sc := range c.scopes {
if !sc.Check(u, depth) {
return
}
func (c *Crawler) Enqueue(link Outlink, depth int) {
// See if it's in scope.
if !c.scope.Check(link, depth) {
return
}
// Normalize the URL.
urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
// Protect the read-modify-update below with a mutex.
c.enqueueMx.Lock()
defer c.enqueueMx.Unlock()
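For reference (the normalization call itself is unchanged by this commit, it is just moved after the scope check): the purell flags used here canonicalize the URL before it is enqueued. A rough illustration of their effect, assuming purell behaves as documented:

package main

import (
	"fmt"
	"net/url"

	"github.com/PuerkitoBio/purell"
)

func main() {
	u, _ := url.Parse("http://example.com/a/../b//page.html?z=1&a=2#frag")
	fmt.Println(purell.NormalizeURL(u,
		purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|
			purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery))
	// Dot segments and duplicate slashes are resolved, the fragment is
	// dropped and the query is sorted: roughly
	// http://example.com/b/page.html?a=2&z=1
}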
@@ -228,7 +240,7 @@ func MustParseURLs(urls []string) []*url.URL {
}
// NewCrawler creates a new Crawler object with the specified behavior.
func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
// Open the crawl database.
db, err := newGobDB(path)
if err != nil {
@@ -241,7 +253,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
fetcher: f,
handler: h,
seeds: seeds,
scopes: scopes,
scope: scope,
}
// Recover active tasks.
@@ -255,7 +267,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Hand
func (c *Crawler) Run(concurrency int) {
// Load initial seeds into the queue.
for _, u := range c.seeds {
c.Enqueue(u, 0)
c.Enqueue(Outlink{URL: u, Tag: TagPrimary}, 0)
}
// Start some runners and wait until they're done.
@@ -291,7 +303,7 @@ func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.
if err != nil {
log.Printf("error parsing Location header: %v", err)
} else {
c.Enqueue(locationURL, depth+1)
c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
}
}
} else {
......
package crawl
import (
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"net/http/httptest"
"os"
"testing"
)
func TestCrawler(t *testing.T) {
dir, err := ioutil.TempDir("", "")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(dir)
// Run a trivial test http server just so our test Fetcher can
// return a real http.Response object.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
io.WriteString(w, "hello")
}))
defer srv.Close()
seeds := MustParseURLs([]string{srv.URL})
scope := AND(
NewSchemeScope([]string{"http"}),
NewSeedScope(seeds),
NewDepthScope(2),
)
var crawledPages int
h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
crawledPages++
next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
log.Printf("%s -> %s", u, next)
c.Enqueue(Outlink{
URL: mustParseURL(next),
Tag: TagPrimary,
}, depth+1)
return nil
})
crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
if err != nil {
t.Fatal("NewCrawler", err)
}
crawler.Run(1)
crawler.Close()
if crawledPages != 2 {
t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 10)
}
}
@@ -10,14 +10,14 @@ import (
// Scope defines the crawling scope.
type Scope interface {
// Check a URL to see if it's in scope for crawling.
Check(*url.URL, int) bool
Check(Outlink, int) bool
}
type maxDepthScope struct {
maxDepth int
}
func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
func (s *maxDepthScope) Check(_ Outlink, depth int) bool {
return depth < s.maxDepth
}
@@ -31,8 +31,8 @@ type schemeScope struct {
allowedSchemes map[string]struct{}
}
func (s *schemeScope) Check(uri *url.URL, depth int) bool {
_, ok := s.allowedSchemes[uri.Scheme]
func (s *schemeScope) Check(link Outlink, depth int) bool {
_, ok := s.allowedSchemes[link.URL.Scheme]
return ok
}
@@ -81,8 +81,8 @@ type urlPrefixScope struct {
prefixes URLPrefixMap
}
func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
return s.prefixes.Contains(uri)
func (s *urlPrefixScope) Check(link Outlink, depth int) bool {
return s.prefixes.Contains(link.URL)
}
// NewURLPrefixScope returns a Scope that limits the crawl to a set of
@@ -105,8 +105,8 @@ type regexpIgnoreScope struct {
ignores []*regexp.Regexp
}
func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
uriStr := uri.String()
func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
uriStr := link.URL.String()
for _, i := range s.ignores {
if i.MatchString(uriStr) {
return false
@@ -129,3 +129,50 @@ func NewRegexpIgnoreScope(ignores []string) Scope {
}
return &r
}
// NewIncludeRelatedScope always includes resources with TagRelated.
func NewIncludeRelatedScope() Scope {
return &includeRelatedScope{}
}
type includeRelatedScope struct{}
func (s *includeRelatedScope) Check(link Outlink, _ int) bool {
return link.Tag == TagRelated
}
// AND performs a boolean AND.
func AND(elems ...Scope) Scope {
return &andScope{elems: elems}
}
type andScope struct {
elems []Scope
}
func (s *andScope) Check(link Outlink, depth int) bool {
for _, e := range s.elems {
if !e.Check(link, depth) {
return false
}
}
return true
}
// OR performs a boolean OR.
func OR(elems ...Scope) Scope {
return &orScope{elems: elems}
}
type orScope struct {
elems []Scope
}
func (s *orScope) Check(link Outlink, depth int) bool {
for _, e := range s.elems {
if e.Check(link, depth) {
return true
}
}
return false
}
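Because Scope.Check now receives the full Outlink instead of just a *url.URL, custom scopes can also discriminate on the tag. A hypothetical example (not part of this commit) that admits related resources only when they are discovered close to the seeds:

// shallowRelatedScope accepts only links tagged as related resources, and
// only up to a fixed depth. (Hypothetical; shown for illustration.)
type shallowRelatedScope struct{ maxDepth int }

func (s *shallowRelatedScope) Check(link Outlink, depth int) bool {
	return link.Tag == TagRelated && depth <= s.maxDepth
}

// It composes with the combinators above, e.g.:
//   scope = OR(scope, &shallowRelatedScope{maxDepth: 3})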
@@ -19,7 +19,7 @@ type testScopeEntry struct {
func runScopeTest(t *testing.T, sc Scope, testdata []testScopeEntry) {
for _, td := range testdata {
uri := mustParseURL(td.uri)
result := sc.Check(uri, td.depth)
result := sc.Check(Outlink{URL: uri, Tag: TagPrimary}, td.depth)
if result != td.expected {
t.Errorf("Check(%s, %d) -> got %v, want %v", td.uri, td.depth, result, td.expected)
}
......