Commit 4cd67e72 authored by ale

Add tags (primary/related) to links

This change allows more complex scope boundaries, including loosening
the edges a bit so that related resources of HTML pages (CSS, images,
scripts) can be included, which makes for more complete archives when desired.
parent 77211d4f
Pipeline #728 failed in 15 seconds
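In short: every extracted link now carries a tag (primary vs. related), and the crawl boundary becomes a single composable Scope value. The sketch below mirrors the main() change further down in this diff; buildScope is a hypothetical helper name, and the scheme list and depth are example values, not part of the commit.

// buildScope mirrors the new main() wiring: hard limits are ANDed
// together, and the new -include-related flag ORs in a pass-through
// for resources tagged as related (CSS, images, scripts).
func buildScope(seeds []*url.URL, includeRelated bool) crawl.Scope {
	scope := crawl.AND(
		crawl.NewSchemeScope([]string{"http", "https"}),
		crawl.NewDepthScope(10),
		crawl.NewSeedScope(seeds),
	)
	if includeRelated {
		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
	}
	return scope
}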
@@ -6,31 +6,39 @@ import (
 	"fmt"
 	"io/ioutil"
 	"net/http"
 	"net/url"
 	"regexp"
 	"strings"
 	"github.com/PuerkitoBio/goquery"
+	"git.autistici.org/ale/crawl"
 )
 var (
-	urlcssRx = regexp.MustCompile(`(@import|.*:).*url\(["']?([^'"\)]+)["']?\)`)
+	urlcssRx = regexp.MustCompile(`(?:@import|:).*url\(["']?([^'"\)]+)["']?\)`)
 	linkMatches = []struct {
-		tag  string
-		attr string
+		tag     string
+		attr    string
+		linkTag int
 	}{
-		{"a", "href"},
-		{"link", "href"},
-		{"img", "src"},
-		{"script", "src"},
+		{"a", "href", crawl.TagPrimary},
+		{"link", "href", crawl.TagRelated},
+		{"img", "src", crawl.TagRelated},
+		{"script", "src", crawl.TagRelated},
	}
 )
+// The unparsed version of an Outlink.
+type rawOutlink struct {
+	URL string
+	Tag int
+}
 // GetLinks returns all the links found in a document. Currently only
 // parses HTML pages and CSS stylesheets.
-func GetLinks(resp *http.Response) ([]*url.URL, error) {
-	var outlinks []string
+func GetLinks(resp *http.Response) ([]crawl.Outlink, error) {
+	var outlinks []rawOutlink
 	ctype := resp.Header.Get("Content-Type")
 	if strings.HasPrefix(ctype, "text/html") {
@@ -45,7 +53,7 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 		for _, lm := range linkMatches {
 			doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
 				val, _ := s.Attr(lm.attr)
-				outlinks = append(outlinks, val)
+				outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
 			})
 		}
 	} else if strings.HasPrefix(ctype, "text/css") {
@@ -53,22 +61,25 @@ func GetLinks(resp *http.Response) ([]*url.URL, error) {
 		// expression to extract "url()" links from CSS.
 		if data, err := ioutil.ReadAll(resp.Body); err == nil {
 			for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
-				outlinks = append(outlinks, val[1])
+				outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
 			}
 		}
 	}
 	// Parse outbound links relative to the request URI, and
 	// return unique results.
-	var result []*url.URL
-	links := make(map[string]*url.URL)
-	for _, val := range outlinks {
-		if linkurl, err := resp.Request.URL.Parse(val); err == nil {
-			links[linkurl.String()] = linkurl
+	var result []crawl.Outlink
+	links := make(map[string]crawl.Outlink)
+	for _, l := range outlinks {
+		if linkurl, err := resp.Request.URL.Parse(l.URL); err == nil {
+			links[linkurl.String()] = crawl.Outlink{
+				URL: linkurl,
+				Tag: l.Tag,
+			}
 		}
 	}
-	for _, u := range links {
-		result = append(result, u)
+	for _, l := range links {
+		result = append(result, l)
 	}
 	return result, nil
 }
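For context, this is roughly how the caller side consumes the new return type: every tagged Outlink is handed to Enqueue, and the configured Scope decides what actually gets crawled. This is a sketch only; the "analysis" import name for the links package and the enqueueAll wrapper are assumptions, not part of this diff.

// enqueueAll hands every tagged outlink to the crawler; the AND/OR
// scope chain then decides whether primary and/or related links stay
// in scope. (Package name "analysis" is assumed.)
func enqueueAll(c *crawl.Crawler, depth int, resp *http.Response) error {
	links, err := analysis.GetLinks(resp)
	if err != nil {
		return err
	}
	for _, link := range links {
		c.Enqueue(link, depth+1)
	}
	return nil
}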
@@ -23,12 +23,13 @@ import (
 )
 var (
-	dbPath       = flag.String("state", "crawldb", "crawl state database path")
-	keepDb       = flag.Bool("keep", false, "keep the state database when done")
-	concurrency  = flag.Int("c", 10, "concurrent workers")
-	depth        = flag.Int("depth", 10, "maximum link depth")
-	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
-	outputFile   = flag.String("output", "crawl.warc.gz", "output WARC file")
+	dbPath               = flag.String("state", "crawldb", "crawl state database path")
+	keepDb               = flag.Bool("keep", false, "keep the state database when done")
+	concurrency          = flag.Int("c", 10, "concurrent workers")
+	depth                = flag.Int("depth", 10, "maximum link depth")
+	validSchemes         = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
+	alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
+	outputFile           = flag.String("output", "crawl.warc.gz", "output WARC file")
 )
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
@@ -196,11 +197,14 @@ func main() {
 	}
 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := []crawl.Scope{
+	scope := crawl.AND(
 		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
 		crawl.NewDepthScope(*depth),
 		crawl.NewSeedScope(seeds),
 		crawl.NewRegexpIgnoreScope(nil),
-	}
+	)
+	if *alwaysIncludeRelated {
+		scope = crawl.OR(scope, crawl.NewIncludeRelatedScope())
+	}
 	w := warc.NewWriter(outf)
@@ -75,6 +75,20 @@ func (i *gobIterator) Value(obj interface{}) error {
 	return gob.NewDecoder(bytes.NewBuffer(i.Iterator.Value())).Decode(obj)
 }
+// Outlink is a tagged outbound link.
+type Outlink struct {
+	URL *url.URL
+	Tag int
+}
+const (
+	// TagPrimary is a primary reference (another web page).
+	TagPrimary = iota
+	// TagRelated is a secondary resource, related to a page.
+	TagRelated
+)
 // URLInfo stores information about a crawled URL.
 type URLInfo struct {
 	URL string
@@ -118,7 +132,7 @@ type Crawler struct {
 	db      *gobDB
 	queue   *queue
 	seeds   []*url.URL
-	scopes  []Scope
+	scope   Scope
 	fetcher Fetcher
 	handler Handler
@@ -126,17 +140,15 @@ type Crawler struct {
 }
 // Enqueue a (possibly new) URL for processing.
-func (c *Crawler) Enqueue(u *url.URL, depth int) {
-	// Normalize the URL.
-	urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
-	// See if it's in scope. Checks are ANDed.
-	for _, sc := range c.scopes {
-		if !sc.Check(u, depth) {
-			return
-		}
+func (c *Crawler) Enqueue(link Outlink, depth int) {
+	// See if it's in scope.
+	if !c.scope.Check(link, depth) {
+		return
 	}
+	// Normalize the URL.
+	urlStr := purell.NormalizeURL(link.URL, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)
 	// Protect the read-modify-update below with a mutex.
 	c.enqueueMx.Lock()
 	defer c.enqueueMx.Unlock()
@@ -228,7 +240,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }
 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -241,7 +253,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
 		fetcher: f,
 		handler: h,
 		seeds:   seeds,
-		scopes:  scopes,
+		scope:   scope,
 	}
 	// Recover active tasks.
@@ -255,7 +267,7 @@ func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
 func (c *Crawler) Run(concurrency int) {
 	// Load initial seeds into the queue.
 	for _, u := range c.seeds {
-		c.Enqueue(u, 0)
+		c.Enqueue(Outlink{URL: u, Tag: TagPrimary}, 0)
 	}
 	// Start some runners and wait until they're done.
@@ -291,7 +303,7 @@ func (wrap *redirectHandler) Handle(c *Crawler, u string, depth int, resp *http.Response, err error) error {
 			if err != nil {
 				log.Printf("error parsing Location header: %v", err)
 			} else {
-				c.Enqueue(locationURL, depth+1)
+				c.Enqueue(Outlink{URL: locationURL, Tag: TagPrimary}, depth+1)
 			}
 		}
 	} else {
package crawl
import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/httptest"
	"os"
	"testing"
)
func TestCrawler(t *testing.T) {
	dir, err := ioutil.TempDir("", "")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(dir)
	// Run a trivial test http server just so our test Fetcher can
	// return a real http.Response object.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		io.WriteString(w, "hello")
	}))
	defer srv.Close()
	seeds := MustParseURLs([]string{srv.URL})
	scope := AND(
		NewSchemeScope([]string{"http"}),
		NewSeedScope(seeds),
		NewDepthScope(2),
	)
	var crawledPages int
	h := HandlerFunc(func(c *Crawler, u string, depth int, resp *http.Response, err error) error {
		crawledPages++
		next := fmt.Sprintf(srv.URL+"/page/%d", crawledPages)
		log.Printf("%s -> %s", u, next)
		c.Enqueue(Outlink{
			URL: mustParseURL(next),
			Tag: TagPrimary,
		}, depth+1)
		return nil
	})
	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), NewRedirectHandler(h))
	if err != nil {
		t.Fatal("NewCrawler", err)
	}
	crawler.Run(1)
	crawler.Close()
	if crawledPages != 2 {
		t.Fatalf("incomplete/bad crawl (%d pages, expected %d)", crawledPages, 2)
	}
}
@@ -10,14 +10,14 @@ import (
 // Scope defines the crawling scope.
 type Scope interface {
 	// Check a URL to see if it's in scope for crawling.
-	Check(*url.URL, int) bool
+	Check(Outlink, int) bool
 }
 type maxDepthScope struct {
 	maxDepth int
 }
-func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
+func (s *maxDepthScope) Check(_ Outlink, depth int) bool {
 	return depth < s.maxDepth
 }
@@ -31,8 +31,8 @@ type schemeScope struct {
 	allowedSchemes map[string]struct{}
 }
-func (s *schemeScope) Check(uri *url.URL, depth int) bool {
-	_, ok := s.allowedSchemes[uri.Scheme]
+func (s *schemeScope) Check(link Outlink, depth int) bool {
+	_, ok := s.allowedSchemes[link.URL.Scheme]
 	return ok
 }
@@ -81,8 +81,8 @@ type urlPrefixScope struct {
 	prefixes URLPrefixMap
 }
-func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
-	return s.prefixes.Contains(uri)
+func (s *urlPrefixScope) Check(link Outlink, depth int) bool {
+	return s.prefixes.Contains(link.URL)
 }
 // NewURLPrefixScope returns a Scope that limits the crawl to a set of
@@ -105,8 +105,8 @@ type regexpIgnoreScope struct {
 	ignores []*regexp.Regexp
 }
-func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
-	uriStr := uri.String()
+func (s *regexpIgnoreScope) Check(link Outlink, depth int) bool {
+	uriStr := link.URL.String()
 	for _, i := range s.ignores {
 		if i.MatchString(uriStr) {
 			return false
@@ -129,3 +129,50 @@ func NewRegexpIgnoreScope(ignores []string) Scope {
 	}
 	return &r
 }
+// NewIncludeRelatedScope always includes resources with TagRelated.
+func NewIncludeRelatedScope() Scope {
+	return &includeRelatedScope{}
+}
+type includeRelatedScope struct{}
+func (s *includeRelatedScope) Check(link Outlink, _ int) bool {
+	return link.Tag == TagRelated
+}
+// AND performs a boolean AND.
+func AND(elems ...Scope) Scope {
+	return &andScope{elems: elems}
+}
+type andScope struct {
+	elems []Scope
+}
+func (s *andScope) Check(link Outlink, depth int) bool {
+	for _, e := range s.elems {
+		if !e.Check(link, depth) {
+			return false
+		}
+	}
+	return true
+}
+// OR performs a boolean OR.
+func OR(elems ...Scope) Scope {
+	return &orScope{elems: elems}
+}
+type orScope struct {
+	elems []Scope
+}
+func (s *orScope) Check(link Outlink, depth int) bool {
+	for _, e := range s.elems {
+		if e.Check(link, depth) {
+			return true
+		}
+	}
+	return false
+}
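Since Scope is now a single-method interface over tagged Outlinks, custom policies compose naturally with AND/OR. As a hypothetical example (not part of this commit, names invented for illustration): a scope that always lets related resources through but only follows primary links up to a cutoff depth.

// primaryDepthScope is a hypothetical Scope: related resources always
// pass, while primary links are only followed below maxPrimaryDepth.
type primaryDepthScope struct {
	maxPrimaryDepth int
}

func (s *primaryDepthScope) Check(link Outlink, depth int) bool {
	if link.Tag == TagRelated {
		return true
	}
	return depth < s.maxPrimaryDepth
}

// It would be combined like any other scope, e.g.
// AND(NewSeedScope(seeds), &primaryDepthScope{maxPrimaryDepth: 3}).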
@@ -19,7 +19,7 @@ type testScopeEntry struct {
 func runScopeTest(t *testing.T, sc Scope, testdata []testScopeEntry) {
 	for _, td := range testdata {
 		uri := mustParseURL(td.uri)
-		result := sc.Check(uri, td.depth)
+		result := sc.Check(Outlink{URL: uri, Tag: TagPrimary}, td.depth)
 		if result != td.expected {
 			t.Errorf("Check(%s, %d) -> got %v, want %v", td.uri, td.depth, result, td.expected)
 		}