Commit 63bd51e0 authored by ale's avatar ale

add ignore list from ArchiveBot

parent aa6e67d7
......@@ -13,6 +13,9 @@ import (
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"git.autistici.org/ale/crawl"
"git.autistici.org/ale/crawl/analysis"
......@@ -115,6 +118,74 @@ func NewSaveHandler(w *warc.Writer) crawl.Handler {
}
}
// crawlStats tracks aggregate progress of a crawl: total body bytes
// downloaded, the crawl start time, and a histogram of HTTP status codes.
type crawlStats struct {
	bytes  int64     // total bytes read; updated atomically (see UpdateBytes), not guarded by lock
	start  time.Time // when the crawl began; used to compute the average rate
	lock   sync.Mutex
	states map[int]int // HTTP status code -> occurrence count; guarded by lock
}
// Update records the response's HTTP status code and wraps its body in a
// byteCounter so that every byte subsequently read from it is added to
// the running download total.
func (c *crawlStats) Update(resp *http.Response) {
	resp.Body = &byteCounter{resp.Body}

	c.lock.Lock()
	c.states[resp.StatusCode]++
	c.lock.Unlock()
}
// UpdateBytes adds n to the running byte total. The add is atomic so
// byteCounter.Read can call it from any goroutine without taking c.lock.
func (c *crawlStats) UpdateBytes(n int64) {
	atomic.AddInt64(&c.bytes, n)
}
// Dump writes a one-line progress summary to stderr: total bytes
// downloaded, the average download rate in KB/s since the crawl started,
// and the status-code histogram.
func (c *crawlStats) Dump() {
	c.lock.Lock()
	defer c.lock.Unlock()
	// c.bytes is written with atomic.AddInt64 in UpdateBytes and is not
	// guarded by c.lock, so it must be read atomically as well; a plain
	// read here would be a data race under `go test -race`.
	bytes := atomic.LoadInt64(&c.bytes)
	rate := float64(bytes) / time.Since(c.start).Seconds() / 1000
	fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", bytes, rate, c.states)
}
var (
	// stats collects global crawl statistics; client is the shared HTTP
	// client used by fetch. Both are set up in init.
	stats  *crawlStats
	client *http.Client
)
// fetch retrieves urlstr with the shared HTTP client and, on success,
// registers the response with the global crawl statistics.
func fetch(urlstr string) (*http.Response, error) {
	resp, err := client.Get(urlstr)
	if err != nil {
		return nil, err
	}
	stats.Update(resp)
	return resp, nil
}
// init creates the shared HTTP client and the global stats object, and
// starts a background goroutine that dumps progress to stderr every ten
// seconds for the lifetime of the process (it is never stopped).
func init() {
	client = new(http.Client)
	stats = &crawlStats{
		start:  time.Now(),
		states: make(map[int]int),
	}

	go func() {
		ticker := time.NewTicker(10 * time.Second)
		defer ticker.Stop()
		for range ticker.C {
			stats.Dump()
		}
	}()
}
// byteCounter wraps a response body so that every successful Read is
// reported to the global stats byte counter.
type byteCounter struct {
	io.ReadCloser
}
// Read forwards to the wrapped body and adds the number of bytes read
// to the global download total.
func (b *byteCounter) Read(buf []byte) (int, error) {
	n, err := b.ReadCloser.Read(buf)
	if n <= 0 {
		return n, err
	}
	stats.UpdateBytes(int64(n))
	return n, err
}
func main() {
flag.Parse()
......@@ -128,6 +199,7 @@ func main() {
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
}
w := warc.NewWriter(outf)
......@@ -135,7 +207,7 @@ func main() {
saver := NewSaveHandler(w)
crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(saver))
crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
if err != nil {
log.Fatal(err)
}
......
#!/usr/bin/python
#
# Parse ArchiveBot ignore regexp patterns and generate a Go source
# file with a global variable including all of them.
#
# Invoke with a single argument, the location of a checked-out copy of
# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns.
#
import glob
import json
import os
import sys

# Directory containing the ArchiveBot *.json ignore-pattern files.
archivebot_ignore_path = sys.argv[1]

print('package crawl\n\nvar defaultIgnorePatterns = []string{')
for fn in glob.glob(os.path.join(archivebot_ignore_path, '*.json')):
    try:
        with open(fn) as fd:
            print('\n\t// %s' % os.path.basename(fn))
            for p in json.load(fd)['patterns']:
                # NOTE(review): '\\\\1' matches a literal two-backslash
                # sequence followed by '1'; a plain backreference decodes
                # from JSON as '\1' — confirm which form the patterns use.
                if '\\\\1' in p or '(?!' in p:
                    # RE2 does not support backreferences or other
                    # fancy PCRE constructs. This excludes <10
                    # patterns from the ignore list.
                    continue
                # json.dumps doubles as a Go string-literal quoter here.
                print('\t%s,' % json.dumps(p))
    except Exception as e:
        print('error in %s: %s' % (fn, e), file=sys.stderr)
print('}')
This diff is collapsed.
......@@ -3,6 +3,7 @@ package crawl
import (
"fmt"
"net/url"
"regexp"
"strings"
)
......@@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope {
}
return NewURLPrefixScope(pfx)
}
// regexpIgnoreScope is a Scope that rejects any URL matching one of a
// fixed set of ignore regexps.
type regexpIgnoreScope struct {
	ignores []*regexp.Regexp
}
// Check reports whether uri is in scope: it returns false as soon as
// any ignore pattern matches the URL string, true otherwise. The depth
// argument is not used by this scope.
func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
	u := uri.String()
	for _, re := range s.ignores {
		if re.MatchString(u) {
			return false
		}
	}
	return true
}
// NewRegexpIgnoreScope returns a Scope that filters out URLs matching
// any of the given regexp patterns. A nil slice selects the built-in
// defaultIgnorePatterns list (an explicit empty slice ignores nothing).
// Patterns must be valid RE2; an invalid pattern panics via
// regexp.MustCompile.
func NewRegexpIgnoreScope(ignores []string) Scope {
	patterns := ignores
	if patterns == nil {
		patterns = defaultIgnorePatterns
	}
	compiled := make([]*regexp.Regexp, len(patterns))
	for i, p := range patterns {
		compiled[i] = regexp.MustCompile(p)
	}
	return &regexpIgnoreScope{ignores: compiled}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment