Commit 63bd51e0 authored by ale

add ignore list from ArchiveBot

parent aa6e67d7
......@@ -13,6 +13,9 @@ import (
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"git.autistici.org/ale/crawl"
"git.autistici.org/ale/crawl/analysis"
......@@ -115,6 +118,74 @@ func NewSaveHandler(w *warc.Writer) crawl.Handler {
}
}
// crawlStats accumulates download statistics for the crawl: the total
// number of response-body bytes read and a histogram of HTTP status codes.
type crawlStats struct {
	// bytes is the running total of body bytes read. It is accessed only
	// through sync/atomic, and must remain the first field of the struct so
	// that it is 64-bit aligned on 32-bit platforms.
	bytes int64
	// start is the time the counters were created; Dump uses it to compute
	// the average transfer rate.
	start time.Time

	// lock guards states.
	lock sync.Mutex
	// states maps an HTTP status code to the number of responses seen
	// with that code.
	states map[int]int
}

// Update records the status code of resp and replaces resp.Body with a
// wrapper that counts every byte read from it. Note that the wrapper reports
// to the package-level stats value, not necessarily to c.
func (c *crawlStats) Update(resp *http.Response) {
	c.lock.Lock()
	c.states[resp.StatusCode]++
	c.lock.Unlock()

	resp.Body = &byteCounter{resp.Body}
}

// UpdateBytes adds n to the total byte count. It is safe for concurrent use
// and does not take the lock.
func (c *crawlStats) UpdateBytes(n int64) {
	atomic.AddInt64(&c.bytes, n)
}

// Dump writes a one-line summary (total bytes, average rate in KB/s since
// start, and the status code histogram) to standard error.
func (c *crawlStats) Dump() {
	// UpdateBytes modifies the counter atomically without holding the
	// lock, so it has to be read atomically too; a plain read here would be
	// a data race. Take a single snapshot so that the byte count and the
	// rate printed below are consistent with each other.
	total := atomic.LoadInt64(&c.bytes)

	c.lock.Lock()
	defer c.lock.Unlock()
	rate := float64(total) / time.Since(c.start).Seconds() / 1000
	fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", total, rate, c.states)
}

var (
	// stats holds the process-wide download statistics.
	stats *crawlStats
	// client is the HTTP client shared by all fetches.
	client *http.Client
)

// fetch retrieves urlstr with the shared client and, on success, registers
// the response with the global statistics (which also wraps its body so that
// the bytes read are counted).
func fetch(urlstr string) (*http.Response, error) {
	resp, err := client.Get(urlstr)
	if err != nil {
		return resp, err
	}
	stats.Update(resp)
	return resp, nil
}

// init creates the shared HTTP client and statistics, and starts a
// background goroutine that dumps the statistics every 10 seconds for the
// lifetime of the process.
func init() {
	// NOTE(review): the client has no timeout, so a stalled server can block
	// a fetch indefinitely. http.Client.Timeout also covers reading the
	// body, so choosing a value needs care for large downloads.
	client = &http.Client{}
	stats = &crawlStats{
		states: make(map[int]int),
		start:  time.Now(),
	}

	go func() {
		for range time.Tick(10 * time.Second) {
			stats.Dump()
		}
	}()
}

// byteCounter wraps a response body and reports the number of bytes read
// through it to the global statistics.
type byteCounter struct {
	io.ReadCloser
}

// Read reads from the wrapped body and adds the number of bytes obtained to
// the global byte counter before returning the result unchanged.
func (b *byteCounter) Read(buf []byte) (int, error) {
	n, err := b.ReadCloser.Read(buf)
	if n > 0 {
		stats.UpdateBytes(int64(n))
	}
	return n, err
}
func main() {
flag.Parse()
......@@ -128,6 +199,7 @@ func main() {
crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
crawl.NewDepthScope(*depth),
crawl.NewSeedScope(seeds),
crawl.NewRegexpIgnoreScope(nil),
}
w := warc.NewWriter(outf)
......@@ -135,7 +207,7 @@ func main() {
saver := NewSaveHandler(w)
crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.NewRedirectHandler(saver))
crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(fetch), crawl.NewRedirectHandler(saver))
if err != nil {
log.Fatal(err)
}
......
#!/usr/bin/python
#
# Parse ArchiveBot ignore regexp patterns and generate a Go source
# file with a global variable including all of them.
#
# Invoke with a single argument, the location of a checked-out copy of
# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns.
#
import glob
import json
import os
import sys
def is_re2_compatible(pattern):
    """Return False for patterns using constructs this script filters out.

    RE2 (and therefore Go's regexp package) does not support
    backreferences or negative lookahead, so patterns containing a
    ``\\1`` backreference or a ``(?!`` group are rejected.

    The patterns checked here have already been decoded by the JSON
    parser, so a backreference appears as a single backslash followed
    by ``1``.
    """
    return '\\1' not in pattern and '(?!' not in pattern


def main(argv):
    """Print a Go source file defining defaultIgnorePatterns.

    argv[1] is the directory holding the ArchiveBot ``*.json`` ignore
    pattern files. Files are processed in sorted order so that the
    generated output is reproducible. A file that cannot be read or
    parsed is reported on stderr and skipped.

    Returns the process exit status.
    """
    if len(argv) < 2:
        sys.stderr.write(
            'usage: %s <path to ArchiveBot db/ignore_patterns>\n' % argv[0])
        return 2
    archivebot_ignore_path = argv[1]

    print('package crawl\n\nvar defaultIgnorePatterns = []string{')
    for fn in sorted(glob.glob(os.path.join(archivebot_ignore_path, '*.json'))):
        try:
            with open(fn) as fd:
                patterns = json.load(fd)['patterns']
            # Build all the lines before printing anything, so that a
            # broken file does not leave a partial section in the output.
            lines = ['\t%s,' % json.dumps(p)
                     for p in patterns if is_re2_compatible(p)]
        except Exception as e:
            sys.stderr.write('error in %s: %s\n' % (fn, e))
            continue
        print('\n\t// %s' % os.path.basename(fn))
        for line in lines:
            print(line)
    print('}')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
package crawl
var defaultIgnorePatterns = []string{
// WordPress.
"wp-login\\.php",
"/wp-admin/",
"/xmlrpc\\.php",
// googleplus.json
"^https?://accounts\\.google\\.com/ServiceLogin",
"^https?://accounts\\.google\\.com/SignUp",
"^https?://lh4\\.googleusercontent\\.com/proxy/[^/]+",
"^https?://plus\\.google\\.com/_/scs/apps-static/",
// mediawiki.json
"[\\?&]oldid=\\d+",
"[\\?&]curid=\\d+",
"[\\?&]limit=(20|100|250|500)",
"[\\?&]hide(minor|bots|anons|liu|myself|redirs|links|trans|patrolled)=",
"([\\?&]title=|/)Special:(UserLogin|UserLogout|Translate|MobileFeedback|MobileOptions|RecentChangesLinked|Diff|MobileDiff)",
"([\\?&]title=|/)Special:RecentChanges&from=\\d+",
"([\\?&]title=|/)Special:ListFiles&dir=prev&offset=\\d+",
"([\\?&]title=|/)Special:(ListFiles|PrefixIndex).*&amp;",
"([\\?&]title=|/)Special:ListFiles.*&user=",
"([\\?&]title=|/)Special:Log/",
"[\\?&]action=edit&section=(\\d+|new)",
"[\\?&]feed(format)?=atom",
"[\\?&]redlink=1",
"[\\?&]printable=yes",
"[\\?&]mobileaction=",
"[\\?&]undo(after)?=\\d+",
"^http://a\\.wikia-beacon\\.com/__track/",
"/User_talk:.+/User_talk:",
"/User_blog:.+/User_blog:",
"/User:.+/User:",
// nosortedindex.json
"\\?C=[NMSD];O=[AD]$",
// coppermine.json
"(?:displayimage|thumbnails)\\.php[?&]album=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)",
"ratepic\\.php",
"addfav\\.php\\?.*ref=displayimage\\.php",
"displayimage\\.php\\?.*slideshow=\\d+",
// youtube.json
"^https?://accounts\\.google\\.com/ServiceLogin",
"\\.?youtube\\.com/user/[^/]+/(playlists|channels|videos)\\?(flow|view|sort|live_view)=",
// reddit.json
"^https?://www\\.reddit\\.com/gold\\?goldtype=",
"^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/[a-z0-9]+",
"^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+.*\\?sort=",
"^https?://www\\.reddit\\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/\\.compact",
"^https?://www\\.reddit\\.com/r/[^/]+/(top|new|rising|controversial|gilded|ads)/.+[\\?&]after=",
"^https?://www\\.reddit\\.com/r/[^/]+/related/",
"^https?://www\\.reddit\\.com/r/[^/]+/(gilded)?\\.mobile\\?",
"^https?://www\\.reddit\\.com/r/[^/]+/search/?\\?",
"^https?://www\\.reddit\\.com/r/[^/]+/wiki/(revisions|discussions)/user/.+",
"^https?://www\\.reddit\\.com/user/[^/]+/(comments/)?.+[\\?&]sort=",
"^https?://www\\.reddit\\.com/.+/\\.rss$",
"^https?://simple\\.reddit\\.com/",
"^https?://pixel\\.redditmedia\\.com/pixel/",
"\\.reddit\\.com/message/compose/?\\?",
"^https?://m\\.reddit\\.com/",
// nogravatar.json
"^https?://(\\d|secure)\\.gravatar\\.com/avatar/",
// meetupeverywhere.json
"^https?://.*\\.meetup\\.com/login/",
// pinterest.json
"^https?://www\\.pinterest\\.com/[^/]+/\\^/[^/]+/",
"^https?://www\\.pinterest\\.com/[^/]+/[^/]+/\\^/[^/]+/",
"^https?://www\\.pinterest\\.com/[^/]+/[^/]+\\.[^/]+",
"^https?://www\\.pinterest\\.com/[^/]+/[^/]+/[^/]+\\.[^/]+",
"^https?://www\\.pinterest\\.com/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\\.js",
"^https?://www\\.pinterest\\.com/[^/]+/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\\.js",
// noonion.json
// blogs.json
"[\\?&]replytocom=",
"[\\?&]share=",
"/page/%d/$",
"\\?showComment(=|%5C)",
"/quote-comment-\\d+/$",
"/wp-login\\.php\\?",
"^https?://r\\-login\\.wordpress\\.com/remote\\-login\\.php",
"'\\%20\\+\\%20liker\\.(avatar|profile)_URL\\%20\\+\\%20'",
"\\%22\\%20\\+\\%20$wrapper\\.data\\(",
"^http://.+\\.blogspot\\.(com|in|com\\.au|co\\.uk|jp|co\\.nz|ca|de|it|fr|se|sg|es|pt|com\\.br|ar|mx|kr)/(search(\\?|/label/)|\\d{4,4}/\\d{2,2}/CSI/$)",
"livejournal\\.com/ljcounter/?\\?",
"\\?replyto=[0-9]+",
"[\\?&]mode=reply",
"xiti\\.com/hit\\.xiti\\?",
"/stats\\.g\\.doubleclick\\.net/dc\\.js$",
"/jetpack-comment/\\?",
"\\?like_comment=\\d+",
"^https?://.+/.+/disqus\\.com/forums/$",
"(\\?|%5Cx26)route=(/page/:page|/archive/:year/:month|/tagged/:tag|/post/:id|/image/:post_id)",
"%5Cx26route=/archive",
"^http://\\d+\\.media\\.tumblr\\.com/avatar_.+_16\\.png$",
"^http://www\\.livejournal\\.com/(tools/memadd|update|login)\\.bml\\?",
"^http://[^\\.]+\\.livejournal\\.com/.+[\\?&]mode=reply",
"^http://[^\\.]+\\.livejournal\\.com/.+/\\*sup_ru/ru/UTF-8/",
"^http://[^\\.]+\\.livejournal\\.com/.+http://[^\\.]+\\.livejournal\\.com/",
"^http://[^\\.]+\\.livejournal\\.com/.+/stats\\.g\\.doubleclick\\.net/dc\\.js$",
"^https?://www\\.dreamwidth\\.org/tools/(memadd|tellafriend)\\?",
"^https?://[^\\.]+\\.dreamwidth\\.org/.+[\\?&]mode=reply",
// global.json
//"/(.*)/(\\1/){3,}",
"%25252525",
"/App_Themes/.+/App_Themes/",
"/bxSlider/.+/bxSlider/",
"/bxSlider/bxSlider/",
"/slides/slides/.+/slides/",
"/slides/.+/slides/slides/",
"/slides/slides/slides/",
"/js/js/.+/js/",
"/js/.+/js/js/",
"/js/js/js/",
"/css/css/.+/css/",
"/css/.+/css/css/",
"/css/css/css/",
"/styles/styles/.+/styles/",
"/styles/.+/styles/styles/",
"/styles/styles/styles/",
"/scripts/scripts/.+/scripts/",
"/scripts/.+/scripts/scripts/",
"/scripts/scripts/scripts/",
"/images/images/.+/images/",
"/images/.+/images/images/",
"/images/images/images/",
"/img/img/.+/img/",
"/img/.+/img/img/",
"/img/img/img/",
"/clientscript/clientscript/.+/clientscript/",
"/clientscript/.+/clientscript/clientscript/",
"/clientscript/clientscript/clientscript/",
"/lib/exe/.*lib[-_]exe[-_]lib[-_]exe[-_]",
"/(%5C)+(%22|%27)",
"/%5C/%5C/",
"/%27\\+[^/]+\\+%27",
"/%22\\+[^/]+\\+%22",
"/%27%20\\+[^/]+\\+%20%27",
"/%22%20\\+[^/]+\\+%20%22",
"/\\\\+(%22|%27)",
"/\\\\+[\"']",
"/\\\\/\\\\/",
"/'\\+[^/]+\\+'",
"^https?://localhost(:\\d+)?/",
"^https?://(127|10)\\.\\d+\\.\\d+\\.\\d+(:\\d+)?/",
"^https?://172\\.(1[6-9]|2\\d|3[01])\\.\\d+\\.\\d+(:\\d+)?/",
"^https?://192\\.168\\.\\d+\\.\\d+(:\\d+)?/",
"^https?://www\\.google\\.com/recaptcha/api",
"^https?://geo\\.yahoo\\.com/b\\?",
"^https?://((s-)?static\\.ak\\.fbcdn\\.net|(connect\\.|www\\.)?facebook\\.com)/connect\\.php/js/.*rsrc\\.php",
"^https?://www\\.flickr\\.com/change_language\\.gne",
"^https?://((www|web|web-beta|wayback)\\.)?archive\\.org/",
"^https?://www\\.google\\.((com|ad|ae|al|am|as|at|az|ba|be|bf|bg|bi|bj|bs|bt|by|ca|cd|cf|cg|ch|ci|cl|cm|cn|cv|cz|de|dj|dk|dm|dz|ee|es|fi|fm|fr|ga|ge|gg|gl|gm|gp|gr|gy|hn|hr|ht|hu|ie|im|iq|is|it|je|jo|ki|kg|kz|la|li|lk|lt|lu|lv|md|me|mg|mk|ml|mn|ms|mu|mv|mw|ne|nl|no|nr|nu|pl|pn|ps|pt|ro|ru|rw|sc|se|sh|si|sk|sn|so|sm|sr|st|td|tg|tk|tl|tm|tn|to|tt|vg|vu|ws|rs|cat)|(com\\.(af|ag|ai|ar|au|bd|bh|bn|bo|br|bz|co|cu|cy|do|ec|eg|et|fj|gh|gi|gt|hk|jm|kh|kw|lb|ly|mm|mt|mx|my|na|nf|ng|ni|np|om|pa|pe|pg|ph|pk|pr|py|qa|sa|sb|sg|sl|sv|tj|tr|tw|ua|uy|vc|vn))|(co\\.(ao|bw|ck|cr|id|il|in|jp|ke|kr|ls|ma|mz|nz|th|tz|ug|uk|uz|ve|vi|za|zm|zw)))/finance\\?noIL=1&q=[^&]+&ei=",
"^https?://upload\\.wikimedia\\.org/wikipedia/[^/]+/thumb/",
"^http://b\\.scorecardresearch\\.com/",
"^http://i\\.dev\\.cdn\\.turner\\.com/",
"^http://video-subtitle\\.tedcdn\\.com/",
"^http://download\\.ted\\.com/",
"^http://msft\\.digitalrivercontent\\.net/win/.+\\.iso",
"^https?://tmz\\.vo\\.llnwd\\.net/",
"^https?://(www\\.)?megaupload\\.com/",
"^https?://(www\\.)?filesonic\\.com/",
"^https?://(www\\.)?wupload\\.com/",
"^https?://prod-preview\\.wired\\.com/",
"^http://([^\\./]+\\.)?stream\\.publicradio\\.org/",
"^http://icecast\\.streaming\\.castor\\.nl/",
"^http://wm1\\.streaming\\.castor\\.nl:8000/",
"^http://icecast\\.databoss\\.nl:8000/",
"^http://stream\\.rynothebearded\\.com:8000/",
"^http://mp3\\.live\\.tv-radio\\.com/",
"^http://av\\.rasset\\.ie/av/live/",
"^http://gcnplayer\\.gcnlive\\.com/.+",
"^http://streaming\\.radionomy\\.com/",
"^http://mp3\\.ffh\\.de/",
"^http://(www\\.)?theradio\\.cc\\:8000/",
"^http://(audio\\d?|nfw)\\.video\\.ria\\.ru/",
"^http://eu1\\.fastcast4u\\.com:3048/",
"^http://[^\\./]+\\.radioscoop\\.(com|net):\\d+/",
"^http://[^\\./]+\\.streamchan\\.org:\\d+/",
"^http://[^/]*musicproxy\\.s12\\.de/",
"^http://stream\\.rfi\\.fr/",
"^http://striiming\\d?\\.trio\\.ee/",
"^http://streamer\\.radiocampus\\.be(:\\d+)?/",
"^http://relay\\.broadcastify\\.com/",
"^http://audio\\d?\\.radioreference\\.com/",
"^http://[^/]+\\.akadostream\\.ru(:\\d+)?/",
"^http://radio\\.silver\\.ru(:\\d+)?/",
"^http://icecast\\.szwoelf\\.com:8000/",
"^http://altair\\.micronick\\.com:8080/\\?action=stream",
"^http://94\\.25\\.53\\.13[1-4]/.+\\.mp3$",
"^http://server\\.lradio\\.ru:\\d+/",
"^http://188\\.93\\.17\\.201:8080/",
"^http://81\\.19\\.85\\.19[56]/.+\\.mp3$",
"^http://81\\.19\\.85\\.203/.+\\.mp3$",
"^http://play(\\d+)?\\.radio13\\.ru:8000/",
"^http://stream(\\d+)?\\.media\\.rambler\\.ru/",
"^http://pub(\\d+)?\\.di\\.fm/",
"^http://vostok\\.fmtuner\\.ru/",
"^http://109\\.120\\.141\\.181:8000/",
"^http://195\\.88\\.63\\.114:8000/",
"^http://radiosilver\\.corbina\\.net:8000/",
"^http://89\\.251\\.147\\.100/",
"^http://bcs\\d?\\.fontanka\\.fm:8000/",
"^http://stream2\\.cnmns\\.net/",
"^http://[^/]+\\.streamtheworld\\.com/",
"^http://[^/]+\\.gaduradio\\.pl/",
"^http://anka\\.org:8080/",
"^http://radio\\.visionotaku\\.com:8000/",
"^http://stream\\.r-a-d\\.io/",
"^http://r-a-d\\.io/.+\\.mp3$",
"^http://95\\.81\\.155\\.17/",
"^https?://icecast\\.rtl2?\\.fr/",
"^http://mp3tslg\\.tdf-cdn\\.com/",
"^http://[^/]+/anony/mjpg\\.cgi$",
"^https?://air\\.radiorecord\\.ru(:\\d+)?/",
"^https?://[^/]+\\.rastream\\.com(:\\d+)?/",
"^https?://audiots\\.scdn\\.arkena\\.com/",
"^https?://(www|draft)\\.blogger\\.com/(navbar\\.g|post-edit\\.g|delete-comment\\.g|comment-iframe\\.g|share-post\\.g|email-post\\.g|blog-this\\.g|delete-backlink\\.g|rearrange|blog_this\\.pyra)\\?",
"^https?://www\\.tumblr\\.com/(impixu\\?|share(/link/?)?\\?|reblog/)",
"^https?://plus\\.google\\.com/share\\?",
"^https?://(apis|plusone)\\.google\\.com/_/\\+1/",
"^https?://(ssl\\.|www\\.)?reddit\\.com/(login\\?dest=|submit\\?|static/button/button)",
"^https?://digg\\.com/submit\\?",
"^https?://(www\\.)?facebook\\.com/(plugins/like(box)?\\.php|sharer/sharer\\.php|sharer?\\.php|dialog/(feed|share))\\?",
"^https?://(www\\.)?twitter\\.com/(share\\?|intent/((re)?tweet|favorite)|home/?\\?status=|\\?status=)",
"^https?://platform\\d?\\.twitter\\.com/widgets/tweet_button.html\\?",
"^https?://www\\.newsvine\\.com/_wine/save\\?",
"^https?://www\\.netvibes\\.com/subscribe\\.php\\?",
"^https?://add\\.my\\.yahoo\\.com/(rss|content)\\?",
"^http://www\\.addtoany\\.com/(add_to/|share_save\\?)",
"^https?://www\\.addthis\\.com/bookmark\\.php\\?",
"^https?://(www\\.)?pinterest\\.com/pin/create/",
"^https?://www\\.linkedin\\.com/(cws/share|shareArticle)\\?",
"^https?://(www\\.)?stumbleupon\\.com/(submit\\?|badge/embed/)",
"^https?://csp\\.cyworld\\.com/bi/bi_recommend_pop\\.php\\?",
"^https://share\\.flipboard\\.com/bookmarklet/popout\\?",
"^https?://flattr.com/submit/auto\\?",
"^https?://(www\\.)?myspace\\.com/Modules/PostTo/",
"^https?://www\\.google\\.com/bookmarks/mark\\?",
"^http://myweb2\\.search\\.yahoo\\.com/myresults/bookmarklet\\?",
"^http://vuible\\.com/pins-settings/",
"^https?://news\\.ycombinator\\.com/submitlink\\?",
"^http://reporter\\.es\\.msn\\.com/\\?fn=contribute",
"^http://www\\.blinklist\\.com/index\\.php\\?Action=Blink/addblink\\.php",
"^http://sphinn\\.com/index\\.php\\?c=post&m=submit&",
"^http://posterous\\.com/share\\?",
"^http://del\\.icio\\.us/post\\?",
"^https?://delicious\\.com/(save|post)\\?",
"^https?://(www\\.)?friendfeed\\.com/share\\?",
"^https?://(www\\.)?xing\\.com/(app/user\\?op=share|social_plugins/share\\?)",
"^http://iwiw\\.hu/pages/share/share\\.jsp\\?",
"^http://memori(\\.qip)?\\.ru/link/\\?",
"^http://wow\\.ya\\.ru/posts_(add|share)_link\\.xml\\?",
"^https?://connect\\.mail\\.ru/share\\?",
"^http://zakladki\\.yandex\\.ru/newlink\\.xml\\?",
"^https?://vkontakte\\.ru/share\\.php\\?",
"^https?://www\\.odnoklassniki\\.ru/dk\\?st\\.cmd=addShare",
"^https?://www\\.google\\.com/(reader/link\\?|buzz/post\\?)",
"^https?://service\\.weibo\\.com/share/share\\.php\\?",
"^https?://(www\\.)?technorati\\.com/faves/?\\?add=",
"^https?://bufferapp\\.com/add\\?",
"^https?://b\\.hatena\\.ne\\.jp/add\\?",
"^https?://api\\.addthis\\.com/",
"^https?://bookmark\\.naver\\.com/post\\?",
"^https?://mail\\.google\\.com/mail/",
"^http://pixel\\.blog\\.hu/",
"^https?://pixel\\.quantserve\\.com/",
"^http://b\\.scorecardresearch\\.com/",
"^https?://(www|ssl)\\.google-analytics\\.com/(r/)?(__utm\\.gif|collect\\?)",
"^https?://p\\.opt\\.fimserve\\.com/",
"^https?://(\\d|www|secure)\\.gravatar\\.com/avatar/ad516503a11cd5ca435acc9bb6523536",
"^https?://imageshack\\.com/lost$",
"^https?://[^/]+\\.corp\\.ne1\\.yahoo\\.com/",
"^https?://.+/js-agent\\.newrelic\\.com/nr-\\d{3,3}(\\.min)?\\.js$",
"^https?://.+/stats\\.g\\.doubleclick\\.net/dc\\.js$",
"^https?://.+/js/chartbeat\\.js$",
"^http://www\\.khaleejtimes\\.com/.+/kt_.+/kt_",
"^http://www\\.khaleejtimes\\.com/.+/images/.+/images/",
"^http://www\\.khaleejtimes\\.com/.+/imgactv/.+/imgactv/",
"^http://photobucket\\.com/.+/albums/.+/albums/",
"^https?://([^/]+\\.)?gdcvault\\.com(/.*/|/)(fonts(/.*/|/)fonts/|css(/.*/|/)css/|img(/.*/|/)img/)",
"^https://static\\.licdn\\.com/sc/p/com\\.linkedin\\.nux(:|%3A)nux-static-content(\\+|%2B)[\\d\\.]+/f/",
"^https?://www\\.flickr\\.com/(explore/|photos/[^/]+/(sets/\\d+/(page\\d+/)?)?)\\d+_[a-f0-9]+(_[a-z])?\\.jpg$",
"^https?://static\\.licdn\\.com/sc/p/.+/f//",
"^http://www\\.warnerbros\\.com/\\d+$",
"^https?://tm\\.uol\\.com\\.br/h/.+/h/",
"^https?://media\\.opb\\.org/clips/embed/.+\\.js$",
// twitter.json
"^https?://((?:www|mobile)\\.)?twitter\\.com/.+\\?(?:id|lang|locale|screen_name)=",
"^https?://mobile\\.twitter\\.com/i/anonymize\\?data=",
// imdb.json
"^http://b\\.scorecardresearch\\.com/",
"^http://ad\\.doubleclick\\.net/",
"^http://www\\.imdb\\.com/rd/",
"^http://www\\.imdb\\.com/.+\\?ref_=",
"^http://www\\.imdb\\.com/.+/board/flat/",
"^http://www\\.imdb\\.com/.+/board/inline/",
"^http://www\\.imdb\\.com/.+/board/thread/",
"^http://www\\.imdb\\.com/help/boards_posting\\.html",
"^http://www\\.imdb\\.com/register/",
"^http://www\\.imdb\\.com/.+/board/.+/\\d+\\?d=",
"^http://www\\.imdb\\.com/.+/videogallery/.+/.+/",
// facebook.json
"^https?://error\\.facebook\\.com/common/scribe_endpoint\\.php\\?c=",
"^https?://www\\.facebook\\.com/[^/]+/(posts/|app_)[^/]+\\?(ref=page_internal&)?_fb_noscript=",
"^https?://www\\.facebook\\.com/[^/]+/photos/(pb|a)\\.[^/]+/[^/]+/.{4,4}/",
"^https?://www\\.facebook\\.com/[^/]+/photos/(pb|a)\\.[^/]+/[^/]+/\\?type=",
// internetcentrum.json
"%3Bamp%3Bamp",
"&action=edit",
"action=(?:komentar|send)",
"action=(?:multiple_products_add_product|notify|add_product|buy_now)",
"&action=submit",
"&amp;action=edit",
"amp;amp;",
"answer=.+?&anksent=true",
"[a-z0-9]=(?:off|on)",
"blog=1&disp=msgform",
"\\?cal=",
"calendar_menu/calendar\\.php",
"calendar_menu/event\\.php",
"calendar\\.php",
"calendar_scheduler\\.php",
"captcha.php",
"cas12&cas12",
"comment\\.php\\?akce=new",
"/comment/reply/\\d+",
"cPath=.+&sort=.+",
"destination=node/%2F\\d+",
"destination=node/\\d+",
"(?:displayimage|thumbnails)\\.php\\?pos=-\\d+",
"file=posting.+mode=quote",
"&highlight=&",
"^http://harizzzma\\.com",
"^http://www.nahraj.net/",
"index.*\\.php\\?option=com_eventcal",
"index.php\\?site=calendar",
"index\\.php\\?site=guestbook&type=(?:ASC|DESC)",
"index.php/Speci%C3%A1ln%C3%AD",
"index.php\\?title=Diskuse:",
"index.php\\?title=MediaWiki_diskuse:",
"index.php\\?title=Soubor_diskuse",
"index.php\\?title=Speci%C3%A1ln%C3%AD",
"index\\.php\\?\\w+&rok=(1995|2016)&mesic=\\d+&autor=\\d+$",
"index\\.php\\?.+year=198.",
"index\\.php\\?.+year=203.",
"kalendar-akci",
"kalendar\\.php",
"kalendarrok=\\d{4}",
//"lang=(?!czech|english)",
//"language=(?!cs|en)",
"LightNEasy\\.php\\?do=login",
"limit=.+limit=.+",
"login=",
"login\\.php",
"(?:login|registrace|live\\?)",
"mact=Calendar",
"main_page=(?:product_reviews_write|login|cookie_usage)",
"memberlist\\.php\\?mode=email",
"memberlist\\.php\\?mode=.+order=",
"(?:memberlist|viewprofile|viewtopic)\\.php\\?.*sk=.&sd=.",
"mini.+calendar",
"mm=\\d+.+yy=\\d{4}",
"mode=(?:lostpassword|sendpassword)",
"modules.+name=Forums.+view=(?:next|previous)",
"modules\\.php\\?name=coppermine.*file=displayimage.+&slideshow=\\d+",
"modules\\.php\\?name=coppermine.*meta=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)",
"modules\\.php\\?name=Statistics",
"mo=\\d+.+ye=\\d{4}",
"name=Kalender",
"name=Statistics",
"option=com_jcalpro.+date=\\d{4}-",
"\\?option=com.+&month=.+&year=\\d{4}",
"option=&Itemid=.+&date=\\d{4}-",
//"order=(?!1)",
"orderby=(?:name|note|count|news)",
"photo.php\\?i=-\\d+",
"/photos.+\\?url=",
".*\\..*\\..*\\.pl",
"p=ordersBasket.+sOption=add",
"portal\\.php\\?month=[\\d]+",
"postdays=0&postorder=asc",
"prev_next=(?:prev|next)",
"/calendar/",
"product_reviews_write\\.php\\?",
"profile\\.php\\?mode=email",
"profile\\.php\\?mode=register",
"\\?q=event.+/(?:day|list|month|table|week)/all/all",
"random_num=\\d+",
"Recentchangeslinked/",
"report\\.php\\?f=.+",
"search_id=mini_cal&d=\\d+",
"SESSION_ID=",
"showcal\\.php",
"site=guestbook.+type=(?:ASC|DESC)",
//"/sites/all/(sites|modules|libraries|scripts|themes)/.+/\\1",
"Souprava=.+Souprava=.+",
"Special:Whatlinkshere",
"start-index=-\\d+",
"/switchuilocale/",
"target[xy]=.+target[xy]=.+",
"tellafriend\\.php",
":Userlogin&",
"user/(?:register|login)",
"viewtopic\\.php\\?.*highlight=",
"viewtopic\\.php\\?p=\\d+",
"viewtopic\\.php\\?.+view=print",
"y=\\d{4}&m=\\d+",
// forums.json
"/cron\\.php\\?",
"/external\\.php\\?type=rss",
"/login\\.php\\?",
"/newreply\\.php\\?",
"/private\\.php\\?",
"/privmsg\\.php\\?",
"/register\\.php\\?",
"/sendmessage\\.php\\?",
"/subscription\\.php\\?",
"/posting\\.php\\?",
"/viewtopic\\.php\\?.+&view=(next|previous)",
"/viewtopic\\.php\\?.+&hilit=",
"/feed\\.php\\?",
"/index\\.php\\?option=com_mailto",
"&view=login&return=",
"&format=opensearch",
"/misc\\.php\\?do=whoposted",
"/newthread\\.php\\?",
"/post_thanks\\.php\\?",
"/blog_post\\.php\\?do=newblog",
"/forumdisplay\\.php.*[\\?&]do=markread",
"/userpoll/vote\\.php\\?",
"/showthread\\.php.*[\\?&]goto=(next(old|new)est|newpost)",
"/editpost\\.php\\?",
"/\\?view=getlastpost$",
"/index\\.php\\?sharelink=",
"/ucp\\.php\\?mode=delete_cookies",
}
......@@ -3,6 +3,7 @@ package crawl
import (
"fmt"
"net/url"
"regexp"
"strings"
)
......@@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope {
}
return NewURLPrefixScope(pfx)
}
// regexpIgnoreScope is a scope that rejects any URL matching one of a list
// of regular expressions.
type regexpIgnoreScope struct {
	ignores []*regexp.Regexp
}

// Check reports whether uri is in scope, i.e. whether its string form
// matches none of the ignore patterns. The depth argument is not used.
func (s *regexpIgnoreScope) Check(uri *url.URL, depth int) bool {
	target := uri.String()
	for idx := range s.ignores {
		if s.ignores[idx].MatchString(target) {
			return false
		}
	}
	return true
}
// NewRegexpIgnoreScope returns a Scope that rejects URLs matching any of the
// given regular expressions. Passing nil selects defaultIgnorePatterns; an
// empty non-nil slice yields a scope that rejects nothing. It panics if a
// pattern fails to compile.
func NewRegexpIgnoreScope(ignores []string) Scope {
	patterns := ignores
	if patterns == nil {
		patterns = defaultIgnorePatterns
	}

	compiled := make([]*regexp.Regexp, len(patterns))
	for idx, p := range patterns {
		compiled[idx] = regexp.MustCompile(p)
	}
	return &regexpIgnoreScope{ignores: compiled}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment