Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
crawl
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ale
crawl
Commits
63bd51e0
Commit
63bd51e0
authored
9 years ago
by
ale
Browse files
Options
Downloads
Patches
Plain Diff
add ignore list from ArchiveBot
parent
aa6e67d7
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
cmd/crawl/crawl.go
+73
-1
73 additions, 1 deletion
cmd/crawl/crawl.go
gen-ignores.py
+31
-0
31 additions, 0 deletions
gen-ignores.py
ignore_patterns.go
+453
-0
453 additions, 0 deletions
ignore_patterns.go
scope.go
+28
-0
28 additions, 0 deletions
scope.go
with
585 additions
and
1 deletion
cmd/crawl/crawl.go
+
73
−
1
View file @
63bd51e0
...
...
@@ -13,6 +13,9 @@ import (
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"git.autistici.org/ale/crawl"
"git.autistici.org/ale/crawl/analysis"
...
...
@@ -115,6 +118,74 @@ func NewSaveHandler(w *warc.Writer) crawl.Handler {
}
}
// crawlStats accumulates simple progress counters for a crawl: the
// number of body bytes read so far and how many responses were seen
// per HTTP status code.
type crawlStats struct {
	// bytes is the running total of response body bytes. It is
	// modified with sync/atomic rather than under lock, and is kept
	// as the first field so that it stays 64-bit aligned on 32-bit
	// platforms, as the atomic functions require.
	bytes int64

	// start is the time the counters were created; transfer rates
	// are computed relative to it.
	start time.Time

	// lock guards states.
	lock sync.Mutex

	// states maps an HTTP status code to the number of responses
	// received with that code.
	states map[int]int
}
func
(
c
*
crawlStats
)
Update
(
resp
*
http
.
Response
)
{
c
.
lock
.
Lock
()
defer
c
.
lock
.
Unlock
()
c
.
states
[
resp
.
StatusCode
]
++
resp
.
Body
=
&
byteCounter
{
resp
.
Body
}
}
func
(
c
*
crawlStats
)
UpdateBytes
(
n
int64
)
{
atomic
.
AddInt64
(
&
c
.
bytes
,
n
)
}
func
(
c
*
crawlStats
)
Dump
()
{
c
.
lock
.
Lock
()
defer
c
.
lock
.
Unlock
()
rate
:=
float64
(
c
.
bytes
)
/
time
.
Since
(
c
.
start
)
.
Seconds
()
/
1000
fmt
.
Fprintf
(
os
.
Stderr
,
"stats: downloaded %d bytes (%.4g KB/s), status: %v
\n
"
,
c
.
bytes
,
rate
,
c
.
states
)
}
var
(
stats
*
crawlStats
client
*
http
.
Client
)
func
fetch
(
urlstr
string
)
(
*
http
.
Response
,
error
)
{
resp
,
err
:=
client
.
Get
(
urlstr
)
if
err
==
nil
{
stats
.
Update
(
resp
)
}
return
resp
,
err
}
func
init
()
{
client
=
&
http
.
Client
{}
stats
=
&
crawlStats
{
states
:
make
(
map
[
int
]
int
),
start
:
time
.
Now
(),
}
go
func
()
{
for
range
time
.
Tick
(
10
*
time
.
Second
)
{
stats
.
Dump
()
}
}()
}
// byteCounter wraps an io.ReadCloser (in practice an HTTP response
// body). The wrapped value is embedded, so Close is forwarded to it
// unchanged.
type byteCounter struct {
	io.ReadCloser
}
func
(
b
*
byteCounter
)
Read
(
buf
[]
byte
)
(
int
,
error
)
{
n
,
err
:=
b
.
ReadCloser
.
Read
(
buf
)
if
n
>
0
{
stats
.
UpdateBytes
(
int64
(
n
))
}
return
n
,
err
}
func
main
()
{
flag
.
Parse
()
...
...
@@ -128,6 +199,7 @@ func main() {
crawl
.
NewSchemeScope
(
strings
.
Split
(
*
validSchemes
,
","
)),
crawl
.
NewDepthScope
(
*
depth
),
crawl
.
NewSeedScope
(
seeds
),
crawl
.
NewRegexpIgnoreScope
(
nil
),
}
w
:=
warc
.
NewWriter
(
outf
)
...
...
@@ -135,7 +207,7 @@ func main() {
saver
:=
NewSaveHandler
(
w
)
crawler
,
err
:=
crawl
.
NewCrawler
(
"crawldb"
,
seeds
,
scope
,
crawl
.
FetcherFunc
(
http
.
Get
),
crawl
.
NewRedirectHandler
(
saver
))
crawler
,
err
:=
crawl
.
NewCrawler
(
"crawldb"
,
seeds
,
scope
,
crawl
.
FetcherFunc
(
fetch
),
crawl
.
NewRedirectHandler
(
saver
))
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
...
...
This diff is collapsed.
Click to expand it.
gen-ignores.py
0 → 100755
+
31
−
0
View file @
63bd51e0
#!/usr/bin/python
#
# Parse ArchiveBot ignore regexp patterns and generate a Go source
# file with a global variable including all of them.
#
# Invoke with a single argument, the location of a checked-out copy of
# https://github.com/ArchiveTeam/ArchiveBot/tree/master/db/ignore_patterns.
#
import
glob
import
json
import
os
import
sys
archivebot_ignore_path
=
sys
.
argv
[
1
]
print
'
package crawl
\n\n
var defaultIgnorePatterns = []string{
'
for
fn
in
glob
.
glob
(
os
.
path
.
join
(
archivebot_ignore_path
,
'
*.json
'
)):
try
:
with
open
(
fn
)
as
fd
:
print
'
\n\t
// %s
'
%
os
.
path
.
basename
(
fn
)
for
p
in
json
.
load
(
fd
)[
'
patterns
'
]:
if
'
\\\\
1
'
in
p
or
'
(?!
'
in
p
:
# RE2 does not support backreferences or other
# fancy PCRE constructs. This excludes <10
# patterns from the ignore list.
continue
print
'
\t
%s,
'
%
json
.
dumps
(
p
)
except
Exception
,
e
:
print
>>
sys
.
stderr
,
'
error in %s: %s
'
%
(
fn
,
e
)
print
'
}
'
This diff is collapsed.
Click to expand it.
ignore_patterns.go
0 → 100644
+
453
−
0
View file @
63bd51e0
package
crawl
var
defaultIgnorePatterns
=
[]
string
{
// WordPress.
"wp-login
\\
.php"
,
"/wp-admin/"
,
"/xmlrpc
\\
.php"
,
// googleplus.json
"^https?://accounts
\\
.google
\\
.com/ServiceLogin"
,
"^https?://accounts
\\
.google
\\
.com/SignUp"
,
"^https?://lh4
\\
.googleusercontent
\\
.com/proxy/[^/]+"
,
"^https?://plus
\\
.google
\\
.com/_/scs/apps-static/"
,
// mediawiki.json
"[
\\
?&]oldid=
\\
d+"
,
"[
\\
?&]curid=
\\
d+"
,
"[
\\
?&]limit=(20|100|250|500)"
,
"[
\\
?&]hide(minor|bots|anons|liu|myself|redirs|links|trans|patrolled)="
,
"([
\\
?&]title=|/)Special:(UserLogin|UserLogout|Translate|MobileFeedback|MobileOptions|RecentChangesLinked|Diff|MobileDiff)"
,
"([
\\
?&]title=|/)Special:RecentChanges&from=
\\
d+"
,
"([
\\
?&]title=|/)Special:ListFiles&dir=prev&offset=
\\
d+"
,
"([
\\
?&]title=|/)Special:(ListFiles|PrefixIndex).*&"
,
"([
\\
?&]title=|/)Special:ListFiles.*&user="
,
"([
\\
?&]title=|/)Special:Log/"
,
"[
\\
?&]action=edit&section=(
\\
d+|new)"
,
"[
\\
?&]feed(format)?=atom"
,
"[
\\
?&]redlink=1"
,
"[
\\
?&]printable=yes"
,
"[
\\
?&]mobileaction="
,
"[
\\
?&]undo(after)?=
\\
d+"
,
"^http://a
\\
.wikia-beacon
\\
.com/__track/"
,
"/User_talk:.+/User_talk:"
,
"/User_blog:.+/User_blog:"
,
"/User:.+/User:"
,
// nosortedindex.json
"
\\
?C=[NMSD];O=[AD]$"
,
// coppermine.json
"(?:displayimage|thumbnails)
\\
.php[?&]album=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)"
,
"ratepic
\\
.php"
,
"addfav
\\
.php
\\
?.*ref=displayimage
\\
.php"
,
"displayimage
\\
.php
\\
?.*slideshow=
\\
d+"
,
// youtube.json
"^https?://accounts
\\
.google
\\
.com/ServiceLogin"
,
"
\\
.?youtube
\\
.com/user/[^/]+/(playlists|channels|videos)
\\
?(flow|view|sort|live_view)="
,
// reddit.json
"^https?://www
\\
.reddit
\\
.com/gold
\\
?goldtype="
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/[a-z0-9]+"
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/comments/[a-z0-9]+.*
\\
?sort="
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/
\\
.compact"
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/(top|new|rising|controversial|gilded|ads)/.+[
\\
?&]after="
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/related/"
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/(gilded)?
\\
.mobile
\\
?"
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/search/?
\\
?"
,
"^https?://www
\\
.reddit
\\
.com/r/[^/]+/wiki/(revisions|discussions)/user/.+"
,
"^https?://www
\\
.reddit
\\
.com/user/[^/]+/(comments/)?.+[
\\
?&]sort="
,
"^https?://www
\\
.reddit
\\
.com/.+/
\\
.rss$"
,
"^https?://simple
\\
.reddit
\\
.com/"
,
"^https?://pixel
\\
.redditmedia
\\
.com/pixel/"
,
"
\\
.reddit
\\
.com/message/compose/?
\\
?"
,
"^https?://m
\\
.reddit
\\
.com/"
,
// nogravatar.json
"^https?://(
\\
d|secure)
\\
.gravatar
\\
.com/avatar/"
,
// meetupeverywhere.json
"^https?://.*
\\
.meetup
\\
.com/login/"
,
// pinterest.json
"^https?://www
\\
.pinterest
\\
.com/[^/]+/
\\
^/[^/]+/"
,
"^https?://www
\\
.pinterest
\\
.com/[^/]+/[^/]+/
\\
^/[^/]+/"
,
"^https?://www
\\
.pinterest
\\
.com/[^/]+/[^/]+
\\
.[^/]+"
,
"^https?://www
\\
.pinterest
\\
.com/[^/]+/[^/]+/[^/]+
\\
.[^/]+"
,
"^https?://www
\\
.pinterest
\\
.com/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)
\\
.js"
,
"^https?://www
\\
.pinterest
\\
.com/[^/]+/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)
\\
.js"
,
// noonion.json
// blogs.json
"[
\\
?&]replytocom="
,
"[
\\
?&]share="
,
"/page/%d/$"
,
"
\\
?showComment(=|%5C)"
,
"/quote-comment-
\\
d+/$"
,
"/wp-login
\\
.php
\\
?"
,
"^https?://r
\\
-login
\\
.wordpress
\\
.com/remote
\\
-login
\\
.php"
,
"'
\\
%20
\\
+
\\
%20liker
\\
.(avatar|profile)_URL
\\
%20
\\
+
\\
%20'"
,
"
\\
%22
\\
%20
\\
+
\\
%20$wrapper
\\
.data
\\
("
,
"^http://.+
\\
.blogspot
\\
.(com|in|com
\\
.au|co
\\
.uk|jp|co
\\
.nz|ca|de|it|fr|se|sg|es|pt|com
\\
.br|ar|mx|kr)/(search(
\\
?|/label/)|
\\
d{4,4}/
\\
d{2,2}/CSI/$)"
,
"livejournal
\\
.com/ljcounter/?
\\
?"
,
"
\\
?replyto=[0-9]+"
,
"[
\\
?&]mode=reply"
,
"xiti
\\
.com/hit
\\
.xiti
\\
?"
,
"/stats
\\
.g
\\
.doubleclick
\\
.net/dc
\\
.js$"
,
"/jetpack-comment/
\\
?"
,
"
\\
?like_comment=
\\
d+"
,
"^https?://.+/.+/disqus
\\
.com/forums/$"
,
"(
\\
?|%5Cx26)route=(/page/:page|/archive/:year/:month|/tagged/:tag|/post/:id|/image/:post_id)"
,
"%5Cx26route=/archive"
,
"^http://
\\
d+
\\
.media
\\
.tumblr
\\
.com/avatar_.+_16
\\
.png$"
,
"^http://www
\\
.livejournal
\\
.com/(tools/memadd|update|login)
\\
.bml
\\
?"
,
"^http://[^
\\
.]+
\\
.livejournal
\\
.com/.+[
\\
?&]mode=reply"
,
"^http://[^
\\
.]+
\\
.livejournal
\\
.com/.+/
\\
*sup_ru/ru/UTF-8/"
,
"^http://[^
\\
.]+
\\
.livejournal
\\
.com/.+http://[^
\\
.]+
\\
.livejournal
\\
.com/"
,
"^http://[^
\\
.]+
\\
.livejournal
\\
.com/.+/stats
\\
.g
\\
.doubleclick
\\
.net/dc
\\
.js$"
,
"^https?://www
\\
.dreamwidth
\\
.org/tools/(memadd|tellafriend)
\\
?"
,
"^https?://[^
\\
.]+
\\
.dreamwidth
\\
.org/.+[
\\
?&]mode=reply"
,
// global.json
//"/(.*)/(\\1/){3,}",
"%25252525"
,
"/App_Themes/.+/App_Themes/"
,
"/bxSlider/.+/bxSlider/"
,
"/bxSlider/bxSlider/"
,
"/slides/slides/.+/slides/"
,
"/slides/.+/slides/slides/"
,
"/slides/slides/slides/"
,
"/js/js/.+/js/"
,
"/js/.+/js/js/"
,
"/js/js/js/"
,
"/css/css/.+/css/"
,
"/css/.+/css/css/"
,
"/css/css/css/"
,
"/styles/styles/.+/styles/"
,
"/styles/.+/styles/styles/"
,
"/styles/styles/styles/"
,
"/scripts/scripts/.+/scripts/"
,
"/scripts/.+/scripts/scripts/"
,
"/scripts/scripts/scripts/"
,
"/images/images/.+/images/"
,
"/images/.+/images/images/"
,
"/images/images/images/"
,
"/img/img/.+/img/"
,
"/img/.+/img/img/"
,
"/img/img/img/"
,
"/clientscript/clientscript/.+/clientscript/"
,
"/clientscript/.+/clientscript/clientscript/"
,
"/clientscript/clientscript/clientscript/"
,
"/lib/exe/.*lib[-_]exe[-_]lib[-_]exe[-_]"
,
"/(%5C)+(%22|%27)"
,
"/%5C/%5C/"
,
"/%27
\\
+[^/]+
\\
+%27"
,
"/%22
\\
+[^/]+
\\
+%22"
,
"/%27%20
\\
+[^/]+
\\
+%20%27"
,
"/%22%20
\\
+[^/]+
\\
+%20%22"
,
"/
\\\\
+(%22|%27)"
,
"/
\\\\
+[
\"
']"
,
"/
\\\\
/
\\\\
/"
,
"/'
\\
+[^/]+
\\
+'"
,
"^https?://localhost(:
\\
d+)?/"
,
"^https?://(127|10)
\\
.
\\
d+
\\
.
\\
d+
\\
.
\\
d+(:
\\
d+)?/"
,
"^https?://172
\\
.(1[6-9]|2
\\
d|3[01])
\\
.
\\
d+
\\
.
\\
d+(:
\\
d+)?/"
,
"^https?://192
\\
.168
\\
.
\\
d+
\\
.
\\
d+(:
\\
d+)?/"
,
"^https?://www
\\
.google
\\
.com/recaptcha/api"
,
"^https?://geo
\\
.yahoo
\\
.com/b
\\
?"
,
"^https?://((s-)?static
\\
.ak
\\
.fbcdn
\\
.net|(connect
\\
.|www
\\
.)?facebook
\\
.com)/connect
\\
.php/js/.*rsrc
\\
.php"
,
"^https?://www
\\
.flickr
\\
.com/change_language
\\
.gne"
,
"^https?://((www|web|web-beta|wayback)
\\
.)?archive
\\
.org/"
,
"^https?://www
\\
.google
\\
.((com|ad|ae|al|am|as|at|az|ba|be|bf|bg|bi|bj|bs|bt|by|ca|cd|cf|cg|ch|ci|cl|cm|cn|cv|cz|de|dj|dk|dm|dz|ee|es|fi|fm|fr|ga|ge|gg|gl|gm|gp|gr|gy|hn|hr|ht|hu|ie|im|iq|is|it|je|jo|ki|kg|kz|la|li|lk|lt|lu|lv|md|me|mg|mk|ml|mn|ms|mu|mv|mw|ne|nl|no|nr|nu|pl|pn|ps|pt|ro|ru|rw|sc|se|sh|si|sk|sn|so|sm|sr|st|td|tg|tk|tl|tm|tn|to|tt|vg|vu|ws|rs|cat)|(com
\\
.(af|ag|ai|ar|au|bd|bh|bn|bo|br|bz|co|cu|cy|do|ec|eg|et|fj|gh|gi|gt|hk|jm|kh|kw|lb|ly|mm|mt|mx|my|na|nf|ng|ni|np|om|pa|pe|pg|ph|pk|pr|py|qa|sa|sb|sg|sl|sv|tj|tr|tw|ua|uy|vc|vn))|(co
\\
.(ao|bw|ck|cr|id|il|in|jp|ke|kr|ls|ma|mz|nz|th|tz|ug|uk|uz|ve|vi|za|zm|zw)))/finance
\\
?noIL=1&q=[^&]+&ei="
,
"^https?://upload
\\
.wikimedia
\\
.org/wikipedia/[^/]+/thumb/"
,
"^http://b
\\
.scorecardresearch
\\
.com/"
,
"^http://i
\\
.dev
\\
.cdn
\\
.turner
\\
.com/"
,
"^http://video-subtitle
\\
.tedcdn
\\
.com/"
,
"^http://download
\\
.ted
\\
.com/"
,
"^http://msft
\\
.digitalrivercontent
\\
.net/win/.+
\\
.iso"
,
"^https?://tmz
\\
.vo
\\
.llnwd
\\
.net/"
,
"^https?://(www
\\
.)?megaupload
\\
.com/"
,
"^https?://(www
\\
.)?filesonic
\\
.com/"
,
"^https?://(www
\\
.)?wupload
\\
.com/"
,
"^https?://prod-preview
\\
.wired
\\
.com/"
,
"^http://([^
\\
./]+
\\
.)?stream
\\
.publicradio
\\
.org/"
,
"^http://icecast
\\
.streaming
\\
.castor
\\
.nl/"
,
"^http://wm1
\\
.streaming
\\
.castor
\\
.nl:8000/"
,
"^http://icecast
\\
.databoss
\\
.nl:8000/"
,
"^http://stream
\\
.rynothebearded
\\
.com:8000/"
,
"^http://mp3
\\
.live
\\
.tv-radio
\\
.com/"
,
"^http://av
\\
.rasset
\\
.ie/av/live/"
,
"^http://gcnplayer
\\
.gcnlive
\\
.com/.+"
,
"^http://streaming
\\
.radionomy
\\
.com/"
,
"^http://mp3
\\
.ffh
\\
.de/"
,
"^http://(www
\\
.)?theradio
\\
.cc
\\
:8000/"
,
"^http://(audio
\\
d?|nfw)
\\
.video
\\
.ria
\\
.ru/"
,
"^http://eu1
\\
.fastcast4u
\\
.com:3048/"
,
"^http://[^
\\
./]+
\\
.radioscoop
\\
.(com|net):
\\
d+/"
,
"^http://[^
\\
./]+
\\
.streamchan
\\
.org:
\\
d+/"
,
"^http://[^/]*musicproxy
\\
.s12
\\
.de/"
,
"^http://stream
\\
.rfi
\\
.fr/"
,
"^http://striiming
\\
d?
\\
.trio
\\
.ee/"
,
"^http://streamer
\\
.radiocampus
\\
.be(:
\\
d+)?/"
,
"^http://relay
\\
.broadcastify
\\
.com/"
,
"^http://audio
\\
d?
\\
.radioreference
\\
.com/"
,
"^http://[^/]+
\\
.akadostream
\\
.ru(:
\\
d+)?/"
,
"^http://radio
\\
.silver
\\
.ru(:
\\
d+)?/"
,
"^http://icecast
\\
.szwoelf
\\
.com:8000/"
,
"^http://altair
\\
.micronick
\\
.com:8080/
\\
?action=stream"
,
"^http://94
\\
.25
\\
.53
\\
.13[1-4]/.+
\\
.mp3$"
,
"^http://server
\\
.lradio
\\
.ru:
\\
d+/"
,
"^http://188
\\
.93
\\
.17
\\
.201:8080/"
,
"^http://81
\\
.19
\\
.85
\\
.19[56]/.+
\\
.mp3$"
,
"^http://81
\\
.19
\\
.85
\\
.203/.+
\\
.mp3$"
,
"^http://play(
\\
d+)?
\\
.radio13
\\
.ru:8000/"
,
"^http://stream(
\\
d+)?
\\
.media
\\
.rambler
\\
.ru/"
,
"^http://pub(
\\
d+)?
\\
.di
\\
.fm/"
,
"^http://vostok
\\
.fmtuner
\\
.ru/"
,
"^http://109
\\
.120
\\
.141
\\
.181:8000/"
,
"^http://195
\\
.88
\\
.63
\\
.114:8000/"
,
"^http://radiosilver
\\
.corbina
\\
.net:8000/"
,
"^http://89
\\
.251
\\
.147
\\
.100/"
,
"^http://bcs
\\
d?
\\
.fontanka
\\
.fm:8000/"
,
"^http://stream2
\\
.cnmns
\\
.net/"
,
"^http://[^/]+
\\
.streamtheworld
\\
.com/"
,
"^http://[^/]+
\\
.gaduradio
\\
.pl/"
,
"^http://anka
\\
.org:8080/"
,
"^http://radio
\\
.visionotaku
\\
.com:8000/"
,
"^http://stream
\\
.r-a-d
\\
.io/"
,
"^http://r-a-d
\\
.io/.+
\\
.mp3$"
,
"^http://95
\\
.81
\\
.155
\\
.17/"
,
"^https?://icecast
\\
.rtl2?
\\
.fr/"
,
"^http://mp3tslg
\\
.tdf-cdn
\\
.com/"
,
"^http://[^/]+/anony/mjpg
\\
.cgi$"
,
"^https?://air
\\
.radiorecord
\\
.ru(:
\\
d+)?/"
,
"^https?://[^/]+
\\
.rastream
\\
.com(:
\\
d+)?/"
,
"^https?://audiots
\\
.scdn
\\
.arkena
\\
.com/"
,
"^https?://(www|draft)
\\
.blogger
\\
.com/(navbar
\\
.g|post-edit
\\
.g|delete-comment
\\
.g|comment-iframe
\\
.g|share-post
\\
.g|email-post
\\
.g|blog-this
\\
.g|delete-backlink
\\
.g|rearrange|blog_this
\\
.pyra)
\\
?"
,
"^https?://www
\\
.tumblr
\\
.com/(impixu
\\
?|share(/link/?)?
\\
?|reblog/)"
,
"^https?://plus
\\
.google
\\
.com/share
\\
?"
,
"^https?://(apis|plusone)
\\
.google
\\
.com/_/
\\
+1/"
,
"^https?://(ssl
\\
.|www
\\
.)?reddit
\\
.com/(login
\\
?dest=|submit
\\
?|static/button/button)"
,
"^https?://digg
\\
.com/submit
\\
?"
,
"^https?://(www
\\
.)?facebook
\\
.com/(plugins/like(box)?
\\
.php|sharer/sharer
\\
.php|sharer?
\\
.php|dialog/(feed|share))
\\
?"
,
"^https?://(www
\\
.)?twitter
\\
.com/(share
\\
?|intent/((re)?tweet|favorite)|home/?
\\
?status=|
\\
?status=)"
,
"^https?://platform
\\
d?
\\
.twitter
\\
.com/widgets/tweet_button.html
\\
?"
,
"^https?://www
\\
.newsvine
\\
.com/_wine/save
\\
?"
,
"^https?://www
\\
.netvibes
\\
.com/subscribe
\\
.php
\\
?"
,
"^https?://add
\\
.my
\\
.yahoo
\\
.com/(rss|content)
\\
?"
,
"^http://www
\\
.addtoany
\\
.com/(add_to/|share_save
\\
?)"
,
"^https?://www
\\
.addthis
\\
.com/bookmark
\\
.php
\\
?"
,
"^https?://(www
\\
.)?pinterest
\\
.com/pin/create/"
,
"^https?://www
\\
.linkedin
\\
.com/(cws/share|shareArticle)
\\
?"
,
"^https?://(www
\\
.)?stumbleupon
\\
.com/(submit
\\
?|badge/embed/)"
,
"^https?://csp
\\
.cyworld
\\
.com/bi/bi_recommend_pop
\\
.php
\\
?"
,
"^https://share
\\
.flipboard
\\
.com/bookmarklet/popout
\\
?"
,
"^https?://flattr.com/submit/auto
\\
?"
,
"^https?://(www
\\
.)?myspace
\\
.com/Modules/PostTo/"
,
"^https?://www
\\
.google
\\
.com/bookmarks/mark
\\
?"
,
"^http://myweb2
\\
.search
\\
.yahoo
\\
.com/myresults/bookmarklet
\\
?"
,
"^http://vuible
\\
.com/pins-settings/"
,
"^https?://news
\\
.ycombinator
\\
.com/submitlink
\\
?"
,
"^http://reporter
\\
.es
\\
.msn
\\
.com/
\\
?fn=contribute"
,
"^http://www
\\
.blinklist
\\
.com/index
\\
.php
\\
?Action=Blink/addblink
\\
.php"
,
"^http://sphinn
\\
.com/index
\\
.php
\\
?c=post&m=submit&"
,
"^http://posterous
\\
.com/share
\\
?"
,
"^http://del
\\
.icio
\\
.us/post
\\
?"
,
"^https?://delicious
\\
.com/(save|post)
\\
?"
,
"^https?://(www
\\
.)?friendfeed
\\
.com/share
\\
?"
,
"^https?://(www
\\
.)?xing
\\
.com/(app/user
\\
?op=share|social_plugins/share
\\
?)"
,
"^http://iwiw
\\
.hu/pages/share/share
\\
.jsp
\\
?"
,
"^http://memori(
\\
.qip)?
\\
.ru/link/
\\
?"
,
"^http://wow
\\
.ya
\\
.ru/posts_(add|share)_link
\\
.xml
\\
?"
,
"^https?://connect
\\
.mail
\\
.ru/share
\\
?"
,
"^http://zakladki
\\
.yandex
\\
.ru/newlink
\\
.xml
\\
?"
,
"^https?://vkontakte
\\
.ru/share
\\
.php
\\
?"
,
"^https?://www
\\
.odnoklassniki
\\
.ru/dk
\\
?st
\\
.cmd=addShare"
,
"^https?://www
\\
.google
\\
.com/(reader/link
\\
?|buzz/post
\\
?)"
,
"^https?://service
\\
.weibo
\\
.com/share/share
\\
.php
\\
?"
,
"^https?://(www
\\
.)?technorati
\\
.com/faves/?
\\
?add="
,
"^https?://bufferapp
\\
.com/add
\\
?"
,
"^https?://b
\\
.hatena
\\
.ne
\\
.jp/add
\\
?"
,
"^https?://api
\\
.addthis
\\
.com/"
,
"^https?://bookmark
\\
.naver
\\
.com/post
\\
?"
,
"^https?://mail
\\
.google
\\
.com/mail/"
,
"^http://pixel
\\
.blog
\\
.hu/"
,
"^https?://pixel
\\
.quantserve
\\
.com/"
,
"^http://b
\\
.scorecardresearch
\\
.com/"
,
"^https?://(www|ssl)
\\
.google-analytics
\\
.com/(r/)?(__utm
\\
.gif|collect
\\
?)"
,
"^https?://p
\\
.opt
\\
.fimserve
\\
.com/"
,
"^https?://(
\\
d|www|secure)
\\
.gravatar
\\
.com/avatar/ad516503a11cd5ca435acc9bb6523536"
,
"^https?://imageshack
\\
.com/lost$"
,
"^https?://[^/]+
\\
.corp
\\
.ne1
\\
.yahoo
\\
.com/"
,
"^https?://.+/js-agent
\\
.newrelic
\\
.com/nr-
\\
d{3,3}(
\\
.min)?
\\
.js$"
,
"^https?://.+/stats
\\
.g
\\
.doubleclick
\\
.net/dc
\\
.js$"
,
"^https?://.+/js/chartbeat
\\
.js$"
,
"^http://www
\\
.khaleejtimes
\\
.com/.+/kt_.+/kt_"
,
"^http://www
\\
.khaleejtimes
\\
.com/.+/images/.+/images/"
,
"^http://www
\\
.khaleejtimes
\\
.com/.+/imgactv/.+/imgactv/"
,
"^http://photobucket
\\
.com/.+/albums/.+/albums/"
,
"^https?://([^/]+
\\
.)?gdcvault
\\
.com(/.*/|/)(fonts(/.*/|/)fonts/|css(/.*/|/)css/|img(/.*/|/)img/)"
,
"^https://static
\\
.licdn
\\
.com/sc/p/com
\\
.linkedin
\\
.nux(:|%3A)nux-static-content(
\\
+|%2B)[
\\
d
\\
.]+/f/"
,
"^https?://www
\\
.flickr
\\
.com/(explore/|photos/[^/]+/(sets/
\\
d+/(page
\\
d+/)?)?)
\\
d+_[a-f0-9]+(_[a-z])?
\\
.jpg$"
,
"^https?://static
\\
.licdn
\\
.com/sc/p/.+/f//"
,
"^http://www
\\
.warnerbros
\\
.com/
\\
d+$"
,
"^https?://tm
\\
.uol
\\
.com
\\
.br/h/.+/h/"
,
"^https?://media
\\
.opb
\\
.org/clips/embed/.+
\\
.js$"
,
// twitter.json
"^https?://((?:www|mobile)
\\
.)?twitter
\\
.com/.+
\\
?(?:id|lang|locale|screen_name)="
,
"^https?://mobile
\\
.twitter
\\
.com/i/anonymize
\\
?data="
,
// imdb.json
"^http://b
\\
.scorecardresearch
\\
.com/"
,
"^http://ad
\\
.doubleclick
\\
.net/"
,
"^http://www
\\
.imdb
\\
.com/rd/"
,
"^http://www
\\
.imdb
\\
.com/.+
\\
?ref_="
,
"^http://www
\\
.imdb
\\
.com/.+/board/flat/"
,
"^http://www
\\
.imdb
\\
.com/.+/board/inline/"
,
"^http://www
\\
.imdb
\\
.com/.+/board/thread/"
,
"^http://www
\\
.imdb
\\
.com/help/boards_posting
\\
.html"
,
"^http://www
\\
.imdb
\\
.com/register/"
,
"^http://www
\\
.imdb
\\
.com/.+/board/.+/
\\
d+
\\
?d="
,
"^http://www
\\
.imdb
\\
.com/.+/videogallery/.+/.+/"
,
// facebook.json
"^https?://error
\\
.facebook
\\
.com/common/scribe_endpoint
\\
.php
\\
?c="
,
"^https?://www
\\
.facebook
\\
.com/[^/]+/(posts/|app_)[^/]+
\\
?(ref=page_internal&)?_fb_noscript="
,
"^https?://www
\\
.facebook
\\
.com/[^/]+/photos/(pb|a)
\\
.[^/]+/[^/]+/.{4,4}/"
,
"^https?://www
\\
.facebook
\\
.com/[^/]+/photos/(pb|a)
\\
.[^/]+/[^/]+/
\\
?type="
,
// internetcentrum.json
"%3Bamp%3Bamp"
,
"&action=edit"
,
"action=(?:komentar|send)"
,
"action=(?:multiple_products_add_product|notify|add_product|buy_now)"
,
"&action=submit"
,
"&action=edit"
,
"amp;amp;"
,
"answer=.+?&anksent=true"
,
"[a-z0-9]=(?:off|on)"
,
"blog=1&disp=msgform"
,
"
\\
?cal="
,
"calendar_menu/calendar
\\
.php"
,
"calendar_menu/event
\\
.php"
,
"calendar
\\
.php"
,
"calendar_scheduler
\\
.php"
,
"captcha.php"
,
"cas12&cas12"
,
"comment
\\
.php
\\
?akce=new"
,
"/comment/reply/
\\
d+"
,
"cPath=.+&sort=.+"
,
"destination=node/%2F
\\
d+"
,
"destination=node/
\\
d+"
,
"(?:displayimage|thumbnails)
\\
.php
\\
?pos=-
\\
d+"
,
"file=posting.+mode=quote"
,
"&highlight=&"
,
"^http://harizzzma
\\
.com"
,
"^http://www.nahraj.net/"
,
"index.*
\\
.php
\\
?option=com_eventcal"
,
"index.php
\\
?site=calendar"
,
"index
\\
.php
\\
?site=guestbook&type=(?:ASC|DESC)"
,
"index.php/Speci%C3%A1ln%C3%AD"
,
"index.php
\\
?title=Diskuse:"
,
"index.php
\\
?title=MediaWiki_diskuse:"
,
"index.php
\\
?title=Soubor_diskuse"
,
"index.php
\\
?title=Speci%C3%A1ln%C3%AD"
,
"index
\\
.php
\\
?
\\
w+&rok=(1995|2016)&mesic=
\\
d+&autor=
\\
d+$"
,
"index
\\
.php
\\
?.+year=198."
,
"index
\\
.php
\\
?.+year=203."
,
"kalendar-akci"
,
"kalendar
\\
.php"
,
"kalendarrok=
\\
d{4}"
,
//"lang=(?!czech|english)",
//"language=(?!cs|en)",
"LightNEasy
\\
.php
\\
?do=login"
,
"limit=.+limit=.+"
,
"login="
,
"login
\\
.php"
,
"(?:login|registrace|live
\\
?)"
,
"mact=Calendar"
,
"main_page=(?:product_reviews_write|login|cookie_usage)"
,
"memberlist
\\
.php
\\
?mode=email"
,
"memberlist
\\
.php
\\
?mode=.+order="
,
"(?:memberlist|viewprofile|viewtopic)
\\
.php
\\
?.*sk=.&sd=."
,
"mini.+calendar"
,
"mm=
\\
d+.+yy=
\\
d{4}"
,
"mode=(?:lostpassword|sendpassword)"
,
"modules.+name=Forums.+view=(?:next|previous)"
,
"modules
\\
.php
\\
?name=coppermine.*file=displayimage.+&slideshow=
\\
d+"
,
"modules
\\
.php
\\
?name=coppermine.*meta=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)"
,
"modules
\\
.php
\\
?name=Statistics"
,
"mo=
\\
d+.+ye=
\\
d{4}"
,
"name=Kalender"
,
"name=Statistics"
,
"option=com_jcalpro.+date=
\\
d{4}-"
,
"
\\
?option=com.+&month=.+&year=
\\
d{4}"
,
"option=&Itemid=.+&date=
\\
d{4}-"
,
//"order=(?!1)",
"orderby=(?:name|note|count|news)"
,
"photo.php
\\
?i=-
\\
d+"
,
"/photos.+
\\
?url="
,
".*
\\
..*
\\
..*
\\
.pl"
,
"p=ordersBasket.+sOption=add"
,
"portal
\\
.php
\\
?month=[
\\
d]+"
,
"postdays=0&postorder=asc"
,
"prev_next=(?:prev|next)"
,
"/calendar/"
,
"product_reviews_write
\\
.php
\\
?"
,
"profile
\\
.php
\\
?mode=email"
,
"profile
\\
.php
\\
?mode=register"
,
"
\\
?q=event.+/(?:day|list|month|table|week)/all/all"
,
"random_num=
\\
d+"
,
"Recentchangeslinked/"
,
"report
\\
.php
\\
?f=.+"
,
"search_id=mini_cal&d=
\\
d+"
,
"SESSION_ID="
,
"showcal
\\
.php"
,
"site=guestbook.+type=(?:ASC|DESC)"
,
//"/sites/all/(sites|modules|libraries|scripts|themes)/.+/\\1",
"Souprava=.+Souprava=.+"
,
"Special:Whatlinkshere"
,
"start-index=-
\\
d+"
,
"/switchuilocale/"
,
"target[xy]=.+target[xy]=.+"
,
"tellafriend
\\
.php"
,
":Userlogin&"
,
"user/(?:register|login)"
,
"viewtopic
\\
.php
\\
?.*highlight="
,
"viewtopic
\\
.php
\\
?p=
\\
d+"
,
"viewtopic
\\
.php
\\
?.+view=print"
,
"y=
\\
d{4}&m=
\\
d+"
,
// forums.json
"/cron
\\
.php
\\
?"
,
"/external
\\
.php
\\
?type=rss"
,
"/login
\\
.php
\\
?"
,
"/newreply
\\
.php
\\
?"
,
"/private
\\
.php
\\
?"
,
"/privmsg
\\
.php
\\
?"
,
"/register
\\
.php
\\
?"
,
"/sendmessage
\\
.php
\\
?"
,
"/subscription
\\
.php
\\
?"
,
"/posting
\\
.php
\\
?"
,
"/viewtopic
\\
.php
\\
?.+&view=(next|previous)"
,
"/viewtopic
\\
.php
\\
?.+&hilit="
,
"/feed
\\
.php
\\
?"
,
"/index
\\
.php
\\
?option=com_mailto"
,
"&view=login&return="
,
"&format=opensearch"
,
"/misc
\\
.php
\\
?do=whoposted"
,
"/newthread
\\
.php
\\
?"
,
"/post_thanks
\\
.php
\\
?"
,
"/blog_post
\\
.php
\\
?do=newblog"
,
"/forumdisplay
\\
.php.*[
\\
?&]do=markread"
,
"/userpoll/vote
\\
.php
\\
?"
,
"/showthread
\\
.php.*[
\\
?&]goto=(next(old|new)est|newpost)"
,
"/editpost
\\
.php
\\
?"
,
"/
\\
?view=getlastpost$"
,
"/index
\\
.php
\\
?sharelink="
,
"/ucp
\\
.php
\\
?mode=delete_cookies"
,
}
This diff is collapsed.
Click to expand it.
scope.go
+
28
−
0
View file @
63bd51e0
...
...
@@ -3,6 +3,7 @@ package crawl
import
(
"fmt"
"net/url"
"regexp"
"strings"
)
...
...
@@ -95,3 +96,30 @@ func NewSeedScope(seeds []*url.URL) Scope {
}
return
NewURLPrefixScope
(
pfx
)
}
// regexpIgnoreScope is a scope that rejects any URL matching one of a
// list of compiled regular expressions.
type regexpIgnoreScope struct {
	// ignores holds the compiled ignore patterns that Check tests
	// each URL against.
	ignores []*regexp.Regexp
}
func
(
s
*
regexpIgnoreScope
)
Check
(
uri
*
url
.
URL
,
depth
int
)
bool
{
uriStr
:=
uri
.
String
()
for
_
,
i
:=
range
s
.
ignores
{
if
i
.
MatchString
(
uriStr
)
{
return
false
}
}
return
true
}
func
NewRegexpIgnoreScope
(
ignores
[]
string
)
Scope
{
if
ignores
==
nil
{
ignores
=
defaultIgnorePatterns
}
r
:=
regexpIgnoreScope
{
ignores
:
make
([]
*
regexp
.
Regexp
,
0
,
len
(
ignores
)),
}
for
_
,
i
:=
range
ignores
{
r
.
ignores
=
append
(
r
.
ignores
,
regexp
.
MustCompile
(
i
))
}
return
&
r
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment