crawl · Commit 4c82422d
Authored 10 years ago by ale
Parent: efe98903

    make Scope checking more modular

Showing 4 changed files, with 112 additions and 55 deletions:

    cmd/crawl/crawl.go    +5   −1
    cmd/links/links.go    +5   −1
    crawler.go            +8   −53
    scope.go (new file)   +94  −0
cmd/crawl/crawl.go (+5, −1)
```diff
@@ -124,7 +124,11 @@ func main() {
 	}

 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}

 	w := warc.NewWriter(outf)
 	defer w.Close()
```
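Both commands now build their scope as a slice of crawl.Scope values instead of a single monolithic seed scope; the crawler ANDs the individual checks (see crawler.go below). As a minimal sketch of what this buys a caller, the helper below composes an extra URL-prefix restriction on top of the stock constructors. The import path and the extra prefix URL are assumptions, not part of this commit:

```go
package main

import (
	"net/url"
	"strings"

	"git.autistici.org/ale/crawl" // import path assumed
)

// buildScope assembles the same checks as cmd/crawl, plus one extra
// allow-list prefix. All checks are ANDed by the crawler, so the extra
// prefix can only narrow the crawl further.
func buildScope(seeds []*url.URL, depth int, validSchemes string) []crawl.Scope {
	// Hypothetical extra prefix, for illustration only.
	extra, _ := url.Parse("http://example.com/docs/")
	pfx := make(crawl.URLPrefixMap)
	pfx.Add(extra)

	return []crawl.Scope{
		crawl.NewSchemeScope(strings.Split(validSchemes, ",")),
		crawl.NewDepthScope(depth),
		crawl.NewSeedScope(seeds),
		crawl.NewURLPrefixScope(pfx),
	}
}
```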
cmd/links/links.go (+5, −1)
```diff
@@ -38,7 +38,11 @@ func main() {
 	flag.Parse()

 	seeds := crawl.MustParseURLs(flag.Args())
-	scope := crawl.NewSeedScope(seeds, *depth, strings.Split(*validSchemes, ","))
+	scope := []crawl.Scope{
+		crawl.NewSchemeScope(strings.Split(*validSchemes, ",")),
+		crawl.NewDepthScope(*depth),
+		crawl.NewSeedScope(seeds),
+	}

 	crawler, err := crawl.NewCrawler("crawldb", seeds, scope, crawl.FetcherFunc(http.Get), crawl.HandlerFunc(extractLinks))
 	if err != nil {
```
crawler.go (+8, −53)
```diff
@@ -8,7 +8,6 @@ import (
 	"log"
 	"net/http"
 	"net/url"
-	"strings"
 	"sync"
 	"time"
@@ -58,10 +57,6 @@ type URLInfo struct {
 	Error error
 }

-type Scope interface {
-	Check(*url.URL, int) bool
-}
-
 type Fetcher interface {
 	Fetch(string) (*http.Response, error)
 }
@@ -86,7 +81,7 @@ func (f HandlerFunc) Handle(db *Crawler, u string, depth int, resp *http.Respons
 type Crawler struct {
 	db      *gobDB
 	seeds   []*url.URL
-	scope   Scope
+	scopes  []Scope
 	fetcher Fetcher
 	handler Handler
@@ -111,10 +106,12 @@ func (c *Crawler) Enqueue(u *url.URL, depth int) {
 	// Normalize the URL.
 	urlStr := purell.NormalizeURL(u, purell.FlagsSafe|purell.FlagRemoveDotSegments|purell.FlagRemoveDuplicateSlashes|purell.FlagRemoveFragment|purell.FlagRemoveDirectoryIndex|purell.FlagSortQuery)

-	// See if it's in scope.
-	if !c.scope.Check(u, depth) {
-		return
+	// See if it's in scope. Checks are ANDed.
+	for _, sc := range c.scopes {
+		if !sc.Check(u, depth) {
+			return
+		}
 	}

 	c.enqueueMx.Lock()
 	defer c.enqueueMx.Unlock()
@@ -202,46 +199,6 @@ func (c *Crawler) urlHandler(queue <-chan QueuePair) {
 	}
 }

-type seedScope struct {
-	seeds    []*url.URL
-	schemes  map[string]struct{}
-	maxDepth int
-}
-
-func (s *seedScope) Check(u *url.URL, depth int) bool {
-	// Ignore non-allowed schemes.
-	if _, ok := s.schemes[u.Scheme]; !ok {
-		return false
-	}
-
-	// Do not crawl beyond maxDepth.
-	if depth > s.maxDepth {
-		return false
-	}
-
-	// Check each seed prefix.
-	for _, seed := range s.seeds {
-		if u.Host == seed.Host && strings.HasPrefix(u.Path, seed.Path) {
-			return true
-		}
-	}
-	return false
-}
-
-// NewSeedScope returns a Scope that will only allow crawling the seed
-// domains, and not beyond the specified maximum link depth.
-func NewSeedScope(seeds []*url.URL, maxDepth int, allowedSchemes []string) Scope {
-	scope := &seedScope{
-		seeds:    seeds,
-		maxDepth: maxDepth,
-		schemes:  make(map[string]struct{}),
-	}
-	for _, s := range allowedSchemes {
-		scope.schemes[s] = struct{}{}
-	}
-	return scope
-}
-
 func MustParseURLs(urls []string) []*url.URL {
 	// Parse the seed URLs.
 	var parsed []*url.URL
@@ -256,7 +213,7 @@ func MustParseURLs(urls []string) []*url.URL {
 }

 // NewCrawler creates a new Crawler object with the specified behavior.
-func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler) (*Crawler, error) {
+func NewCrawler(path string, seeds []*url.URL, scopes []Scope, f Fetcher, h Handler) (*Crawler, error) {
 	// Open the crawl database.
 	db, err := newGobDB(path)
 	if err != nil {
@@ -267,7 +224,7 @@ func NewCrawler(path string, seeds []*url.URL, scope Scope, f Fetcher, h Handler
 		fetcher: f,
 		handler: &standardPageHandler{h},
 		seeds:   seeds,
-		scope:   scope,
+		scopes:  scopes,
 	}
 	return c, nil
 }
@@ -321,8 +278,6 @@ func (wrap *standardPageHandler) Handle(c *Crawler, u string, depth int, resp *h
 	}
 	info.Error = err

-	//log.Printf("[CRAWL] %+v", info)
-
 	c.UpdateURL(info)
 	return nil
 }
```
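Since the Scope interface is now exported (it moved to scope.go) and NewCrawler accepts a []Scope, a caller can plug checks of its own in alongside the stock ones. A hypothetical example, not part of this commit: a scope that only admits hosts matching a regular expression (import path assumed):

```go
package main

import (
	"net/url"
	"regexp"

	"git.autistici.org/ale/crawl" // import path assumed
)

// hostRegexpScope admits only URLs whose host matches a pattern.
type hostRegexpScope struct {
	rx *regexp.Regexp
}

// Check implements crawl.Scope; the depth argument is ignored here.
func (s *hostRegexpScope) Check(u *url.URL, depth int) bool {
	return s.rx.MatchString(u.Host)
}

func newHostRegexpScope(pattern string) crawl.Scope {
	return &hostRegexpScope{rx: regexp.MustCompile(pattern)}
}
```

Appending the result to the scope slice passed to crawl.NewCrawler is enough to activate it; because the checks are ANDed, a custom scope can only narrow the crawl.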
scope.go (new file, mode 100644; +94, −0)
```go
package crawl

import (
	"fmt"
	"net/url"
	"strings"
)

type Scope interface {
	Check(*url.URL, int) bool
}

type maxDepthScope struct {
	maxDepth int
}

func (s *maxDepthScope) Check(uri *url.URL, depth int) bool {
	return depth < s.maxDepth
}

// NewDepthScope returns a Scope that will limit crawls to a
// maximum link depth with respect to the crawl seeds.
func NewDepthScope(maxDepth int) Scope {
	return &maxDepthScope{maxDepth}
}

type schemeScope struct {
	allowedSchemes map[string]struct{}
}

func (s *schemeScope) Check(uri *url.URL, depth int) bool {
	_, ok := s.allowedSchemes[uri.Scheme]
	return ok
}

// NewSchemeScope limits the crawl to the specified URL schemes.
func NewSchemeScope(schemes []string) Scope {
	m := make(map[string]struct{})
	for _, s := range schemes {
		m[s] = struct{}{}
	}
	return &schemeScope{m}
}

// A URLPrefixMap makes it easy to check for URL prefixes (even for
// very large lists). The URL scheme is ignored, along with an
// eventual "www." prefix.
type URLPrefixMap map[string]struct{}

func normalizeUrlPrefix(uri *url.URL) string {
	return strings.TrimPrefix(uri.Host, "www.") + strings.TrimSuffix(uri.Path, "/")
}

func (m URLPrefixMap) Add(uri *url.URL) {
	m[normalizeUrlPrefix(uri)] = struct{}{}
}

func (m URLPrefixMap) Contains(uri *url.URL) bool {
	s := strings.TrimPrefix(uri.Host, "www.")
	for _, p := range strings.Split(uri.Path, "/") {
		if p == "" {
			continue
		}
		s = fmt.Sprintf("%s/%s", s, p)
		if _, ok := m[s]; ok {
			return true
		}
	}
	return false
}

type urlPrefixScope struct {
	prefixes URLPrefixMap
}

func (s *urlPrefixScope) Check(uri *url.URL, depth int) bool {
	return s.prefixes.Contains(uri)
}

// NewURLPrefixScope returns a Scope that limits the crawl to a set of
// allowed URL prefixes.
func NewURLPrefixScope(prefixes URLPrefixMap) Scope {
	return &urlPrefixScope{prefixes}
}

// NewSeedScope returns a Scope that will only allow crawling the seed
// prefixes.
func NewSeedScope(seeds []*url.URL) Scope {
	pfx := make(URLPrefixMap)
	for _, s := range seeds {
		pfx.Add(s)
	}
	return NewURLPrefixScope(pfx)
}
```
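URLPrefixMap matches prefixes segment by segment: Add stores host+path with the scheme, any leading "www.", and any trailing slash stripped; Contains then walks the candidate URL's path components and succeeds as soon as one accumulated prefix is present in the map. A small demo of those semantics (example URLs are hypothetical, import path assumed):

```go
package main

import (
	"fmt"
	"net/url"

	"git.autistici.org/ale/crawl" // import path assumed
)

func main() {
	pfx := make(crawl.URLPrefixMap)
	seed, _ := url.Parse("http://www.example.com/blog/")
	pfx.Add(seed) // stored as "example.com/blog"

	u1, _ := url.Parse("https://example.com/blog/2015/post.html")
	u2, _ := url.Parse("https://example.com/shop/")
	fmt.Println(pfx.Contains(u1)) // true: "example.com/blog" is a stored prefix
	fmt.Println(pfx.Contains(u2)) // false: no stored prefix along this path
}
```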