Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
crawl
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ale
crawl
Compare revisions
be54594a36e2b51af5762b1725c55a85f9f171f3 to 179eb138652e164d82933d08ea890a76ade46166
Compare revisions
Changes are shown as if the
source
revision was being merged into the
target
revision.
Learn more about comparing revisions.
Source
ale/crawl
Select target project
No results found
179eb138652e164d82933d08ea890a76ade46166
Select Git revision
Branches
better-queue
master
2 results
Swap
Target
ale/crawl
Select target project
ale/crawl
1 result
be54594a36e2b51af5762b1725c55a85f9f171f3
Select Git revision
Branches
better-queue
master
2 results
Show changes
Only incoming changes from source
Include changes to target since source was created
Compare
Commits on Source (3)
Increase error retry delay
· e5390968
ale
authored
2 years ago
e5390968
Fix CI script
· fa3a9fb8
ale
authored
2 years ago
fa3a9fb8
Update gen-ignores to py3, and regenerate ignore list
· 179eb138
ale
authored
2 years ago
179eb138
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
.gitlab-ci.yml
+1
-7
1 addition, 7 deletions
.gitlab-ci.yml
crawler.go
+2
-1
2 additions, 1 deletion
crawler.go
gen-ignores.py
+7
-7
7 additions, 7 deletions
gen-ignores.py
ignore_patterns.go
+569
-110
569 additions, 110 deletions
ignore_patterns.go
with
579 additions
and
125 deletions
.gitlab-ci.yml
View file @
179eb138
include
:
"
https://git.autistici.org/ai3/build-deb/raw/master/ci-common.yml"
include
:
"
https://git.autistici.org/ai3/build-deb/raw/master/ci-common.yml"
stages
:
-
test
-
build_pkgsrc
-
build_pkg
-
upload_pkg
run_tests
:
run_tests
:
stage
:
test
stage
:
test
image
:
"
golang:
latest
"
image
:
"
golang:
1.19
"
script
:
"
go
test
-v
./..."
script
:
"
go
test
-v
./..."
This diff is collapsed.
Click to expand it.
crawler.go
View file @
179eb138
...
@@ -20,7 +20,7 @@ import (
...
@@ -20,7 +20,7 @@ import (
lutil
"github.com/syndtr/goleveldb/leveldb/util"
lutil
"github.com/syndtr/goleveldb/leveldb/util"
)
)
var
errorRetryDelay
=
1
80
*
time
.
Second
var
errorRetryDelay
=
1
2
*
time
.
Hour
type
gobDB
struct
{
type
gobDB
struct
{
*
leveldb
.
DB
*
leveldb
.
DB
...
@@ -188,6 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
...
@@ -188,6 +188,7 @@ func (c *Crawler) Enqueue(link Outlink, depth int) error {
// See if it's in scope.
// See if it's in scope.
if
!
c
.
scope
.
Check
(
link
,
depth
)
{
if
!
c
.
scope
.
Check
(
link
,
depth
)
{
log
.
Printf
(
"%s is not in scope"
,
link
.
URL
)
return
nil
return
nil
}
}
...
...
This diff is collapsed.
Click to expand it.
gen-ignores.py
View file @
179eb138
...
@@ -13,19 +13,19 @@ import os
...
@@ -13,19 +13,19 @@ import os
import
sys
import
sys
archivebot_ignore_path
=
sys
.
argv
[
1
]
archivebot_ignore_path
=
sys
.
argv
[
1
]
print
'
package crawl
\n\n
var defaultIgnorePatterns = []string{
'
print
(
'
package crawl
\n\n
var defaultIgnorePatterns = []string{
'
)
for
fn
in
glob
.
glob
(
os
.
path
.
join
(
archivebot_ignore_path
,
'
*.json
'
)):
for
fn
in
glob
.
glob
(
os
.
path
.
join
(
archivebot_ignore_path
,
'
*.json
'
)):
try
:
try
:
with
open
(
fn
)
as
fd
:
with
open
(
fn
)
as
fd
:
print
'
\n\t
// %s
'
%
os
.
path
.
basename
(
fn
)
print
(
'
\n\t
// %s
'
%
os
.
path
.
basename
(
fn
)
)
for
p
in
json
.
load
(
fd
)[
'
patterns
'
]:
for
p
in
json
.
load
(
fd
)[
'
patterns
'
]:
if
'
\\\\
1
'
in
p
or
'
(?!
'
in
p
:
if
'
\\\\
1
'
in
p
or
'
(?!
'
in
p
:
# RE2 does not support backreferences or other
# RE2 does not support backreferences or other
# fancy PCRE constructs. This excludes <10
# fancy PCRE constructs. This excludes <10
# patterns from the ignore list.
# patterns from the ignore list.
continue
continue
p
rint
'
\t
%s,
'
%
json
.
dumps
(
p
)
p
=
p
.
replace
(
'
{primary_netloc}
'
,
'
.*
'
)
except
Exception
,
e
:
print
(
'
\t
%s,
'
%
json
.
dumps
(
p
))
print
>>
sys
.
stderr
,
'
error in %s: %s
'
%
(
fn
,
e
)
except
Exception
as
e
:
print
'
}
'
print
(
'
error in %s: %s
'
%
(
fn
,
e
),
file
=
sys
.
stderr
)
print
(
'
}
'
)
This diff is collapsed.
Click to expand it.
ignore_patterns.go
View file @
179eb138
This diff is collapsed.
Click to expand it.