Skip to content
Snippets Groups Projects
Commit 6f5bef5f authored by ale's avatar ale
Browse files

Use a global http.Client with sane settings

parent 979f2e8d
Branches
No related tags found
No related merge requests found
package crawl
import (
"crypto/tls"
"net/http"
"net/http/cookiejar"
"time"
)
// defaultClientTimeout bounds the total time of a single request
// (connection setup, request, and reading the response body).
var defaultClientTimeout = 60 * time.Second

// DefaultClient is a http.Client suitable for crawling: it does not
// follow redirects, accepts invalid TLS certificates, keeps cookies in
// an in-memory jar, and sets a reasonable timeout for requests.
var DefaultClient *http.Client

func init() {
	// cookiejar.New never fails when called with nil Options,
	// so the error is deliberately ignored.
	jar, _ := cookiejar.New(nil)
	DefaultClient = &http.Client{
		Timeout: defaultClientTimeout,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{
				// A crawler must tolerate self-signed or otherwise
				// broken certificates; verification is deliberately off.
				InsecureSkipVerify: true,
			},
		},
		// Return the redirect response itself rather than following it,
		// so the caller can record the 3xx status and decide what to do
		// with the Location target.
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
		Jar: jar,
	}
}
...@@ -11,6 +11,7 @@ import ( ...@@ -11,6 +11,7 @@ import (
"log" "log"
"net/http" "net/http"
"os" "os"
"runtime/pprof"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
...@@ -30,6 +31,8 @@ var ( ...@@ -30,6 +31,8 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)") alwaysIncludeRelated = flag.Bool("include-related", false, "always include related resources (css, images, etc)")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
cpuprofile = flag.String("cpuprofile", "", "create cpu profile")
) )
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
...@@ -147,14 +150,10 @@ func (c *crawlStats) Dump() { ...@@ -147,14 +150,10 @@ func (c *crawlStats) Dump() {
fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", c.bytes, rate, c.states) fmt.Fprintf(os.Stderr, "stats: downloaded %d bytes (%.4g KB/s), status: %v\n", c.bytes, rate, c.states)
} }
var ( var stats *crawlStats
stats *crawlStats
client *http.Client
)
func fetch(urlstr string) (*http.Response, error) { func fetch(urlstr string) (*http.Response, error) {
resp, err := client.Get(urlstr) resp, err := crawl.DefaultClient.Get(urlstr)
if err == nil { if err == nil {
stats.Update(resp) stats.Update(resp)
} }
...@@ -162,8 +161,6 @@ func fetch(urlstr string) (*http.Response, error) { ...@@ -162,8 +161,6 @@ func fetch(urlstr string) (*http.Response, error) {
} }
func init() { func init() {
client = &http.Client{}
stats = &crawlStats{ stats = &crawlStats{
states: make(map[int]int), states: make(map[int]int),
start: time.Now(), start: time.Now(),
...@@ -191,6 +188,17 @@ func (b *byteCounter) Read(buf []byte) (int, error) { ...@@ -191,6 +188,17 @@ func (b *byteCounter) Read(buf []byte) (int, error) {
func main() { func main() {
flag.Parse() flag.Parse()
if *cpuprofile != "" {
f, err := os.Create(*cpuprofile)
if err != nil {
log.Fatal("could not create CPU profile: ", err)
}
if err := pprof.StartCPUProfile(f); err != nil {
log.Fatal("could not start CPU profile: ", err)
}
defer pprof.StopCPUProfile()
}
outf, err := os.Create(*outputFile) outf, err := os.Create(*outputFile)
if err != nil { if err != nil {
log.Fatal(err) log.Fatal(err)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment