From 23a80bd68c5c51967eaf4e6a857c5d59fe58daf5 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Fri, 31 Aug 2018 11:08:50 +0100
Subject: [PATCH] Add a simple test for the full WARC crawler

---
 cmd/crawl/crawl.go      |  3 ++
 cmd/crawl/crawl_test.go | 77 +++++++++++++++++++++++++++++++++++++++++
 crawler_test.go         |  2 +-
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 cmd/crawl/crawl_test.go

diff --git a/cmd/crawl/crawl.go b/cmd/crawl/crawl.go
index cf2af5d..bbbd65b 100644
--- a/cmd/crawl/crawl.go
+++ b/cmd/crawl/crawl.go
@@ -62,6 +62,7 @@ func hdr2str(h http.Header) []byte {
 type warcSaveHandler struct {
 	warc       *warc.Writer
 	warcInfoID string
+	numWritten int // responses Handle has saved so far; NOTE(review): unsynchronized — confirm Handle never runs concurrently
 }
 
 func (h *warcSaveHandler) writeWARCRecord(typ, uri string, data []byte) error {
@@ -109,6 +110,8 @@ func (h *warcSaveHandler) Handle(c *crawl.Crawler, u string, depth int, resp *ht
 		return werr
 	}
 
+	h.numWritten++
+
 	return extractLinks(c, u, depth, resp, nil)
 }
 
diff --git a/cmd/crawl/crawl_test.go b/cmd/crawl/crawl_test.go
new file mode 100644
index 0000000..46bb2ad
--- /dev/null
+++ b/cmd/crawl/crawl_test.go
@@ -0,0 +1,77 @@
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"git.autistici.org/ale/crawl"
+	"git.autistici.org/ale/crawl/warc"
+)
+
+func linkTo(w http.ResponseWriter, uri string) { // replies with an HTML page whose only link points to uri
+	w.Header().Set("Content-Type", "text/html") // headers must be set before the first body write
+	fmt.Fprintf(w, `<html><body><a href="%s">link!</a></body></html>`, uri)
+}
+
+// TestCrawl crawls a small local HTTP site through the WARC save handler
+// and checks the handler's numWritten counter afterwards.
+func TestCrawl(t *testing.T) {
+	workDir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(workDir)
+
+	// Site layout: "/" links to "/redir", which redirects to "/b", which
+	// links back to "/". Every other path is a 404.
+	site := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/":
+			linkTo(w, "/redir")
+		case "/redir":
+			http.Redirect(w, r, "/b", http.StatusFound)
+		case "/b":
+			linkTo(w, "/")
+		default:
+			http.NotFound(w, r)
+		}
+	})
+	srv := httptest.NewServer(site)
+	defer srv.Close()
+
+	seeds := crawl.MustParseURLs([]string{srv.URL + "/"})
+	httpOnly := crawl.NewSchemeScope([]string{"http"})
+	scope := crawl.AND(httpOnly, crawl.NewDepthScope(10), crawl.NewSeedScope(seeds))
+
+	warcFile, err := os.Create(filepath.Join(workDir, "warc.gz"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	out := warc.NewWriter(warcFile)
+	defer out.Close()
+	saver, err := newWarcSaveHandler(out)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	dbPath := filepath.Join(workDir, "db")
+	handler := crawl.HandleRetries(crawl.FollowRedirects(crawl.FilterErrors(saver)))
+	crawler, err := crawl.NewCrawler(dbPath, seeds, scope, crawl.FetcherFunc(fetch), handler)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	crawler.Run(1)
+	crawler.Close()
+
+	// Presumably one record each for "/", "/redir" and "/b".
+	written := saver.(*warcSaveHandler).numWritten
+	if written != 3 {
+		t.Fatalf("warc handler wrote %d records, expected 3", written)
+	}
+}
diff --git a/crawler_test.go b/crawler_test.go
index fecc850..7b5c92c 100644
--- a/crawler_test.go
+++ b/crawler_test.go
@@ -44,7 +44,7 @@ func TestCrawler(t *testing.T) {
 		return nil
 	})
 
-	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), FollowRedirects(h))
+	crawler, err := NewCrawler(dir+"/crawl.db", seeds, scope, FetcherFunc(http.Get), HandleRetries(FilterErrors(FollowRedirects(h))))
 	if err != nil {
 		t.Fatal("NewCrawler", err)
 	}
-- 
GitLab