Skip to content
Snippets Groups Projects
Commit f0c14e5e authored by ale's avatar ale
Browse files

move link extraction to a common location

parent d4c561c2
No related branches found
No related tags found
No related merge requests found
// Extract links from HTML/CSS content.
package analysis
import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
)
var (
// urlcssRx matches url(...) references inside CSS "background"-style
// property values (e.g. background, background-image). Capture group 1
// is the referenced URL itself, with any surrounding quotes excluded.
urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)

// linkMatches lists the HTML tag/attribute pairs that carry outbound
// links we want to extract from a document.
linkMatches = []struct {
tag string
attr string
}{
{"a", "href"},
{"link", "href"},
{"img", "src"},
{"script", "src"},
}
)
// GetLinks returns the unique outbound links found in the body of an
// HTTP response, each resolved against the request URL. HTML documents
// are scanned for the common link-bearing tag/attribute pairs listed in
// linkMatches; CSS stylesheets are scanned for url() references in
// background properties. Responses of any other Content-Type yield an
// empty result.
//
// Note that this function consumes resp.Body; callers must not expect
// to read it afterwards.
func GetLinks(resp *http.Response) ([]*url.URL, error) {
	var outlinks []string

	ctype := resp.Header.Get("Content-Type")
	switch {
	case strings.HasPrefix(ctype, "text/html"):
		var err error
		outlinks, err = extractHTMLLinks(resp)
		if err != nil {
			return nil, err
		}
	case strings.HasPrefix(ctype, "text/css"):
		outlinks = extractCSSLinks(resp)
	}

	// Resolve each candidate against the request URL and deduplicate by
	// the resolved string form. Values that fail to parse are skipped.
	links := make(map[string]*url.URL, len(outlinks))
	for _, val := range outlinks {
		if linkurl, err := resp.Request.URL.Parse(val); err == nil {
			links[linkurl.String()] = linkurl
		}
	}
	result := make([]*url.URL, 0, len(links))
	for _, link := range links {
		result = append(result, link)
	}
	return result, nil
}

// extractHTMLLinks parses the response body as HTML and collects the
// raw attribute values of every known link-bearing tag/attribute pair.
func extractHTMLLinks(resp *http.Response) ([]string, error) {
	doc, err := goquery.NewDocumentFromResponse(resp)
	if err != nil {
		return nil, err
	}
	var outlinks []string
	for _, lm := range linkMatches {
		doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
			val, _ := s.Attr(lm.attr)
			outlinks = append(outlinks, val)
		})
	}
	return outlinks, nil
}

// extractCSSLinks scans the response body for url() references in CSS
// background properties. Extraction is deliberately best-effort: a body
// we cannot read simply contributes no links.
func extractCSSLinks(resp *http.Response) []string {
	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil
	}
	var outlinks []string
	for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
		outlinks = append(outlinks, val[1])
	}
	return outlinks
}
...@@ -10,15 +10,13 @@ import ( ...@@ -10,15 +10,13 @@ import (
"io/ioutil" "io/ioutil"
"log" "log"
"net/http" "net/http"
"net/url"
"os" "os"
"regexp"
"strconv" "strconv"
"strings" "strings"
"git.autistici.org/ale/crawl" "git.autistici.org/ale/crawl"
"git.autistici.org/ale/crawl/analysis"
"git.autistici.org/ale/crawl/warc" "git.autistici.org/ale/crawl/warc"
"github.com/PuerkitoBio/goquery"
) )
var ( var (
...@@ -27,53 +25,15 @@ var ( ...@@ -27,53 +25,15 @@ var (
depth = flag.Int("depth", 10, "maximum link depth") depth = flag.Int("depth", 10, "maximum link depth")
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
outputFile = flag.String("output", "crawl.warc.gz", "output WARC file") outputFile = flag.String("output", "crawl.warc.gz", "output WARC file")
urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)
) )
var linkMatches = []struct {
tag string
attr string
}{
{"a", "href"},
{"link", "href"},
{"img", "src"},
{"script", "src"},
}
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
var outlinks []string links, err := analysis.GetLinks(resp)
if err != nil {
ctype := resp.Header.Get("Content-Type") return err
if strings.HasPrefix(ctype, "text/html") {
doc, err := goquery.NewDocumentFromResponse(resp)
if err != nil {
return err
}
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
outlinks = append(outlinks, val)
})
}
} else if strings.HasPrefix(ctype, "text/css") {
if data, err := ioutil.ReadAll(resp.Body); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, val[1])
}
}
} }
// Uniquify and parse outbound links.
links := make(map[string]*url.URL)
for _, val := range outlinks {
if linkurl, err := resp.Request.URL.Parse(val); err == nil {
links[linkurl.String()] = linkurl
}
}
for _, link := range links { for _, link := range links {
//log.Printf("%s -> %s", u, link.String())
c.Enqueue(link, depth+1) c.Enqueue(link, depth+1)
} }
......
...@@ -6,14 +6,12 @@ package main ...@@ -6,14 +6,12 @@ package main
import ( import (
"flag" "flag"
"fmt"
"log" "log"
"net/http" "net/http"
"net/url"
"strings" "strings"
"git.autistici.org/ale/crawl" "git.autistici.org/ale/crawl"
"github.com/PuerkitoBio/goquery" "git.autistici.org/ale/crawl/analysis"
) )
var ( var (
...@@ -23,41 +21,16 @@ var ( ...@@ -23,41 +21,16 @@ var (
validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols") validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
) )
var linkMatches = []struct {
tag string
attr string
}{
{"a", "href"},
{"link", "href"},
{"img", "src"},
{"script", "src"},
}
func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error { func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") { links, err := analysis.GetLinks(resp)
return nil
}
doc, err := goquery.NewDocumentFromResponse(resp)
if err != nil { if err != nil {
return err return err
} }
links := make(map[string]*url.URL)
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
if linkurl, err := resp.Request.URL.Parse(val); err == nil {
links[linkurl.String()] = linkurl
}
})
}
for _, link := range links { for _, link := range links {
//log.Printf("%s -> %s", u, link.String())
c.Enqueue(link, depth+1) c.Enqueue(link, depth+1)
} }
return nil return nil
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment