Commit f0c14e5e authored by ale

move link extraction to a common location

parent d4c561c2
New file (package analysis):

// Extract links from HTML/CSS content.
package analysis

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

var (
	// Matches url(...) references in CSS "background" properties.
	urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)

	// HTML tag/attribute pairs that carry outbound links.
	linkMatches = []struct {
		tag  string
		attr string
	}{
		{"a", "href"},
		{"link", "href"},
		{"img", "src"},
		{"script", "src"},
	}
)

// GetLinks extracts outbound links from an HTML or CSS response and
// returns them deduplicated and resolved against the request URL.
func GetLinks(resp *http.Response) ([]*url.URL, error) {
	var outlinks []string

	ctype := resp.Header.Get("Content-Type")
	if strings.HasPrefix(ctype, "text/html") {
		doc, err := goquery.NewDocumentFromResponse(resp)
		if err != nil {
			return nil, err
		}

		for _, lm := range linkMatches {
			doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
				val, _ := s.Attr(lm.attr)
				outlinks = append(outlinks, val)
			})
		}
	} else if strings.HasPrefix(ctype, "text/css") {
		if data, err := ioutil.ReadAll(resp.Body); err == nil {
			for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
				outlinks = append(outlinks, val[1])
			}
		}
	}

	// Uniquify and parse outbound links.
	var result []*url.URL
	links := make(map[string]*url.URL)
	for _, val := range outlinks {
		if linkurl, err := resp.Request.URL.Parse(val); err == nil {
			links[linkurl.String()] = linkurl
		}
	}
	for _, link := range links {
		result = append(result, link)
	}
	return result, nil
}
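For context, a minimal usage sketch of the new helper (the test server, markup, and URLs are hypothetical, not part of this commit):

// Hypothetical usage sketch: fetch a page from a throwaway test
// server and print the links that analysis.GetLinks extracts.
package main

import (
	"fmt"
	"log"
	"net/http"
	"net/http/httptest"

	"git.autistici.org/ale/crawl/analysis"
)

func main() {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		fmt.Fprint(w, `<html><body><a href="/next">next</a><img src="logo.png"></body></html>`)
	}))
	defer srv.Close()

	resp, err := http.Get(srv.URL)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// GetLinks picks the parser based on Content-Type (goquery for
	// text/html, the url(...) regexp for text/css) and resolves every
	// extracted link against resp.Request.URL.
	links, err := analysis.GetLinks(resp)
	if err != nil {
		log.Fatal(err)
	}
	for _, link := range links {
		fmt.Println(link.String()) // e.g. http://127.0.0.1:PORT/next
	}
}

Since both commands below now delegate to analysis.GetLinks, HTML and CSS link extraction and URL deduplication behave identically across the two crawlers.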
Changes to the WARC crawler command (package main):

@@ -10,15 +10,13 @@ import (
 	"io/ioutil"
 	"log"
 	"net/http"
-	"net/url"
 	"os"
-	"regexp"
 	"strconv"
 	"strings"
 
 	"git.autistici.org/ale/crawl"
+	"git.autistici.org/ale/crawl/analysis"
 	"git.autistici.org/ale/crawl/warc"
-	"github.com/PuerkitoBio/goquery"
 )
 
 var (
@@ -27,53 +25,15 @@ var (
 	depth        = flag.Int("depth", 10, "maximum link depth")
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 	outputFile   = flag.String("output", "crawl.warc.gz", "output WARC file")
-
-	urlcssRx = regexp.MustCompile(`background.*:.*url\(["']?([^'"\)]+)["']?\)`)
 )
 
-var linkMatches = []struct {
-	tag  string
-	attr string
-}{
-	{"a", "href"},
-	{"link", "href"},
-	{"img", "src"},
-	{"script", "src"},
-}
-
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
-	var outlinks []string
-
-	ctype := resp.Header.Get("Content-Type")
-	if strings.HasPrefix(ctype, "text/html") {
-		doc, err := goquery.NewDocumentFromResponse(resp)
-		if err != nil {
-			return err
-		}
-
-		for _, lm := range linkMatches {
-			doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
-				val, _ := s.Attr(lm.attr)
-				outlinks = append(outlinks, val)
-			})
-		}
-	} else if strings.HasPrefix(ctype, "text/css") {
-		if data, err := ioutil.ReadAll(resp.Body); err == nil {
-			for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
-				outlinks = append(outlinks, val[1])
-			}
-		}
-	}
-
-	// Uniquify and parse outbound links.
-	links := make(map[string]*url.URL)
-	for _, val := range outlinks {
-		if linkurl, err := resp.Request.URL.Parse(val); err == nil {
-			links[linkurl.String()] = linkurl
-		}
-	}
+	links, err := analysis.GetLinks(resp)
+	if err != nil {
+		return err
+	}
 
 	for _, link := range links {
 		//log.Printf("%s -> %s", u, link.String())
 		c.Enqueue(link, depth+1)
 	}
Changes to the simple link-following crawler command (package main):

@@ -6,14 +6,12 @@ package main
 import (
 	"flag"
-	"fmt"
 	"log"
 	"net/http"
-	"net/url"
 	"strings"
 
 	"git.autistici.org/ale/crawl"
-	"github.com/PuerkitoBio/goquery"
+	"git.autistici.org/ale/crawl/analysis"
 )
 
 var (
@@ -23,41 +21,16 @@ var (
 	validSchemes = flag.String("schemes", "http,https", "comma-separated list of allowed protocols")
 )
 
-var linkMatches = []struct {
-	tag  string
-	attr string
-}{
-	{"a", "href"},
-	{"link", "href"},
-	{"img", "src"},
-	{"script", "src"},
-}
-
 func extractLinks(c *crawl.Crawler, u string, depth int, resp *http.Response, err error) error {
-	if !strings.HasPrefix(resp.Header.Get("Content-Type"), "text/html") {
-		return nil
-	}
-
-	doc, err := goquery.NewDocumentFromResponse(resp)
+	links, err := analysis.GetLinks(resp)
 	if err != nil {
 		return err
 	}
 
-	links := make(map[string]*url.URL)
-	for _, lm := range linkMatches {
-		doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
-			val, _ := s.Attr(lm.attr)
-			if linkurl, err := resp.Request.URL.Parse(val); err == nil {
-				links[linkurl.String()] = linkurl
-			}
-		})
-	}
-
 	for _, link := range links {
 		//log.Printf("%s -> %s", u, link.String())
 		c.Enqueue(link, depth+1)
 	}
 
 	return nil
 }