Skip to content
Snippets Groups Projects
Commit 6dc36ab4 authored by ale
Browse files

Parse links in inline style blocks

parent f4ef521d
No related branches found
No related tags found
No related merge requests found
......@@ -4,6 +4,7 @@ package analysis
import (
"fmt"
"io"
"io/ioutil"
"net/http"
"regexp"
// extractLinks picks a link extractor from the prefix of the response's
// Content-Type header: text/html and text/css are handled, anything else
// yields nil.
// NOTE(review): this is a rendered diff, so each case shows the removed call
// (taking resp) directly followed by the call that replaces it (taking
// resp.Body and a nil accumulator); only the second belongs to the new code.
......@@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink {
ctype := resp.Header.Get("Content-Type")
switch {
case strings.HasPrefix(ctype, "text/html"):
// Removed by this commit: extractor received the whole response.
return extractLinksFromHTML(resp)
// Added by this commit: extractor reads the body and starts from a nil slice.
return extractLinksFromHTML(resp.Body, nil)
case strings.HasPrefix(ctype, "text/css"):
// Removed by this commit: extractor received the whole response.
return extractLinksFromCSS(resp)
// Added by this commit: extractor reads the body and starts from a nil slice.
return extractLinksFromCSS(resp.Body, nil)
default:
// Any other content type is not scanned for links.
return nil
}
}
func extractLinksFromHTML(resp *http.Response) []rawOutlink {
var outlinks []rawOutlink
// Use goquery to extract links from the parsed HTML
// contents (query patterns are described in the
// linkMatches table).
doc, err := goquery.NewDocumentFromReader(resp.Body)
// extractLinksFromHTML parses r as HTML and appends every link it finds to
// outlinks, returning the extended slice.
//
// Links come from two places: the tag/attribute pairs listed in the
// linkMatches table, and the text of inline <style> elements, which is handed
// to extractLinksFromCSS. If the document cannot be parsed, outlinks is
// returned unchanged, so links the caller collected before this call are not
// thrown away.
func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink {
	// Use goquery to extract links from the parsed HTML contents
	// (query patterns are described in the linkMatches table).
	doc, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		// Behave like append on failure: hand back what we were given
		// instead of nil, which would silently drop earlier results.
		return outlinks
	}

	for _, lm := range linkMatches {
		// The "tag[attr]" selector only matches elements that carry the
		// attribute, so the presence flag returned by Attr is not needed.
		doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
			val, _ := s.Attr(lm.attr)
			outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
		})
	}

	// Find the inline <style> sections and parse them separately as CSS.
	doc.Find("style").Each(func(i int, s *goquery.Selection) {
		outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks)
	})

	return outlinks
}
func extractLinksFromCSS(resp *http.Response) []rawOutlink {
// Use a simple (and actually quite bad) regular
// expression to extract "url()" links from CSS.
var outlinks []rawOutlink
if data, err := ioutil.ReadAll(resp.Body); err == nil {
func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink {
// Use a simple (and actually quite bad) regular expression to
// extract "url()" and "@import" links from CSS.
if data, err := ioutil.ReadAll(r); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
......
package analysis
import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
// makeResponse builds a minimal *http.Response for tests: the Content-Type
// header is set to ctype, the body streams the given string, and the
// originating request points at https://example.com/.
func makeResponse(ctype, body string) *http.Response {
	// The URL is a fixed, well-formed literal, so the parse error is ignored.
	reqURL, _ := url.Parse("https://example.com/")

	hdr := make(http.Header)
	hdr.Set("Content-Type", ctype)

	return &http.Response{
		Header:  hdr,
		Body:    ioutil.NopCloser(strings.NewReader(body)),
		Request: &http.Request{URL: reqURL},
	}
}
// testdata describes one link-extraction test case. Field order matters:
// the test table initialises these values positionally.
type testdata struct {
	// ctype is sent as the Content-Type header and body as the response
	// payload of the fake response handed to GetLinks.
	ctype, body string
	// expectedLinks is compared against the string form of each returned
	// link's URL.
	expectedLinks []string
}
// runTestCase serves the case's body with its content type through GetLinks
// and compares the URLs that come back against expectedLinks. It returns nil
// when they match, and an error describing the GetLinks failure or the diff
// otherwise.
func (td *testdata) runTestCase() error {
	resp := makeResponse(td.ctype, td.body)
	found, err := GetLinks(resp)
	if err != nil {
		return fmt.Errorf("GetLinks() error: %v", err)
	}

	// Leave got nil when nothing was found: cmp.Diff reports a nil slice
	// and an empty slice as different.
	var got []string
	for _, link := range found {
		got = append(got, link.URL.String())
	}

	diff := cmp.Diff(td.expectedLinks, got)
	if diff == "" {
		return nil
	}
	return fmt.Errorf("unexpected result:\n%s", diff)
}
// tests is the table of link-extraction cases exercised by TestLinks.
var tests = []testdata{
	// An anchor href in an HTML document is expected to be reported as an
	// absolute URL on the request's host.
	{
		ctype: "text/html",
		body: `
<html><body>
<a href="/link1">link</a>
</body></html>
`,
		expectedLinks: []string{
			"https://example.com/link1",
		},
	},
	// A url() reference inside an inline <style> element is expected to be
	// reported the same way.
	{
		ctype: "text/html",
		body: `
<html><head><style type="text/css">
body { background: url('/link1'); }
</style></head>
<body></body></html>
`,
		expectedLinks: []string{
			"https://example.com/link1",
		},
	},
}
// TestLinks runs every entry of the tests table and records each failure
// with t.Error, so one failing case does not hide the others.
func TestLinks(t *testing.T) {
	for i := range tests {
		err := tests[i].runTestCase()
		if err == nil {
			continue
		}
		t.Error(err)
	}
}
......@@ -4,10 +4,10 @@ require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578
github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4
github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf
github.com/andybalholm/cascadia v1.0.0
github.com/golang/snappy v0.0.1
github.com/pborman/uuid v0.0.0-20171128162732-e53336930665
github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17
golang.org/x/text v0.0.0-20190829152558-3d0f7978add9
golang.org/x/text v0.3.0
)
go.sum 0 → 100644
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597 h1:1H3FyRw7YsqIty9WHPOVEGJaFJ1sfGVZ3PPDUw3ob2w=
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/pborman/uuid v0.0.0-20171128162732-e53336930665 h1:7G9lvlxEu1ZPLqJnsRY1MuoBaf2Mg4qbtcxNRXKdzFs=
github.com/pborman/uuid v0.0.0-20171128162732-e53336930665/go.mod h1:VyrYX9gd7irzKovcSS6BIIEwPRkP2Wm2m9ufcdFSJ34=
github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d h1:OgkXbz/O0zsJoaB+z6n/a3bNGCbCWhBPLfGr6qaBprM=
github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 h1:qPnAdmjNA41t3QBTx2mFGf/SD1IoslhYu7AmdsVzCcs=
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.0.0-20190829152558-3d0f7978add9/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment