Commit 6dc36ab4 authored by ale

Parse links in inline style blocks

parent f4ef521d
Pipeline #4602 passed with stage
in 28 seconds
......@@ -4,6 +4,7 @@ package analysis
import (
"fmt"
"io"
"io/ioutil"
"net/http"
"regexp"
......@@ -67,38 +68,40 @@ func extractLinks(resp *http.Response) []rawOutlink {
ctype := resp.Header.Get("Content-Type")
switch {
case strings.HasPrefix(ctype, "text/html"):
return extractLinksFromHTML(resp)
return extractLinksFromHTML(resp.Body, nil)
case strings.HasPrefix(ctype, "text/css"):
return extractLinksFromCSS(resp)
return extractLinksFromCSS(resp.Body, nil)
default:
return nil
}
}
func extractLinksFromHTML(resp *http.Response) []rawOutlink {
var outlinks []rawOutlink
// Use goquery to extract links from the parsed HTML
// contents (query patterns are described in the
// linkMatches table).
doc, err := goquery.NewDocumentFromReader(resp.Body)
func extractLinksFromHTML(r io.Reader, outlinks []rawOutlink) []rawOutlink {
// Use goquery to extract links from the parsed HTML contents
// (query patterns are described in the linkMatches table).
doc, err := goquery.NewDocumentFromReader(r)
if err != nil {
return nil
}
for _, lm := range linkMatches {
doc.Find(fmt.Sprintf("%s[%s]", lm.tag, lm.attr)).Each(func(i int, s *goquery.Selection) {
val, _ := s.Attr(lm.attr)
outlinks = append(outlinks, rawOutlink{URL: val, Tag: lm.linkTag})
})
}
// Find the inline <style> sections and parse them separately as CSS.
doc.Find("style").Each(func(i int, s *goquery.Selection) {
outlinks = extractLinksFromCSS(strings.NewReader(s.Text()), outlinks)
})
return outlinks
}
func extractLinksFromCSS(resp *http.Response) []rawOutlink {
// Use a simple (and actually quite bad) regular
// expression to extract "url()" links from CSS.
var outlinks []rawOutlink
if data, err := ioutil.ReadAll(resp.Body); err == nil {
func extractLinksFromCSS(r io.Reader, outlinks []rawOutlink) []rawOutlink {
// Use a simple (and actually quite bad) regular expression to
// extract "url()" and "@import" links from CSS.
if data, err := ioutil.ReadAll(r); err == nil {
for _, val := range urlcssRx.FindAllStringSubmatch(string(data), -1) {
outlinks = append(outlinks, rawOutlink{URL: val[1], Tag: crawl.TagRelated})
}
......
package analysis
import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
// makeResponse builds a minimal *http.Response for tests: its Content-Type
// header is set to ctype, its Body reads from the body string, and its
// Request carries the fixed URL https://example.com/.
func makeResponse(ctype, body string) *http.Response {
	// The URL is a constant literal, so the parse error is deliberately ignored.
	reqURL, _ := url.Parse("https://example.com/")

	hdr := make(http.Header)
	hdr.Set("Content-Type", ctype)

	return &http.Response{
		Header:  hdr,
		Body:    ioutil.NopCloser(strings.NewReader(body)),
		Request: &http.Request{URL: reqURL},
	}
}
// testdata describes a single link-extraction test case.
type testdata struct {
// ctype is passed to makeResponse as the response Content-Type.
ctype string
// body is passed to makeResponse as the response body.
body string
// expectedLinks is compared against the String() form of the URLs
// returned by GetLinks.
expectedLinks []string
}
// runTestCase feeds the case's content type and body to GetLinks through a
// synthetic response and compares the returned URLs, rendered as strings,
// with expectedLinks. It returns nil on a match, otherwise an error that
// carries either the GetLinks failure or the cmp diff.
func (td *testdata) runTestCase() error {
	resp := makeResponse(td.ctype, td.body)
	found, err := GetLinks(resp)
	if err != nil {
		return fmt.Errorf("GetLinks() error: %v", err)
	}

	// Declared without an initial value on purpose: when no links are
	// found the slice handed to cmp.Diff stays nil.
	var got []string
	for _, link := range found {
		got = append(got, link.URL.String())
	}

	diff := cmp.Diff(td.expectedLinks, got)
	if diff == "" {
		return nil
	}
	return fmt.Errorf("unexpected result:\n%s", diff)
}
// tests is the table of cases executed by TestLinks.
var tests = []testdata{
	// HTML document with a single anchor whose href is "/link1".
	{
		ctype: "text/html",
		body: `
<html><body>
<a href="/link1">link</a>
</body></html>
`,
		expectedLinks: []string{"https://example.com/link1"},
	},
	// HTML document whose only link is a url() reference inside an
	// inline <style> block.
	{
		ctype: "text/html",
		body: `
<html><head><style type="text/css">
body { background: url('/link1'); }
</style></head>
<body></body></html>
`,
		expectedLinks: []string{"https://example.com/link1"},
	},
}
// TestLinks runs every entry of the tests table and reports each failing
// case with t.Error, so one failure does not stop the remaining cases.
func TestLinks(t *testing.T) {
	for i := range tests {
		tc := tests[i]
		err := tc.runTestCase()
		if err == nil {
			continue
		}
		t.Error(err)
	}
}
......@@ -4,10 +4,10 @@ require (
github.com/PuerkitoBio/goquery v1.5.0
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578
github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4
github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf
github.com/andybalholm/cascadia v1.0.0
github.com/golang/snappy v0.0.1
github.com/pborman/uuid v0.0.0-20171128162732-e53336930665
github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17
golang.org/x/text v0.0.0-20190829152558-3d0f7978add9
golang.org/x/text v0.3.0
)
github.com/PuerkitoBio/goquery v1.5.0 h1:uGvmFXOA73IKluu/F84Xd1tt/z07GYm8X49XKHP7EJk=
github.com/PuerkitoBio/goquery v1.5.0/go.mod h1:qD2PgZ9lccMbQlc7eEOjaeRlFQON7xY8kdmcsrnKqMg=
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597 h1:1H3FyRw7YsqIty9WHPOVEGJaFJ1sfGVZ3PPDUw3ob2w=
github.com/PuerkitoBio/purell v0.0.0-20180310210909-975f53781597/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV/sSk/8dngufqelfh6jnri85riMAaF/M=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/andybalholm/cascadia v0.0.0-20181012154424-680b6a57bda4/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o=
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/snappy v0.0.0-20190904063534-ff6b7dc882cf/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/pborman/uuid v0.0.0-20171128162732-e53336930665 h1:7G9lvlxEu1ZPLqJnsRY1MuoBaf2Mg4qbtcxNRXKdzFs=
github.com/pborman/uuid v0.0.0-20171128162732-e53336930665/go.mod h1:VyrYX9gd7irzKovcSS6BIIEwPRkP2Wm2m9ufcdFSJ34=
github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d h1:OgkXbz/O0zsJoaB+z6n/a3bNGCbCWhBPLfGr6qaBprM=
github.com/syndtr/goleveldb v0.0.0-20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17 h1:qPnAdmjNA41t3QBTx2mFGf/SD1IoslhYu7AmdsVzCcs=
golang.org/x/net v0.0.0-20190926025831-c00fd9afed17/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.0.0-20190829152558-3d0f7978add9/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment