Skip to content

Commit

Permalink
Merge branch 'master' of github.com:PuerkitoBio/gocrawl
Browse files (browse the repository at this point in the history)
  • Loading branch information
mna committed Jul 22, 2019
2 parents fa35d57 + dada5e9 commit 06cb0b4
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"path"

"github.com/PuerkitoBio/goquery"
"github.com/andybalholm/cascadia"
"github.com/temoto/robotstxt"
"golang.org/x/net/html"
)
Expand Down Expand Up @@ -381,10 +382,15 @@ func handleBaseTag(root *url.URL, baseHref string, aHref string) string {
return resolvedURL.String()
}

// Selectors used by processLinks. They are compiled once, when the package
// is initialized, instead of being parsed from their string form on every
// call. cascadia.MustCompile panics if a selector is invalid.
var (
	// baseHrefMatcher selects <base> elements that carry an href attribute.
	baseHrefMatcher = cascadia.MustCompile("base[href]")
	// aHrefMatcher selects <a> elements that carry an href attribute.
	aHrefMatcher = cascadia.MustCompile("a[href]")
)

// Scrape the document's content to gather all links
func (w *worker) processLinks(doc *goquery.Document) (result []*url.URL) {
baseURL, _ := doc.Find("base[href]").Attr("href")
urls := doc.Find("a[href]").Map(func(_ int, s *goquery.Selection) string {
baseURL, _ := doc.FindMatcher(baseHrefMatcher).Attr("href")
urls := doc.FindMatcher(aHrefMatcher).Map(func(_ int, s *goquery.Selection) string {
val, _ := s.Attr("href")
if baseURL != "" {
val = handleBaseTag(doc.Url, baseURL, val)
Expand Down

0 comments on commit 06cb0b4

Please sign in to comment.