Skip to content

Commit

Permalink
Merge pull request #742 from SSWConsulting/link-scan-improvements
Browse files Browse the repository at this point in the history
Improve link checking
  • Loading branch information
tombui99 authored Oct 30, 2023
2 parents 4a507b3 + 0d75a48 commit 6aba695
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 89 deletions.
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
############################
# STEP 1 build executable binary
############################
FROM golang:1.16.15-alpine AS builder
FROM golang:1.21.3-alpine AS builder
# Install git.
# Git is required for fetching the dependencies.
RUN apk update && apk add --no-cache git
Expand Down
2 changes: 1 addition & 1 deletion docker/go.mod
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
module github.com/SSWConsulting/SSW.CodeAuditor

go 1.16
go 1.21.3

require golang.org/x/net v0.17.0
38 changes: 0 additions & 38 deletions docker/go.sum
Original file line number Diff line number Diff line change
@@ -1,40 +1,2 @@
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
100 changes: 53 additions & 47 deletions docker/sswlinkauditor.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package main

import (
"encoding/json"
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"io/ioutil"
"net"
"net/http"

urlP "net/url"
Expand Down Expand Up @@ -34,8 +36,6 @@ type Link struct {
anchor string
}

// unscannableLinksEndpoint is the URL that getUnscannableLinks queries for
// the list of link prefixes treated as unscannable.
const unscannableLinksEndpoint = "https://asia-east2-sswlinkauditor-c1131.cloudfunctions.net/api/unscannableLinks"

func getHref(t html.Token) (ok bool, href string) {
for _, a := range t.Attr {
if a.Key == "href" || a.Key == "src" {
Expand All @@ -46,31 +46,58 @@ func getHref(t html.Token) (ok bool, href string) {
return
}

func check(link Link, linkch chan LinkStatus, number int, unscannableLinks []string) {
fmt.Println("CHEC", number, link.url)

client := &http.Client{
// getClient builds a fresh *http.Client for a single link check.
//
// The client has an overall one-minute timeout and a dedicated transport with
// explicit dial, TLS-handshake, response-header and expect-continue timeouts.
// A new client (and transport) is returned on every call.
func getClient() *http.Client {
	dialer := &net.Dialer{
		Timeout:   30 * time.Second,
		KeepAlive: 30 * time.Second,
	}

	return &http.Client{
		Timeout: 1 * time.Minute,
		Transport: &http.Transport{
			// A non-nil, empty TLSNextProto map disables HTTP/2 on this transport.
			TLSNextProto: map[string]func(authority string, c *tls.Conn) http.RoundTripper{},
			// DialContext replaces the deprecated Dial field so that cancelling
			// the request context also aborts an in-flight dial.
			DialContext:           dialer.DialContext,
			TLSHandshakeTimeout:   10 * time.Second,
			ResponseHeaderTimeout: 10 * time.Second,
			ExpectContinueTimeout: 1 * time.Second,
		},
	}
}

// get list of links we consider unscannable and use a GET request to get a more accurate result
if isLinkUnscannable(link.url, unscannableLinks) {
method = "GET"
}
// addClientHeaders sets the browser-like headers sent with every link check:
// a desktop Chrome User-Agent, "Cache-Control: no-cache",
// "Connection: keep-alive" and "Accept-Encoding: *".
//
// Any existing values for these headers are replaced. A nil request is
// ignored so that a caller whose request construction failed does not panic
// here.
func addClientHeaders(r *http.Request) {
	if r == nil {
		return
	}
	// Set (not Add) so a pre-existing User-Agent is replaced rather than duplicated.
	r.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36")
	r.Header.Set("Cache-Control", "no-cache")
	r.Header.Set("Connection", "keep-alive")
	r.Header.Set("Accept-Encoding", "*")
}

func check(link Link, linkch chan LinkStatus, number int) {
fmt.Println("CHEC", number, link.url)

r, e := http.NewRequest(method, link.url, nil)
client := getClient()
defer client.CloseIdleConnections()

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
r, e := http.NewRequestWithContext(ctx, "GET", link.url, nil)
addClientHeaders(r)
r.Header.Add("Accept", "*/*")
dnsErr := new(net.DNSError)

if e != nil {
linkch <- LinkStatus{link.url, link.srcUrl, "Link Invalid", 0, link.anchor}
return
}

resp, error := client.Do(r)
resp, err := client.Do(r)

if error != nil {
linkch <- LinkStatus{link.url, link.srcUrl, "Empty Response", 0, link.anchor}
if err != nil {
fmt.Println("error: ", err)
if errors.As(err, &dnsErr) {
linkch <- LinkStatus{link.url, link.srcUrl, "Host error", 0, link.anchor}
} else {
linkch <- LinkStatus{link.url, link.srcUrl, "Unknown error", -1, link.anchor}
}
} else {
defer resp.Body.Close()
linkch <- LinkStatus{link.url, link.srcUrl, resp.Status, resp.StatusCode, link.anchor}
}
}
Expand All @@ -82,11 +109,16 @@ func crawl(link Link, ch chan Link, linkch chan LinkStatus, number int) {
Timeout: 1 * time.Minute,
}
resp, err := client.Get(link.url)
dnsErr := new(net.DNSError)

defer func() {
if err != nil {
fmt.Println("error:", err)
linkch <- LinkStatus{link.url, link.srcUrl, "Empty Response", 0, link.anchor}
fmt.Println("error: ", err)
if errors.As(err, &dnsErr) {
linkch <- LinkStatus{link.url, link.srcUrl, "Host error", 0, link.anchor}
} else {
linkch <- LinkStatus{link.url, link.srcUrl, "Unknown error", -1, link.anchor}
}
} else {
linkch <- LinkStatus{link.url, link.srcUrl, resp.Status, resp.StatusCode, link.anchor}
}
Expand Down Expand Up @@ -226,30 +258,6 @@ func sanitizeString(s string) string {
return replacer.Replace(s);
}

// isLinkUnscannable reports whether a starts with any entry of
// unscannableLinks. Both sides are lower-cased before the prefix comparison,
// so the match is case-insensitive. A nil or empty list yields false.
func isLinkUnscannable(a string, unscannableLinks []string) bool {
	// Lower-case the candidate once instead of on every iteration.
	lowered := strings.ToLower(a)
	for _, prefix := range unscannableLinks {
		if strings.HasPrefix(lowered, strings.ToLower(prefix)) {
			return true
		}
	}
	return false
}

// getUnscannableLinks fetches the list of unscannable link prefixes from
// unscannableLinksEndpoint. The response body is decoded as a JSON array of
// strings.
//
// The lookup is best-effort: on a request error, a non-2xx response or an
// undecodable body, the problem is printed to stdout and an empty slice is
// returned.
func getUnscannableLinks() []string {
	// Bound the request so an unresponsive endpoint cannot block the caller forever.
	client := &http.Client{Timeout: 30 * time.Second}

	resp, err := client.Get(unscannableLinksEndpoint)
	if err != nil {
		fmt.Println("Error getting unscannable links", err)
		return []string{}
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		fmt.Println("Error getting unscannable links", resp.Status)
		return []string{}
	}

	var linksList []string
	if err := json.NewDecoder(resp.Body).Decode(&linksList); err != nil {
		fmt.Println("Error getting unscannable links", err)
		return []string{}
	}
	return linksList
}

func main() {
allUrls := make(map[string]LinkStatus)
startUrl := Link{os.Args[1], "", "a", ""}
Expand All @@ -265,8 +273,6 @@ func main() {

start := time.Now()

unscannableLinks := getUnscannableLinks();

chUrls := make(chan Link)
chAllUrls := make(chan LinkStatus)

Expand Down Expand Up @@ -304,7 +310,7 @@ func main() {
if strings.Index(link.url, startUrl.url) == 0 && link.linkType == "a" && !isResourceFile(link.url) {
crawl(link, chUrls, chAllUrls, crawling)
} else {
check(link, chAllUrls, crawling, unscannableLinks)
check(link, chAllUrls, crawling)
}

<-concurrentGoroutines
Expand All @@ -314,7 +320,7 @@ func main() {
if strings.Index(link.url, startUrl.url) == 0 && link.linkType == "a" && !isResourceFile(link.url) {
go crawl(link, chUrls, chAllUrls, crawling)
} else {
go check(link, chAllUrls, crawling, unscannableLinks)
go check(link, chAllUrls, crawling)
}
}

Expand Down
7 changes: 5 additions & 2 deletions docker/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,11 @@ exports.processBrokenLinks = (
) => {
const __getBadResults = (allUrls) =>
allUrls
// Allow successful 2xx status code range (200-299)
.filter((url) => !((url["StatusCode"]?.startsWith('2') || url["StatusCode"]?.startsWith('3')) && url["StatusCode"]?.length === 3))
// Keep only failures: drop 2xx/3xx codes (200-399), 429, and negative or unparseable status codes
.filter((url) => {
const code = parseInt(url?.StatusCode);
return code >= 0 && code !== 429 && (code < 200 || code > 399);
})
.map((x) => ({
src: x.Source || "",
dst: x.Destination || "",
Expand Down

0 comments on commit 6aba695

Please sign in to comment.