Skip to content

Commit

Permalink
Merge branch 'main' into cli-config-enhancements
Browse files Browse the repository at this point in the history
  • Loading branch information
equals215 committed Jul 18, 2024
2 parents e4834d3 + 7b7d7d7 commit 6239ea3
Show file tree
Hide file tree
Showing 10 changed files with 119 additions and 205 deletions.
23 changes: 17 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,13 @@ go 1.22.4

require (
git.archive.org/wb/gocrawlhq v1.2.5
github.com/CorentinB/warc v0.8.39
github.com/CorentinB/warc v0.8.40
github.com/PuerkitoBio/goquery v1.9.2
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
github.com/beeker1121/goque v2.1.0+incompatible
github.com/clbanning/mxj/v2 v2.7.0
github.com/dustin/go-humanize v1.0.1
github.com/elastic/go-elasticsearch/v8 v8.14.0
github.com/gin-contrib/pprof v1.5.0
github.com/gin-gonic/gin v1.10.0
github.com/google/uuid v1.6.0
github.com/gosuri/uilive v0.0.4
github.com/gosuri/uitable v0.0.4
Expand Down Expand Up @@ -41,8 +39,6 @@ require (
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bytedance/sonic v1.11.9 // indirect
github.com/bytedance/sonic/loader v0.1.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cloudflare/circl v1.3.9 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
Expand All @@ -58,10 +54,15 @@ require (
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.22.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/elastic/elastic-transport-go/v8 v8.6.0 // indirect
github.com/fatih/color v1.17.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/goccy/go-json v0.10.3 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/gomodule/redigo v1.9.2 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
Expand All @@ -73,6 +74,7 @@ require (
github.com/klauspost/cpuid/v2 v2.2.8 // indirect
github.com/klauspost/pgzip v1.2.6 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/lestrrat-go/strftime v1.0.6 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
Expand All @@ -96,6 +98,10 @@ require (
github.com/rivo/uniseg v0.4.7 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
github.com/quic-go/quic-go v0.41.0 // indirect
github.com/refraction-networking/utls v1.6.3 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/satori/go.uuid v1.2.0 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spf13/cast v1.6.0 // indirect
Expand All @@ -111,6 +117,11 @@ require (
golang.org/x/arch v0.8.0 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
go.opentelemetry.io/otel v1.28.0 // indirect
go.opentelemetry.io/otel/metric v1.28.0 // indirect
go.opentelemetry.io/otel/trace v1.28.0 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/text v0.16.0 // indirect
Expand Down
28 changes: 17 additions & 11 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ git.archive.org/wb/gocrawlhq v1.2.5/go.mod h1:WiuNIB4Toqe8twVvwRu0fTSNC3KXFqA8/m
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/CorentinB/warc v0.8.39 h1:ZYccm4BKRle10aVPL9EU7nCVe0LImGxkK1fsqIE8Dw8=
github.com/CorentinB/warc v0.8.39/go.mod h1:Q9SHKf7pwcqzIWcxlzCtAWN8sKH+Q1BZxq1mSHJ9ttY=
github.com/CorentinB/warc v0.8.40 h1:6HIMT4jujlFTudeXtsoaFT+qJZYXeQlKdIED+c36Qpc=
github.com/CorentinB/warc v0.8.40/go.mod h1:Q9SHKf7pwcqzIWcxlzCtAWN8sKH+Q1BZxq1mSHJ9ttY=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
Expand Down Expand Up @@ -42,6 +44,9 @@ github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJ
github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY=
github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
Expand Down Expand Up @@ -237,8 +242,14 @@ github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoG
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/refraction-networking/utls v1.6.6 h1:igFsYBUJPYM8Rno9xUuDoM5GQrVEqY4llzEXOkL43Ig=
github.com/refraction-networking/utls v1.6.6/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0=
github.com/quic-go/quic-go v0.41.0 h1:aD8MmHfgqTURWNJy48IYFg2OnxwHT3JL7ahGs73lb4k=
github.com/quic-go/quic-go v0.41.0/go.mod h1:qCkNjqczPEvgsOnxZ0eCD14lv+B2LHlFAB++CNOh9hA=
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/refraction-networking/utls v1.6.3 h1:MFOfRN35sSx6K5AZNIoESsBuBxS2LCgRilRIdHb6fDc=
github.com/refraction-networking/utls v1.6.3/go.mod h1:yil9+7qSl+gBwJqztoQseO6Pr3h62pQoY1lXiNR/FPs=
github.com/remeh/sizedwaitgroup v1.0.0 h1:VNGGFwNo/R5+MJBf6yrsr110p0m4/OX4S3DCy7Kyl5E=
github.com/remeh/sizedwaitgroup v1.0.0/go.mod h1:3j2R4OIe/SeS6YDhICBy22RWjJC5eNCJ1V+9+NVNYlo=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
Expand Down Expand Up @@ -270,16 +281,9 @@ github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI=
github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
Expand All @@ -294,6 +298,10 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/urfave/cli/v2 v2.27.2 h1:6e0H+AkS+zDckwPCUrZkKX38mRaau4nL2uipkJpbkcI=
github.com/urfave/cli/v2 v2.27.2/go.mod h1:g0+79LmHHATl7DAcHO99smiR/T7uGLw84w8Y42x+4eM=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
Expand Down Expand Up @@ -323,8 +331,6 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y
golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM=
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
Expand Down Expand Up @@ -393,6 +399,8 @@ golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/api v0.3.1/go.mod h1:6wY9I6uQWHQ8EM57III9mq/AjF+i8G65rmVagqKMtkk=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
Expand Down Expand Up @@ -423,5 +431,3 @@ honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWh
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
mvdan.cc/xurls/v2 v2.5.0 h1:lyBNOm8Wo71UknhUs4QTFUNNMyxy2JEIaKKo0RWOh+8=
mvdan.cc/xurls/v2 v2.5.0/go.mod h1:yQgaGQ1rFtJUzkmKiHYSSfuQxqfYmd//X6PxvholpeE=
nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
91 changes: 46 additions & 45 deletions internal/pkg/crawl/api.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
package crawl

import (
"fmt"
"log/slog"
"encoding/json"
"os"
"strconv"
"strings"
"time"

"github.com/gin-contrib/pprof"
"github.com/gin-gonic/gin"
"net/http"
_ "net/http/pprof"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
Expand All @@ -28,81 +29,81 @@ type APIWorkerState struct {
Locked bool `json:"locked"`
}

// startAPI starts the API server for the crawl.
// startAPI starts the API server for the crawl
//
// It registers handlers for "/", "/metrics", "/workers" and "/worker/<id>",
// then serves on crawl.APIPort. A failure to start serving is reported
// through crawl.Log.Fatal.
//
// NOTE(review): this span shows gin-based statements and net/http-based
// statements side by side (it looks like both sides of a diff) — confirm
// which side is current before changing any code here.
func (crawl *Crawl) startAPI() {
gin.SetMode(gin.ReleaseMode)
gin.DefaultWriter = crawl.Log.Writer(slog.LevelInfo)
gin.DefaultErrorWriter = crawl.Log.Writer(slog.LevelError)

r := gin.Default()

pprof.Register(r)

crawl.Log.Info("Starting API")
// Root endpoint: reports crawl rate, crawled/queued counters and uptime as JSON.
r.GET("/", func(c *gin.Context) {
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
crawledSeeds := crawl.CrawledSeeds.Value()
crawledAssets := crawl.CrawledAssets.Value()

c.JSON(200, gin.H{
// Header and status are written first; the JSON body is encoded further down.
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)

response := map[string]interface{}{
"rate": crawl.URIsPerSecond.Rate(),
"crawled": crawledSeeds + crawledAssets,
"crawledSeeds": crawledSeeds,
"crawledAssets": crawledAssets,
"queued": crawl.Frontier.QueueCount.Value(),
"uptime": time.Since(crawl.StartTime).String(),
})
})

// Handle Prometheus export
if crawl.Prometheus {
labels := make(map[string]string)

labels["crawljob"] = crawl.Job
hostname, err := os.Hostname()
if err != nil {
crawl.Log.Warn("Unable to retrieve hostname of machine")
hostname = "unknown"
}
labels["host"] = hostname + ":" + crawl.APIPort

crawl.PrometheusMetrics.DownloadedURI = promauto.NewCounter(prometheus.CounterOpts{
Name: crawl.PrometheusMetrics.Prefix + "downloaded_uri_count_total",
ConstLabels: labels,
Help: "The total number of crawled URI",
})
// NOTE(review): the error returned by Encode is discarded here.
json.NewEncoder(w).Encode(response)
})

crawl.Log.Info("Starting Prometheus export")
r.GET("/metrics", gin.WrapH(promhttp.Handler()))
}
// NOTE(review): this registration is not wrapped in the crawl.Prometheus
// check used by the gin-based lines above — confirm that is intended.
http.HandleFunc("/metrics", setupPrometheus(crawl).ServeHTTP)

// Workers endpoint: GetWorkerState(-1) is encoded as the response body.
r.GET("/workers", func(c *gin.Context) {
http.HandleFunc("/workers", func(w http.ResponseWriter, r *http.Request) {
workersState := crawl.GetWorkerState(-1)
c.JSON(200, workersState)
// NOTE(review): no Content-Type header is set on this path and the
// Encode error is discarded.
json.NewEncoder(w).Encode(workersState)
})

// Single-worker endpoint: 400 for a non-integer ID, 404 when
// GetWorkerState returns nil, otherwise the worker state as JSON.
r.GET("/worker/:worker_id", func(c *gin.Context) {
workerID := c.Param("worker_id")
http.HandleFunc("/worker/", func(w http.ResponseWriter, r *http.Request) {
// The worker ID is whatever follows the "/worker/" prefix in the request path.
workerID := strings.TrimPrefix(r.URL.Path, "/worker/")
workerIDInt, err := strconv.Atoi(workerID)
if err != nil {
c.JSON(400, gin.H{
w.WriteHeader(http.StatusBadRequest)
json.NewEncoder(w).Encode(map[string]interface{}{
"error": "Unsupported worker ID",
})
return
}

workersState := crawl.GetWorkerState(workerIDInt)
if workersState == nil {
c.JSON(404, gin.H{
w.WriteHeader(http.StatusNotFound)
json.NewEncoder(w).Encode(map[string]interface{}{
"error": "Worker not found",
})
return
}

c.JSON(200, workersState)
json.NewEncoder(w).Encode(workersState)
})

err := r.Run(fmt.Sprintf(":%s", crawl.APIPort))
// A nil handler makes ListenAndServe use the default mux, which is where
// the http.HandleFunc calls above registered their routes.
err := http.ListenAndServe(":"+crawl.APIPort, nil)
if err != nil {
crawl.Log.Fatal("unable to start API", "error", err.Error())
}
}

// setupPrometheus registers the crawl's downloaded-URI counter and returns the
// handler that serves the "/metrics" endpoint.
//
// The counter is named crawl.PrometheusMetrics.Prefix + "downloaded_uri_count_total"
// and carries two constant labels: "crawljob" (crawl.Job) and "host"
// ("<hostname>:<crawl.APIPort>", with "unknown" as hostname when it cannot be
// retrieved). The created counter is stored in
// crawl.PrometheusMetrics.DownloadedURI.
//
// When crawl.PrometheusMetrics is nil there is no prefix to read and nowhere
// to store the counter, so nothing is registered and a handler answering
// 404 is returned instead of dereferencing the nil pointer.
func setupPrometheus(crawl *Crawl) http.Handler {
	// Guard against a crawl started without Prometheus metrics configured;
	// without this the field accesses below would panic.
	if crawl.PrometheusMetrics == nil {
		crawl.Log.Warn("Prometheus metrics are not configured, /metrics is disabled")
		return http.NotFoundHandler()
	}

	hostname, err := os.Hostname()
	if err != nil {
		crawl.Log.Warn("Unable to retrieve hostname of machine")
		hostname = "unknown"
	}

	labels := map[string]string{
		"crawljob": crawl.Job,
		"host":     hostname + ":" + crawl.APIPort,
	}

	crawl.PrometheusMetrics.DownloadedURI = promauto.NewCounter(prometheus.CounterOpts{
		Name:        crawl.PrometheusMetrics.Prefix + "downloaded_uri_count_total",
		ConstLabels: labels,
		Help:        "The total number of crawled URI",
	})

	crawl.Log.Info("starting Prometheus export")

	return promhttp.Handler()
}
2 changes: 1 addition & 1 deletion internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request, isRedirection
)

defer func() {
if c.Prometheus {
if c.PrometheusMetrics != nil {
c.PrometheusMetrics.DownloadedURI.Inc()
}

Expand Down
17 changes: 12 additions & 5 deletions internal/pkg/crawl/hq.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,11 @@ func (c *Crawl) HQWebsocket() {
GoVersion: utils.GetVersion().GoVersion,
})
if err != nil {
logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending identify payload to crawl HQ, trying to reconnect..")
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending identify payload to crawl HQ, trying to reconnect..")

err = c.HQClient.InitWebsocketConn()
if err != nil {
logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error initializing websocket connection to crawl HQ")
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error initializing websocket connection to crawl HQ")
}
}

Expand Down Expand Up @@ -72,7 +73,7 @@ func (c *Crawl) HQProducer() {
for {
_, err := c.HQClient.Discovered(discoveredArray, "seed", false, false)
if err != nil {
logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending payload to crawl HQ, waiting 1s then retrying..")
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
time.Sleep(time.Second)
continue
}
Expand All @@ -87,7 +88,7 @@ func (c *Crawl) HQProducer() {
for {
_, err := c.HQClient.Discovered(discoveredArray, "seed", false, false)
if err != nil {
logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending payload to crawl HQ, waiting 1s then retrying..")
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
time.Sleep(time.Second)
continue
}
Expand Down Expand Up @@ -125,7 +126,9 @@ func (c *Crawl) HQProducer() {
for {
_, err := c.HQClient.Discovered([]gocrawlhq.URL{discoveredURL}, "seed", true, false)
if err != nil {
logrus.WithFields(c.genLogFields(err, nil, nil)).Errorln("error sending payload to crawl HQ, waiting 1s then retrying..")
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
"bypassSeencheck": discoveredItem.BypassSeencheck,
})).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
time.Sleep(time.Second)
continue
}
Expand Down Expand Up @@ -177,6 +180,10 @@ func (c *Crawl) HQConsumer() {
// get batch from crawl HQ
batch, err := c.HQClient.Feed(HQBatchSize, c.HQStrategy)
if err != nil {
if strings.Contains(err.Error(), "feed is empty") {
time.Sleep(time.Second)
}

c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
"batchSize": HQBatchSize,
})).Error("error getting new URLs from crawl HQ")
Expand Down
11 changes: 10 additions & 1 deletion internal/pkg/crawl/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,16 @@ func (c *Crawl) printLiveStats() {
stats.AddRow(" - Crawled seeds:", crawledSeeds)
stats.AddRow(" - Crawled assets:", crawledAssets)
stats.AddRow(" - WARC writing queue:", c.Client.WaitGroup.Size())
stats.AddRow(" - Data:", humanize.Bytes(uint64(warc.DataTotal.Value())))
stats.AddRow(" - Data written:", humanize.Bytes(uint64(warc.DataTotal.Value())))

if !c.DisableLocalDedupe {
stats.AddRow(" - Deduped (local):", humanize.Bytes(uint64(warc.LocalDedupeTotal.Value())))
}

if c.CDXDedupeServer != "" {
stats.AddRow(" - Deduped (via CDX):", humanize.Bytes(uint64(warc.RemoteDedupeTotal.Value())))
}

stats.AddRow("", "")
stats.AddRow(" - Elapsed time:", time.Since(c.StartTime).String())
stats.AddRow(" - Allocated (heap):", bToMb(m.Alloc))
Expand Down
Loading

0 comments on commit 6239ea3

Please sign in to comment.