Skip to content

Commit

Permalink
add worker WatchHang() to debug hangs in production conditions
Browse files Browse the repository at this point in the history
  • Loading branch information
equals215 committed Jul 16, 2024
1 parent 6906650 commit b0e155b
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 0 deletions.
25 changes: 25 additions & 0 deletions internal/pkg/crawl/worker.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package crawl

import (
"log/slog"
"sync"
"time"

"github.com/davecgh/go-spew/spew"
"github.com/google/uuid"
"github.com/internetarchive/Zeno/internal/pkg/frontier"
"github.com/internetarchive/Zeno/internal/pkg/log"
Expand Down Expand Up @@ -158,3 +160,26 @@ func (wp *WorkerPool) NewWorker(crawlParameters *Crawl) *Worker {

return worker
}

// WatchHang is a function that checks if a worker is hanging based on the last time it was seen
func (w *Worker) WatchHang() {
w.logger.Info("Starting worker hang watcher")
for {
tryLockCounter := 0
time.Sleep(5 * time.Second)
for !w.TryLock() {
if tryLockCounter > 10 && w.state.status != completed {
w.logger.Error("Worker is hanging")
spew.Fprint(w.pool.Crawl.Log.Writer(slog.LevelError), w) // This will dump the worker state to the log, this is NOT a good idea in production but it's useful for debugging
return
}
tryLockCounter++
time.Sleep(1 * time.Second)
}
if w.state.status != idle && time.Since(w.state.lastSeen) > 10*time.Second {
w.logger.Warn("Worker is hanging, stopping it")
w.Stop()
}
w.Unlock()
}
}
1 change: 1 addition & 0 deletions internal/pkg/crawl/worker_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ func (wp *WorkerPool) Start() {
worker := wp.NewWorker(wp.Crawl)
wp.Crawl.Log.Info("Starting worker", "worker", worker.ID)
go worker.Run()
go worker.WatchHang()
}
go wp.WorkerWatcher()
}
Expand Down

0 comments on commit b0e155b

Please sign in to comment.