Skip to content

Commit

Permalink
update metrics code
Browse files Browse the repository at this point in the history
  • Loading branch information
bnewbold committed Feb 27, 2025
1 parent f09d07a commit 684ae14
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 21 deletions.
34 changes: 28 additions & 6 deletions cmd/domesday/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,53 @@ import (
)

// handleCacheHits is a Prometheus counter of cache hits for ATProto handle
// resolutions. It is created (and registered) through promauto.
var handleCacheHits = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_redis_resolver_handle_cache_hits",
Name: "atproto_resolver_handle_cache_hits",
Help: "Number of cache hits for ATProto handle resolutions",
})

// handleCacheMisses is a Prometheus counter of cache misses for ATProto
// handle resolutions. It is created (and registered) through promauto.
var handleCacheMisses = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_redis_resolver_handle_cache_misses",
Name: "atproto_resolver_handle_cache_misses",
Help: "Number of cache misses for ATProto handle resolutions",
})

// handleRequestsCoalesced is a Prometheus counter of handle requests that
// were coalesced. It is created (and registered) through promauto.
var handleRequestsCoalesced = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_redis_resolver_handle_requests_coalesced",
Name: "atproto_resolver_handle_requests_coalesced",
Help: "Number of handle requests coalesced",
})

// handleResolutionErrors is a Prometheus counter of failed, non-cached handle
// resolutions. refreshHandle increments it whenever the inner resolver's
// ResolveHandle call returns an error.
var handleResolutionErrors = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_resolver_handle_resolution_errors",
Help: "Number of non-cached handle resolution errors",
})

// handleResolveDuration is a Prometheus histogram of the time spent resolving
// a handle over the network (cache hits are not observed). refreshHandle
// records the elapsed time in seconds under the "status" label, using the
// value "success" or "fail".
//
// ExponentialBucketsRange(min, max, count) yields 15 exponentially spaced
// buckets from 0.001s up to 2s; anything slower lands in the implicit +Inf
// bucket.
// NOTE(review): refreshHandle logs a resolution as "slow" only above 5s, which
// is beyond the largest finite bucket. Confirm the 2s ceiling is intended;
// ExponentialBuckets(0.001, 2, 15) (start, factor, count) would instead reach
// roughly 16s.
var handleResolveDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "atproto_resolver_handle_duration",
Help: "Time to resolve a handle from network (not cached)",
Buckets: prometheus.ExponentialBucketsRange(0.001, 2, 15),
}, []string{"status"})

// didCacheHits is a Prometheus counter of cache hits for ATProto DID
// resolutions. It is created (and registered) through promauto.
var didCacheHits = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_redis_resolver_did_cache_hits",
Name: "atproto_resolver_did_cache_hits",
Help: "Number of cache hits for ATProto DID resolutions",
})

// didCacheMisses is a Prometheus counter of cache misses for ATProto DID
// resolutions. It is created (and registered) through promauto.
var didCacheMisses = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_redis_resolver_did_cache_misses",
Name: "atproto_resolver_did_cache_misses",
Help: "Number of cache misses for ATProto DID resolutions",
})

// didRequestsCoalesced is a Prometheus counter of DID requests that were
// coalesced. It is created (and registered) through promauto.
var didRequestsCoalesced = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_redis_resolver_did_requests_coalesced",
Name: "atproto_resolver_did_requests_coalesced",
Help: "Number of DID requests coalesced",
})

// didResolutionErrors is a Prometheus counter of failed, non-cached DID
// resolutions. refreshDID increments it whenever the inner resolver's
// ResolveDIDRaw call returns an error.
var didResolutionErrors = promauto.NewCounter(prometheus.CounterOpts{
Name: "atproto_resolver_did_resolution_errors",
Help: "Number of non-cached DID resolution errors",
})

// didResolveDuration is a Prometheus histogram of the time spent resolving a
// DID over the network (cache hits are not observed). refreshDID records the
// elapsed time in seconds under the "status" label, using the value "success"
// or "fail".
//
// ExponentialBucketsRange(min, max, count) yields 15 exponentially spaced
// buckets from 0.001s up to 2s; anything slower lands in the implicit +Inf
// bucket.
// NOTE(review): refreshDID logs a resolution as "slow" only above 5s, which is
// beyond the largest finite bucket. Confirm the 2s ceiling is intended;
// ExponentialBuckets(0.001, 2, 15) (start, factor, count) would instead reach
// roughly 16s.
var didResolveDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "atproto_resolver_did_duration",
Help: "Time to resolve a DID from network (not cached)",
Buckets: prometheus.ExponentialBucketsRange(0.001, 2, 15),
}, []string{"status"})
34 changes: 31 additions & 3 deletions cmd/domesday/resolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type RedisResolver struct {
ErrTTL time.Duration
HitTTL time.Duration
InvalidHandleTTL time.Duration
Logger *slog.Logger

handleCache *cache.Cache
didCache *cache.Cache
Expand Down Expand Up @@ -99,7 +100,21 @@ func (d *RedisResolver) isDIDStale(e *didEntry) bool {
}

func (d *RedisResolver) refreshHandle(ctx context.Context, h syntax.Handle) handleEntry {
start := time.Now()
did, err := d.Inner.ResolveHandle(ctx, h)
duration := time.Since(start)

if err != nil {
d.Logger.Info("handle resolution failed", "handle", h, "duration", duration, "err", err)
handleResolutionErrors.Inc()
handleResolveDuration.WithLabelValues("fail").Observe(time.Since(start).Seconds())
} else {
handleResolveDuration.WithLabelValues("success").Observe(time.Since(start).Seconds())
}
if duration.Seconds() > 5.0 {
d.Logger.Info("slow handle resolution", "handle", h, "duration", duration)
}

he := handleEntry{
Updated: time.Now(),
DID: &did,
Expand All @@ -112,14 +127,27 @@ func (d *RedisResolver) refreshHandle(ctx context.Context, h syntax.Handle) hand
TTL: d.ErrTTL,
})
if err != nil {
slog.Error("identity cache write failed", "cache", "handle", "err", err)
d.Logger.Error("identity cache write failed", "cache", "handle", "err", err)
}
return he
}

func (d *RedisResolver) refreshDID(ctx context.Context, did syntax.DID) didEntry {

start := time.Now()
rawDoc, err := d.Inner.ResolveDIDRaw(ctx, did)
duration := time.Since(start)

if err != nil {
d.Logger.Info("DID resolution failed", "did", did, "duration", duration, "err", err)
didResolutionErrors.Inc()
didResolveDuration.WithLabelValues("fail").Observe(time.Since(start).Seconds())
} else {
didResolveDuration.WithLabelValues("success").Observe(time.Since(start).Seconds())
}
if duration.Seconds() > 5.0 {
d.Logger.Info("slow DID resolution", "did", did, "duration", duration)
}

// persist the DID lookup error, instead of processing it immediately
entry := didEntry{
Updated: time.Now(),
Expand All @@ -134,7 +162,7 @@ func (d *RedisResolver) refreshDID(ctx context.Context, did syntax.DID) didEntry
TTL: d.HitTTL,
})
if err != nil {
slog.Error("DID cache write failed", "cache", "did", "did", did, "err", err)
d.Logger.Error("DID cache write failed", "cache", "did", "did", did, "err", err)
}
return entry
}
Expand Down
26 changes: 14 additions & 12 deletions cmd/domesday/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"time"

"github.com/bluesky-social/indigo/atproto/identity"
//"github.com/bluesky-social/indigo/atproto/identity/redisdir"

"github.com/labstack/echo/v4"
"github.com/labstack/echo/v4/middleware"
Expand Down Expand Up @@ -83,6 +82,7 @@ func NewServer(config Config) (*Server, error) {
if err != nil {
return nil, err
}
redisDir.Logger = logger

// configure redis client (for firehose consumer)
redisOpt, err := redis.ParseURL(config.RedisURL)
Expand Down Expand Up @@ -149,44 +149,46 @@ func (srv *Server) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
}

// RunAPI starts the HTTP server in a background goroutine, then blocks until
// the process receives SIGINT or SIGTERM. On signal it calls srv.Shutdown and
// returns once the signal-handling goroutine has finished.
//
// The return value is always nil: errors from ListenAndServe and from
// Shutdown are logged rather than returned.
//
// NOTE(review): if ListenAndServe fails for a reason other than
// http.ErrServerClosed (for example, the address cannot be bound), the error
// is only logged and this method keeps waiting for an exit signal with no
// server running. Confirm that is intended.
func (srv *Server) RunAPI() error {
slog.Info("starting server", "bind", srv.httpd.Addr)
srv.logger.Info("starting server", "bind", srv.httpd.Addr)
// Serve in the background so this goroutine can wait for exit signals.
go func() {
if err := srv.httpd.ListenAndServe(); err != nil {
// http.ErrServerClosed is the expected result of a graceful shutdown.
if !errors.Is(err, http.ErrServerClosed) {
slog.Error("HTTP server shutting down unexpectedly", "err", err)
srv.logger.Error("HTTP server shutting down unexpectedly", "err", err)
}
}
}()

// Wait for a signal to exit.
slog.Info("registering OS exit signal handler")
srv.logger.Info("registering OS exit signal handler")
quit := make(chan struct{})
// Buffered: signal.Notify does not block when delivering, so an unbuffered
// channel could miss the signal.
exitSignals := make(chan os.Signal, 1)
signal.Notify(exitSignals, syscall.SIGINT, syscall.SIGTERM)
go func() {
sig := <-exitSignals
slog.Info("received OS exit signal", "signal", sig)
srv.logger.Info("received OS exit signal", "signal", sig)

// Shut down the HTTP server
if err := srv.Shutdown(); err != nil {
slog.Error("HTTP server shutdown error", "err", err)
srv.logger.Error("HTTP server shutdown error", "err", err)
}

// Trigger the return that causes an exit.
close(quit)
}()
<-quit
slog.Info("graceful shutdown complete")
srv.logger.Info("graceful shutdown complete")
return nil
}

// RunMetrics exposes the Prometheus handler at "/metrics" and serves it on the
// given address. It blocks in http.ListenAndServe and returns whatever error
// that call returns.
//
// The handler is registered on http.DefaultServeMux (http.Handle plus the nil
// handler passed to ListenAndServe), so anything else registered on the
// default mux is served on this listener as well, and registering the same
// path a second time panics.
func (srv *Server) RunMetrics(listen string) error {
http.Handle("/metrics", promhttp.Handler())
return http.ListenAndServe(listen, nil)
func (srv *Server) RunMetrics(bind string) error {
p := "/metrics"
srv.logger.Info("starting metrics endpoint", "bind", bind, "path", p)
http.Handle(p, promhttp.Handler())
return http.ListenAndServe(bind, nil)
}

func (srv *Server) Shutdown() error {
slog.Info("shutting down")
srv.logger.Info("shutting down")

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
Expand All @@ -207,7 +209,7 @@ func (srv *Server) errorHandler(err error, c echo.Context) {
errorMessage = fmt.Sprintf("%s", he.Message)
}
if code >= 500 {
slog.Warn("domesday-http-internal-error", "err", err)
srv.logger.Warn("domesday-http-internal-error", "err", err)
}
if !c.Response().Committed {
c.JSON(code, GenericError{Error: "InternalError", Message: errorMessage})
Expand Down

0 comments on commit 684ae14

Please sign in to comment.