From 2c53654f172d29bbd34f295f874481e8adffd67b Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Wed, 27 Sep 2023 11:53:14 -0400 Subject: [PATCH 01/32] Remove extraneous tracing detail. Tidy code. --- go.mod | 8 ++++ go.sum | 14 +++++++ workers.go | 111 +++++++++++++++++++---------------------------------- 3 files changed, 62 insertions(+), 71 deletions(-) diff --git a/go.mod b/go.mod index 2c2aee7d..bb95e40c 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,7 @@ require ( go.opentelemetry.io/otel/trace v1.16.0 golang.org/x/net v0.10.0 golang.org/x/time v0.3.0 + google.golang.org/api v0.108.0 google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 google.golang.org/grpc v1.55.0 google.golang.org/protobuf v1.30.0 @@ -33,6 +34,8 @@ require ( ) require ( + cloud.google.com/go/compute v1.18.0 // indirect + cloud.google.com/go/compute/metadata v0.2.3 // indirect github.com/armon/go-metrics v0.4.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect @@ -43,10 +46,14 @@ require ( github.com/go-logr/logr v1.2.4 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/btree v1.1.1 // indirect github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.1.0 // indirect + github.com/google/uuid v1.3.0 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.2.1 // indirect + github.com/googleapis/gax-go/v2 v2.7.0 // indirect github.com/googleapis/gnostic v0.5.5 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/go-immutable-radix v1.3.1 // indirect @@ -68,6 +75,7 @@ require ( github.com/uptrace/opentelemetry-go-extra/otelutil v0.2.1 // indirect go.etcd.io/etcd/api/v3 v3.5.5 // indirect go.etcd.io/etcd/client/pkg/v3 v3.5.5 // indirect + go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.42.0 // indirect go.opentelemetry.io/otel/exporters/jaeger v1.16.0 // indirect go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.16.0 // indirect diff --git a/go.sum b/go.sum index 10c869d6..41681302 100644 --- a/go.sum +++ b/go.sum @@ -26,9 +26,12 @@ cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUM cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= cloud.google.com/go/compute v1.18.0 h1:FEigFqoDbys2cvFkZ9Fjq4gnHBP55anJ0yQyau2f9oY= +cloud.google.com/go/compute v1.18.0/go.mod h1:1X7yHxec2Ga+Ss6jPyjxRxpu2uu7PLgsOVXvgU0yacs= cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= +cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/longrunning v0.4.1 h1:v+yFJOfKC3yZdY6ZUI933pIYdhyhV8S3NpWrXWmg7jM= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= @@ -163,6 +166,7 @@ github.com/golang/glog v1.1.0 
h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= @@ -230,8 +234,13 @@ github.com/google/pprof v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLe github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.2.1 h1:RY7tHKZcRlk788d5WSo/e83gOyyy742E8GSs771ySpg= +github.com/googleapis/enterprise-certificate-proxy v0.2.1/go.mod h1:AwSRAtLfXpU5Nm3pW+v7rGDHp09LsPtGY9MduiEsR9k= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/googleapis/gax-go/v2 v2.7.0 h1:IcsPKeInNvYi7eqSaDjiZqDDKu5rsmunY0Y1YupQSSQ= +github.com/googleapis/gax-go/v2 v2.7.0/go.mod h1:TEop28CZZQ2y+c0VxMUmu1lV+fQx57QpBWsYpwqHJx8= github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2cUuW7uA/OeU= github.com/googleapis/gnostic v0.5.5 h1:9fHAtK0uDfpveeqqo1hkEZJcFvYXAiCN3UutL8F9xHw= github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= @@ -405,6 +414,7 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= @@ -430,6 +440,8 @@ go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= +go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= +go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.42.0 h1:ZOLJc06r4CB42laIXg/7udr0pbZyuAihN10A/XuiQRY= 
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.42.0/go.mod h1:5z+/ZWJQKXa9YT34fQNx5K8Hd1EoIhvtUygUQPqEOgQ= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.42.0 h1:pginetY7+onl4qN1vl0xW/V/v6OBZ0vVdH+esuJgvmM= @@ -756,6 +768,8 @@ google.golang.org/api v0.36.0/go.mod h1:+z5ficQTmoYpPn8LCUNVpK5I7hwkpjbcgqA7I34q google.golang.org/api v0.40.0/go.mod h1:fYKFpnQN0DsDSKRVRcQSDQNtqWPfM9i+zNPxepjRCQ8= google.golang.org/api v0.41.0/go.mod h1:RkxM5lITDfTzmyKFPt+wGrCJbVfniCr2ool8kTBzRTU= google.golang.org/api v0.43.0/go.mod h1:nQsDGjRXMo4lvh5hP0TKqF244gqhGcr/YSIykhUk/94= +google.golang.org/api v0.108.0 h1:WVBc/faN0DkKtR43Q/7+tPny9ZoLZdIiAyG5Q9vFClg= +google.golang.org/api v0.108.0/go.mod h1:2Ts0XTHNVWxypznxWOYUeI4g3WdP9Pk2Qk58+a/O9MY= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= diff --git a/workers.go b/workers.go index 8571a3a8..5fc23b06 100644 --- a/workers.go +++ b/workers.go @@ -38,7 +38,6 @@ package gubernator import ( "context" - "fmt" "io" "strconv" "sync" @@ -47,7 +46,6 @@ import ( "github.com/OneOfOne/xxhash" "github.com/mailgun/holster/v4/errors" "github.com/mailgun/holster/v4/setter" - "github.com/mailgun/holster/v4/tracing" "github.com/sirupsen/logrus" "go.opentelemetry.io/otel/trace" ) @@ -63,6 +61,7 @@ type WorkerPool struct { type Worker struct { name string + conf *Config cache Cache getRateLimitRequest chan *request storeRequest chan workerStoreRequest @@ -139,7 +138,7 @@ func NewWorkerPool(conf *Config) *WorkerPool { // Create workers. for i := 0; i < conf.Workers; i++ { chp.workers[i] = chp.newWorker() - go chp.worker(chp.workers[i]) + go chp.dispatch(chp.workers[i]) } return chp @@ -163,6 +162,7 @@ func (p *WorkerPool) newWorker() *Worker { const commandChannelSize = 10000 worker := &Worker{ + conf: p.conf, cache: p.conf.CacheFactory(p.workerCacheSize), getRateLimitRequest: make(chan *request, commandChannelSize), storeRequest: make(chan workerStoreRequest, commandChannelSize), @@ -187,7 +187,7 @@ func (p *WorkerPool) getWorker(key string) *Worker { // Each worker maintains its own state. // A hash ring will distribute requests to an assigned worker by key. // See: getWorker() -func (p *WorkerPool) worker(worker *Worker) { +func (p *WorkerPool) dispatch(worker *Worker) { for { // Dispatch requests from each channel. select { @@ -198,7 +198,17 @@ func (p *WorkerPool) worker(worker *Worker) { return } - p.handleGetRateLimit(req, worker.cache) + resp := new(response) + resp.rl, resp.err = worker.handleGetRateLimit(req.ctx, req.request, worker.cache) + + select { + case req.resp <- resp: + // Success. + + case <-req.ctx.Done(): + // Context canceled. 
+ trace.SpanFromContext(req.ctx).RecordError(resp.err) + } case req, ok := <-worker.storeRequest: if !ok { @@ -207,7 +217,7 @@ func (p *WorkerPool) worker(worker *Worker) { return } - p.handleStore(req, worker.cache) + worker.handleStore(req, worker.cache) case req, ok := <-worker.loadRequest: if !ok { @@ -216,7 +226,7 @@ func (p *WorkerPool) worker(worker *Worker) { return } - p.handleLoad(req, worker.cache) + worker.handleLoad(req, worker.cache) case req, ok := <-worker.addCacheItemRequest: if !ok { @@ -225,7 +235,7 @@ func (p *WorkerPool) worker(worker *Worker) { return } - p.handleAddCacheItem(req, worker.cache) + worker.handleAddCacheItem(req, worker.cache) case req, ok := <-worker.getCacheItemRequest: if !ok { @@ -234,7 +244,7 @@ func (p *WorkerPool) worker(worker *Worker) { return } - p.handleGetCacheItem(req, worker.cache) + worker.handleGetCacheItem(req, worker.cache) case <-p.done: // Clean up. @@ -274,16 +284,13 @@ func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) } // Handle request received by worker. -func (p *WorkerPool) handleGetRateLimit(handlerRequest *request, cache Cache) { - ctx := tracing.StartNamedScopeDebug(handlerRequest.ctx, "WorkerPool.handleGetRateLimit") - defer tracing.EndScope(ctx, nil) - +func (worker *Worker) handleGetRateLimit(ctx context.Context, req *RateLimitReq, cache Cache) (*RateLimitResp, error) { var rlResponse *RateLimitResp var err error - switch handlerRequest.request.Algorithm { + switch req.Algorithm { case Algorithm_TOKEN_BUCKET: - rlResponse, err = tokenBucket(ctx, p.conf.Store, cache, handlerRequest.request) + rlResponse, err = tokenBucket(ctx, worker.conf.Store, cache, req) if err != nil { msg := "Error in tokenBucket" countError(err, msg) @@ -292,7 +299,7 @@ func (p *WorkerPool) handleGetRateLimit(handlerRequest *request, cache Cache) { } case Algorithm_LEAKY_BUCKET: - rlResponse, err = leakyBucket(ctx, p.conf.Store, cache, handlerRequest.request) + rlResponse, err = leakyBucket(ctx, worker.conf.Store, cache, req) if err != nil { msg := "Error in leakyBucket" countError(err, msg) @@ -301,33 +308,18 @@ func (p *WorkerPool) handleGetRateLimit(handlerRequest *request, cache Cache) { } default: - err = errors.Errorf("Invalid rate limit algorithm '%d'", handlerRequest.request.Algorithm) + err = errors.Errorf("Invalid rate limit algorithm '%d'", req.Algorithm) trace.SpanFromContext(ctx).RecordError(err) metricCheckErrorCounter.WithLabelValues("Invalid algorithm").Add(1) } - handlerResponse := &response{ - rl: rlResponse, - err: err, - } - - select { - case handlerRequest.resp <- handlerResponse: - // Success. - - case <-ctx.Done(): - // Context canceled. - trace.SpanFromContext(ctx).RecordError(err) - } + return rlResponse, err } // Load atomically loads cache from persistent storage. // Read from persistent storage. Load into each appropriate worker's cache. // Workers are locked during this load operation to prevent race conditions. 
func (p *WorkerPool) Load(ctx context.Context) (err error) { - ctx = tracing.StartNamedScope(ctx, "WorkerPool.Load") - defer func() { tracing.EndScope(ctx, err) }() - ch, err := p.conf.Loader.Load() if err != nil { return errors.Wrap(err, "Error in loader.Load") @@ -410,10 +402,7 @@ MAIN: return nil } -func (p *WorkerPool) handleLoad(request workerLoadRequest, cache Cache) { - ctx := tracing.StartNamedScopeDebug(request.ctx, "WorkerPool.handleLoad") - defer tracing.EndScope(ctx, nil) - +func (worker *Worker) handleLoad(request workerLoadRequest, cache Cache) { MAIN: for { var item *CacheItem @@ -426,7 +415,7 @@ MAIN: } // Successfully received item. - case <-ctx.Done(): + case <-request.ctx.Done(): // Context canceled. return } @@ -440,9 +429,9 @@ MAIN: case request.response <- response: // Successfully sent response. - case <-ctx.Done(): + case <-request.ctx.Done(): // Context canceled. - trace.SpanFromContext(ctx).RecordError(ctx.Err()) + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) } } @@ -450,9 +439,6 @@ MAIN: // Save all workers' caches to persistent storage. // Workers are locked during this store operation to prevent race conditions. func (p *WorkerPool) Store(ctx context.Context) (err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "WorkerPool.Store") - defer func() { tracing.EndScope(ctx, err) }() - var wg sync.WaitGroup out := make(chan *CacheItem, 500) @@ -461,8 +447,6 @@ func (p *WorkerPool) Store(ctx context.Context) (err error) { wg.Add(1) go func(ctx context.Context, worker *Worker) { - ctx = tracing.StartNamedScope(ctx, fmt.Sprintf("%p", worker)) - defer tracing.EndScope(ctx, nil) defer wg.Done() respChan := make(chan workerStoreResponse) @@ -511,18 +495,15 @@ func (p *WorkerPool) Store(ctx context.Context) (err error) { return nil } -func (p *WorkerPool) handleStore(request workerStoreRequest, cache Cache) { - ctx := tracing.StartNamedScopeDebug(request.ctx, "WorkerPool.handleStore") - defer tracing.EndScope(ctx, nil) - +func (worker *Worker) handleStore(request workerStoreRequest, cache Cache) { for item := range cache.Each() { select { case request.out <- item: // Successfully sent item. - case <-ctx.Done(): + case <-request.ctx.Done(): // Context canceled. - trace.SpanFromContext(ctx).RecordError(ctx.Err()) + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) return } } @@ -533,17 +514,14 @@ func (p *WorkerPool) handleStore(request workerStoreRequest, cache Cache) { case request.response <- response: // Successfully sent response. - case <-ctx.Done(): + case <-request.ctx.Done(): // Context canceled. - trace.SpanFromContext(ctx).RecordError(ctx.Err()) + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) } } // AddCacheItem adds an item to the worker's cache. 
func (p *WorkerPool) AddCacheItem(ctx context.Context, key string, item *CacheItem) (err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "WorkerPool.AddCacheItem") - tracing.EndScope(ctx, err) - respChan := make(chan workerAddCacheItemResponse) worker := p.getWorker(key) req := workerAddCacheItemRequest{ @@ -573,10 +551,7 @@ func (p *WorkerPool) AddCacheItem(ctx context.Context, key string, item *CacheIt } } -func (p *WorkerPool) handleAddCacheItem(request workerAddCacheItemRequest, cache Cache) { - ctx := tracing.StartNamedScopeDebug(request.ctx, "WorkerPool.handleAddCacheItem") - defer tracing.EndScope(ctx, nil) - +func (worker *Worker) handleAddCacheItem(request workerAddCacheItemRequest, cache Cache) { exists := cache.Add(request.item) response := workerAddCacheItemResponse{exists} @@ -584,17 +559,14 @@ func (p *WorkerPool) handleAddCacheItem(request workerAddCacheItemRequest, cache case request.response <- response: // Successfully sent response. - case <-ctx.Done(): + case <-request.ctx.Done(): // Context canceled. - trace.SpanFromContext(ctx).RecordError(ctx.Err()) + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) } } // GetCacheItem gets item from worker's cache. func (p *WorkerPool) GetCacheItem(ctx context.Context, key string) (item *CacheItem, found bool, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "WorkerPool.GetCacheItem") - tracing.EndScope(ctx, err) - respChan := make(chan workerGetCacheItemResponse) worker := p.getWorker(key) req := workerGetCacheItemRequest{ @@ -624,10 +596,7 @@ func (p *WorkerPool) GetCacheItem(ctx context.Context, key string) (item *CacheI } } -func (p *WorkerPool) handleGetCacheItem(request workerGetCacheItemRequest, cache Cache) { - ctx := tracing.StartNamedScopeDebug(request.ctx, "WorkerPool.handleGetCacheItem") - defer tracing.EndScope(ctx, nil) - +func (worker *Worker) handleGetCacheItem(request workerGetCacheItemRequest, cache Cache) { item, ok := cache.GetItem(request.key) response := workerGetCacheItemResponse{item, ok} @@ -635,8 +604,8 @@ func (p *WorkerPool) handleGetCacheItem(request workerGetCacheItemRequest, cache case request.response <- response: // Successfully sent response. - case <-ctx.Done(): + case <-request.ctx.Done(): // Context canceled. - trace.SpanFromContext(ctx).RecordError(ctx.Err()) + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) } } From b6c7af4ac1dbbc42e2624a6285097e2ba9274aba Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Wed, 27 Sep 2023 16:07:37 -0400 Subject: [PATCH 02/32] Add `DisableBatching` config. Disable goroutines used for batching when batching is disabled. --- config.go | 2 ++ peer_client.go | 16 ++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/config.go b/config.go index 0e0d7cf7..438eac4c 100644 --- a/config.go +++ b/config.go @@ -53,6 +53,8 @@ type BehaviorConfig struct { BatchWait time.Duration // The max number of requests we can batch into a single peer request BatchLimit int + // DisableBatching disables batching behavior. 
+ DisableBatching bool // How long a non-owning peer should wait before syncing hits to the owning peer GlobalSyncWait time.Duration diff --git a/peer_client.go b/peer_client.go index bd08387a..c7116c30 100644 --- a/peer_client.go +++ b/peer_client.go @@ -154,7 +154,10 @@ func (c *PeerClient) connect(ctx context.Context) (err error) { } c.client = NewPeersV1Client(c.conn) c.status = peerConnected - go c.run() + + if !c.conf.Behavior.DisableBatching { + go c.runBatch() + } return nil } c.mutex.RUnlock() @@ -187,7 +190,7 @@ func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (res ) // If config asked for no batching - if HasBehavior(r.Behavior, Behavior_NO_BATCHING) { + if c.conf.Behavior.DisableBatching || HasBehavior(r.Behavior, Behavior_NO_BATCHING) { // If no metadata is provided if r.Metadata == nil { r.Metadata = make(map[string]string) @@ -368,9 +371,10 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq } } -// run waits for requests to be queued, when either c.batchWait time -// has elapsed or the queue reaches c.batchLimit. Send what is in the queue. -func (c *PeerClient) run() { +// run processes batching requests by waiting for requests to be queued. Send +// the queue as a batch when either c.batchWait time has elapsed or the queue +// reaches c.batchLimit. +func (c *PeerClient) runBatch() { var interval = NewInterval(c.conf.Behavior.BatchWait) defer interval.Stop() @@ -397,7 +401,7 @@ func (c *PeerClient) run() { "queueLen": len(queue), "batchLimit": c.conf.Behavior.BatchLimit, }). - Debug("run() reached batch limit") + Debug("runBatch() reached batch limit") ref := queue queue = nil go c.sendBatch(ctx, ref) From dbcdc2ea06507819473ebfb01528424972a16d34 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Thu, 28 Sep 2023 10:18:39 -0400 Subject: [PATCH 03/32] Log worker count. --- workers.go | 1 + 1 file changed, 1 insertion(+) diff --git a/workers.go b/workers.go index 5fc23b06..a5b97e09 100644 --- a/workers.go +++ b/workers.go @@ -136,6 +136,7 @@ func NewWorkerPool(conf *Config) *WorkerPool { } // Create workers. + logrus.Infof("Starting %d Gubernator workers...", conf.Workers) for i := 0; i < conf.Workers; i++ { chp.workers[i] = chp.newWorker() go chp.dispatch(chp.workers[i]) From 4f927d224b1a3437991ab7b4b9643cd1f324cd5d Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Thu, 28 Sep 2023 14:43:28 -0400 Subject: [PATCH 04/32] Fix logging. --- workers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workers.go b/workers.go index a5b97e09..f221ce0e 100644 --- a/workers.go +++ b/workers.go @@ -136,7 +136,7 @@ func NewWorkerPool(conf *Config) *WorkerPool { } // Create workers. - logrus.Infof("Starting %d Gubernator workers...", conf.Workers) + conf.Logger.Infof("Starting %d Gubernator workers...", conf.Workers) for i := 0; i < conf.Workers; i++ { chp.workers[i] = chp.newWorker() go chp.dispatch(chp.workers[i]) From 595eb67bc39c99d9c73c4a6cf69b2cf91963e68c Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Thu, 28 Sep 2023 14:57:05 -0400 Subject: [PATCH 05/32] PIP-2675: Update golangci-lint. 
--- Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 8ff22d78..c0580763 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,15 @@ .DEFAULT_GOAL := release VERSION=$(shell cat version) LDFLAGS="-X main.Version=$(VERSION)" -GOLINT = $(GOPATH)/bin/golangci-lint +GOLANGCI_LINT = $(GOPATH)/bin/golangci-lint +GOLANGCI_LINT_VERSION = 1.54.2 -$(GOLINT): - curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GOPATH)/bin v1.54.1 +$(GOLANGCI_LINT): + curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GOPATH)/bin $(GOLANGCI_LINT_VERSION) .PHONY: lint -lint: $(GOLINT) - $(GOLINT) run --out-format tab --path-prefix `pwd` +lint: $(GOLANGCI_LINT) + $(GOLANGCI_LINT) run .PHONY: test test: From d2e07afad659cc819b6c4e71351a20eb694bfb20 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Thu, 28 Sep 2023 14:57:23 -0400 Subject: [PATCH 06/32] PIP-2675: Better `WorkerPool` request hashing. --- workers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workers.go b/workers.go index f221ce0e..e8830a43 100644 --- a/workers.go +++ b/workers.go @@ -257,7 +257,7 @@ func (p *WorkerPool) dispatch(worker *Worker) { // GetRateLimit sends a GetRateLimit request to worker pool. func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (retval *RateLimitResp, reterr error) { // Delegate request to assigned channel based on request key. - worker := p.getWorker(rlRequest.UniqueKey) + worker := p.getWorker(rlRequest.HashKey()) handlerRequest := &request{ ctx: ctx, resp: make(chan *response, 1), From f6046d12a7609706d4f99d45a28f296f46a945ea Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Thu, 28 Sep 2023 15:11:43 -0400 Subject: [PATCH 07/32] Fix logging. --- gubernator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gubernator.go b/gubernator.go index 5efdf96c..934f1fd2 100644 --- a/gubernator.go +++ b/gubernator.go @@ -125,11 +125,11 @@ func NewV1Instance(conf Config) (s *V1Instance, err error) { return nil, err } + setter.SetDefault(&conf.Logger, logrus.WithField("category", "gubernator")) s = &V1Instance{ log: conf.Logger, conf: conf, } - setter.SetDefault(&s.log, logrus.WithField("category", "gubernator")) s.workerPool = NewWorkerPool(&conf) s.global = newGlobalManager(conf.Behaviors, s) From 9758b985cb0e1b1cef147ed638bd95ee894e8dd3 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 11:40:16 -0400 Subject: [PATCH 08/32] Reduce tracing detail. --- algorithms.go | 26 ++++++++++---------------- gubernator.go | 33 ++++++++++++--------------------- peer_client.go | 50 +++++++++++++++----------------------------------- 3 files changed, 37 insertions(+), 72 deletions(-) diff --git a/algorithms.go b/algorithms.go index 8a5442d1..42826c70 100644 --- a/algorithms.go +++ b/algorithms.go @@ -20,7 +20,6 @@ import ( "context" "github.com/mailgun/holster/v4/clock" - "github.com/mailgun/holster/v4/tracing" "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "go.opentelemetry.io/otel/attribute" @@ -29,9 +28,6 @@ import ( // Implements token bucket algorithm for rate limiting. 
https://en.wikipedia.org/wiki/Token_bucket func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "tokenBucket") - defer func() { tracing.EndScope(ctx, err) }() - span := trace.SpanFromContext(ctx) tokenBucketTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("tokenBucket")) defer tokenBucketTimer.ObserveDuration() @@ -52,6 +48,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * if ok { if item.Value == nil { msgPart := "tokenBucket: Invalid cache item; Value is nil" + span := trace.SpanFromContext(ctx) span.AddEvent(msgPart, trace.WithAttributes( attribute.String("hashKey", hashKey), attribute.String("key", r.UniqueKey), @@ -61,6 +58,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * ok = false } else if item.Key != hashKey { msgPart := "tokenBucket: Invalid cache item; key mismatch" + span := trace.SpanFromContext(ctx) span.AddEvent(msgPart, trace.WithAttributes( attribute.String("itemKey", item.Key), attribute.String("hashKey", hashKey), @@ -95,6 +93,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * t, ok := item.Value.(*TokenBucketItem) if !ok { // Client switched algorithms; perhaps due to a migration? + span := trace.SpanFromContext(ctx) span.AddEvent("Client switched algorithms; perhaps due to a migration?") c.Remove(hashKey) @@ -125,6 +124,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // If the duration config changed, update the new ExpireAt. if t.Duration != r.Duration { + span := trace.SpanFromContext(ctx) span.AddEvent("Duration changed") expire := t.CreatedAt + r.Duration if HasBehavior(r.Behavior, Behavior_DURATION_IS_GREGORIAN) { @@ -163,6 +163,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // If we are already at the limit. if rl.Remaining == 0 && r.Hits > 0 { + span := trace.SpanFromContext(ctx) span.AddEvent("Already over the limit") metricOverLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT @@ -172,6 +173,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // If requested hits takes the remainder. if t.Remaining == r.Hits { + span := trace.SpanFromContext(ctx) span.AddEvent("At the limit") t.Remaining = 0 rl.Remaining = 0 @@ -181,13 +183,13 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // If requested is more than available, then return over the limit // without updating the cache. if r.Hits > t.Remaining { + span := trace.SpanFromContext(ctx) span.AddEvent("Over the limit") metricOverLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT return rl, nil } - span.AddEvent("Under the limit") t.Remaining -= r.Hits rl.Remaining = t.Remaining return rl, nil @@ -199,10 +201,6 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // Called by tokenBucket() when adding a new item in the store. func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "tokenBucketNewItem") - defer func() { tracing.EndScope(ctx, err) }() - span := trace.SpanFromContext(ctx) - now := MillisecondNow() expire := now + r.Duration @@ -237,6 +235,7 @@ func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) // Client could be requesting that we always return OVER_LIMIT. 
if r.Hits > r.Limit { + span := trace.SpanFromContext(ctx) span.AddEvent("Over the limit") metricOverLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT @@ -255,10 +254,6 @@ func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) // Implements leaky bucket algorithm for rate limiting https://en.wikipedia.org/wiki/Leaky_bucket func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "leakyBucket") - defer func() { tracing.EndScope(ctx, err) }() - span := trace.SpanFromContext(ctx) - leakyBucketTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getRateLimit_leakyBucket")) defer leakyBucketTimer.ObserveDuration() @@ -284,6 +279,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * if ok { if item.Value == nil { msgPart := "leakyBucket: Invalid cache item; Value is nil" + span := trace.SpanFromContext(ctx) span.AddEvent(msgPart, trace.WithAttributes( attribute.String("hashKey", hashKey), attribute.String("key", r.UniqueKey), @@ -293,6 +289,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * ok = false } else if item.Key != hashKey { msgPart := "leakyBucket: Invalid cache item; key mismatch" + span := trace.SpanFromContext(ctx) span.AddEvent(msgPart, trace.WithAttributes( attribute.String("itemKey", item.Key), attribute.String("hashKey", hashKey), @@ -425,9 +422,6 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // Called by leakyBucket() when adding a new item in the store. func leakyBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "leakyBucketNewItem") - defer func() { tracing.EndScope(ctx, err) }() - now := MillisecondNow() duration := r.Duration rate := float64(duration) / float64(r.Limit) diff --git a/gubernator.go b/gubernator.go index 934f1fd2..eb58ef12 100644 --- a/gubernator.go +++ b/gubernator.go @@ -115,9 +115,7 @@ var metricBatchSendDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ // NewV1Instance instantiate a single instance of a gubernator peer and register this // instance with the provided GRPCServer. 
func NewV1Instance(conf Config) (s *V1Instance, err error) { - ctx := tracing.StartNamedScopeDebug(context.Background(), "gubernator.NewV1Instance") - defer func() { tracing.EndScope(ctx, err) }() - + ctx := context.Background() if conf.GRPCServers == nil { return nil, errors.New("at least one GRPCServer instance is required") } @@ -213,21 +211,21 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G // For each item in the request body for i, req := range r.Requests { - _ = tracing.CallNamedScopeDebug(ctx, "Iterate requests", func(ctx context.Context) error { - key := req.Name + "_" + req.UniqueKey - var peer *PeerClient - var err error + key := req.Name + "_" + req.UniqueKey + var peer *PeerClient + var err error + for { if len(req.UniqueKey) == 0 { metricCheckErrorCounter.WithLabelValues("Invalid request").Add(1) resp.Responses[i] = &RateLimitResp{Error: "field 'unique_key' cannot be empty"} - return nil + break } if len(req.Name) == 0 { metricCheckErrorCounter.WithLabelValues("Invalid request").Add(1) resp.Responses[i] = &RateLimitResp{Error: "field 'namespace' cannot be empty"} - return nil + break } if ctx.Err() != nil { @@ -236,7 +234,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G resp.Responses[i] = &RateLimitResp{ Error: err.Error(), } - return nil + break } peer, err = s.GetPeer(ctx, key) @@ -246,7 +244,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G resp.Responses[i] = &RateLimitResp{ Error: err.Error(), } - return nil + break } // If our server instance is the owner of this rate limit @@ -272,7 +270,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G // Inform the client of the owner key of the key resp.Responses[i].Metadata = map[string]string{"owner": peer.Info().GRPCAddress} - return nil + break } // Request must be forwarded to peer that owns the key. @@ -287,9 +285,8 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G Idx: i, }) } - - return nil - }) + break + } } // Wait for any async responses if any @@ -577,12 +574,6 @@ func (s *V1Instance) getLocalRateLimit(ctx context.Context, r *RateLimitReq) (*R span.SetAttributes( attribute.String("request.key", r.UniqueKey), attribute.String("request.name", r.Name), - attribute.Int64("request.algorithm", int64(r.Algorithm)), - attribute.Int64("request.behavior", int64(r.Behavior)), - attribute.Int64("request.duration", r.Duration), - attribute.Int64("request.limit", r.Limit), - attribute.Int64("request.hits", r.Hits), - attribute.Int64("request.burst", r.Burst), ) defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getLocalRateLimit")).ObserveDuration() diff --git a/peer_client.go b/peer_client.go index c7116c30..99d15787 100644 --- a/peer_client.go +++ b/peer_client.go @@ -96,9 +96,6 @@ func NewPeerClient(conf PeerConfig) *PeerClient { // Connect establishes a GRPC connection to a peer func (c *PeerClient) connect(ctx context.Context) (err error) { - ctx = tracing.StartScopeDebug(ctx) - defer func() { tracing.EndScope(ctx, err) }() - // NOTE: To future self, this mutex is used here because we need to know if the peer is disconnecting and // handle ErrClosing. Since this mutex MUST be here we take this opportunity to also see if we are connected. // Doing this here encapsulates managing the connected state to the PeerClient struct. 
Previously a PeerClient @@ -172,21 +169,19 @@ func (c *PeerClient) Info() PeerInfo { // GetPeerRateLimit forwards a rate limit request to a peer. If the rate limit has `behavior == BATCHING` configured, // this method will attempt to batch the rate limits func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScope(ctx, "PeerClient.GetPeerRateLimit") - defer func() { tracing.EndScope(ctx, err) }() span := trace.SpanFromContext(ctx) span.SetAttributes( - attribute.String("peer.GRPCAddress", c.conf.Info.GRPCAddress), - attribute.String("peer.HTTPAddress", c.conf.Info.HTTPAddress), - attribute.String("peer.Datacenter", c.conf.Info.DataCenter), + // attribute.String("peer.GRPCAddress", c.conf.Info.GRPCAddress), + // attribute.String("peer.HTTPAddress", c.conf.Info.HTTPAddress), + // attribute.String("peer.Datacenter", c.conf.Info.DataCenter), attribute.String("request.key", r.UniqueKey), attribute.String("request.name", r.Name), - attribute.Int64("request.algorithm", int64(r.Algorithm)), - attribute.Int64("request.behavior", int64(r.Behavior)), - attribute.Int64("request.duration", r.Duration), - attribute.Int64("request.limit", r.Limit), - attribute.Int64("request.hits", r.Hits), - attribute.Int64("request.burst", r.Burst), + // attribute.Int64("request.algorithm", int64(r.Algorithm)), + // attribute.Int64("request.behavior", int64(r.Behavior)), + // attribute.Int64("request.duration", r.Duration), + // attribute.Int64("request.limit", r.Limit), + // attribute.Int64("request.hits", r.Hits), + // attribute.Int64("request.burst", r.Burst), ) // If config asked for no batching @@ -222,9 +217,6 @@ func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (res // GetPeerRateLimits requests a list of rate limit statuses from a peer func (c *PeerClient) GetPeerRateLimits(ctx context.Context, r *GetPeerRateLimitsReq) (resp *GetPeerRateLimitsResp, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "PeerClient.GetPeerRateLimits") - defer func() { tracing.EndScope(ctx, err) }() - if err := c.connect(ctx); err != nil { err = errors.Wrap(err, "Error in connect") metricCheckErrorCounter.WithLabelValues("Connect error").Add(1) @@ -259,9 +251,6 @@ func (c *PeerClient) GetPeerRateLimits(ctx context.Context, r *GetPeerRateLimits // UpdatePeerGlobals sends global rate limit status updates to a peer func (c *PeerClient) UpdatePeerGlobals(ctx context.Context, r *UpdatePeerGlobalsReq) (resp *UpdatePeerGlobalsResp, err error) { - ctx = tracing.StartNamedScope(ctx, "PeerClient.UpdatePeerGlobals") - defer func() { tracing.EndScope(ctx, err) }() - if err := c.connect(ctx); err != nil { return nil, c.setLastErr(err) } @@ -314,9 +303,6 @@ func (c *PeerClient) GetLastErr() []string { } func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "PeerClient.getPeerRateLimitsBatch") - defer func() { tracing.EndScope(ctx, err) }() - funcTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("PeerClient.getPeerRateLimitsBatch")) defer funcTimer.ObserveDuration() @@ -333,12 +319,9 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq } // Wait for a response or context cancel - ctx2 := tracing.StartNamedScopeDebug(ctx, "Wait for response") - defer tracing.EndScope(ctx2, nil) - req := request{ resp: make(chan *response, 1), - ctx: ctx2, + ctx: ctx, request: r, } @@ -349,8 +332,8 @@ 
func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq select { case c.queue <- &req: // Successfully enqueued request. - case <-ctx2.Done(): - return nil, errors.Wrap(ctx2.Err(), "Context error while enqueuing request") + case <-ctx.Done(): + return nil, errors.Wrap(ctx.Err(), "Context error while enqueuing request") } c.wg.Add(1) @@ -366,8 +349,8 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq return nil, c.setLastErr(err) } return re.rl, nil - case <-ctx2.Done(): - return nil, errors.Wrap(ctx2.Err(), "Context error while waiting for response") + case <-ctx.Done(): + return nil, errors.Wrap(ctx.Err(), "Context error while waiting for response") } } @@ -432,9 +415,6 @@ func (c *PeerClient) runBatch() { // sendBatch sends the queue provided and returns the responses to // waiting go routines func (c *PeerClient) sendBatch(ctx context.Context, queue []*request) { - ctx = tracing.StartNamedScopeDebug(ctx, "PeerClient.sendBatch") - defer tracing.EndScope(ctx, nil) - batchSendTimer := prometheus.NewTimer(metricBatchSendDuration.WithLabelValues(c.conf.Info.GRPCAddress)) defer batchSendTimer.ObserveDuration() funcTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("PeerClient.sendBatch")) @@ -444,7 +424,7 @@ func (c *PeerClient) sendBatch(ctx context.Context, queue []*request) { for _, r := range queue { // NOTE: This trace has the same name because it's in a separate trace than the one above. // We link the two traces, so we can relate our rate limit trace back to the above trace. - r.ctx = tracing.StartNamedScopeDebug(r.ctx, "PeerClient.sendBatch", + r.ctx = tracing.StartNamedScope(r.ctx, "PeerClient.sendBatch", trace.WithLinks(trace.LinkFromContext(ctx))) // If no metadata is provided if r.request.Metadata == nil { From 831e4cf9416beed725e0ed071cc7d5580a9c8ede Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 13:50:22 -0400 Subject: [PATCH 09/32] Convert some metrics from summary to gauge. --- gubernator.go | 130 ++++++++++++++++++++++--------------------------- peer_client.go | 2 +- workers.go | 6 +-- 3 files changed, 63 insertions(+), 75 deletions(-) diff --git a/gubernator.go b/gubernator.go index eb58ef12..8f24885e 100644 --- a/gubernator.go +++ b/gubernator.go @@ -71,12 +71,9 @@ var metricAsyncRequestRetriesCounter = prometheus.NewCounterVec(prometheus.Count Name: "gubernator_asyncrequest_retries", Help: "The count of retries occurred in asyncRequest() forwarding a request to another peer.", }, []string{"name"}) -var metricQueueLength = prometheus.NewSummaryVec(prometheus.SummaryOpts{ +var metricQueueLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "gubernator_queue_length", Help: "The getRateLimitsBatch() queue length in PeerClient. 
This represents rate checks queued by for batching to a remote peer.", - Objectives: map[float64]float64{ - 0.99: 0.001, - }, }, []string{"peerAddr"}) var metricCheckCounter = prometheus.NewCounter(prometheus.CounterOpts{ Name: "gubernator_check_counter", @@ -86,23 +83,17 @@ var metricOverLimitCounter = prometheus.NewCounter(prometheus.CounterOpts{ Name: "gubernator_over_limit_counter", Help: "The number of rate limit checks that are over the limit.", }) -var metricConcurrentChecks = prometheus.NewSummary(prometheus.SummaryOpts{ +var metricConcurrentChecks = prometheus.NewGauge(prometheus.GaugeOpts{ Name: "gubernator_concurrent_checks_counter", Help: "The number of concurrent GetRateLimits API calls.", - Objectives: map[float64]float64{ - 0.99: 0.001, - }, }) var metricCheckErrorCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "gubernator_check_error_counter", Help: "The number of errors while checking rate limits.", }, []string{"error"}) -var metricWorkerQueueLength = prometheus.NewSummaryVec(prometheus.SummaryOpts{ +var metricWorkerQueueLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "gubernator_pool_queue_length", Help: "The number of GetRateLimit requests queued up in Gubernator workers.", - Objectives: map[float64]float64{ - 0.99: 0.001, - }, }, []string{"method", "worker"}) var metricBatchSendDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ Name: "gubernator_batch_send_duration", @@ -194,7 +185,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G concurrentCounter := atomic.AddInt64(&s.getRateLimitsCounter, 1) defer atomic.AddInt64(&s.getRateLimitsCounter, -1) span.SetAttributes(attribute.Int64("concurrentCounter", concurrentCounter)) - metricConcurrentChecks.Observe(float64(concurrentCounter)) + metricConcurrentChecks.Set(float64(concurrentCounter)) if len(r.Requests) > maxBatchSize { metricCheckErrorCounter.WithLabelValues("Request too large").Add(1) @@ -215,77 +206,74 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G var peer *PeerClient var err error - for { - if len(req.UniqueKey) == 0 { - metricCheckErrorCounter.WithLabelValues("Invalid request").Add(1) - resp.Responses[i] = &RateLimitResp{Error: "field 'unique_key' cannot be empty"} - break - } + if len(req.UniqueKey) == 0 { + metricCheckErrorCounter.WithLabelValues("Invalid request").Add(1) + resp.Responses[i] = &RateLimitResp{Error: "field 'unique_key' cannot be empty"} + continue + } - if len(req.Name) == 0 { - metricCheckErrorCounter.WithLabelValues("Invalid request").Add(1) - resp.Responses[i] = &RateLimitResp{Error: "field 'namespace' cannot be empty"} - break + if len(req.Name) == 0 { + metricCheckErrorCounter.WithLabelValues("Invalid request").Add(1) + resp.Responses[i] = &RateLimitResp{Error: "field 'namespace' cannot be empty"} + continue + } + + if ctx.Err() != nil { + err = errors.Wrap(ctx.Err(), "Error while iterating request items") + span.RecordError(err) + resp.Responses[i] = &RateLimitResp{ + Error: err.Error(), } + continue + } - if ctx.Err() != nil { - err = errors.Wrap(ctx.Err(), "Error while iterating request items") - span.RecordError(err) - resp.Responses[i] = &RateLimitResp{ - Error: err.Error(), - } - break + peer, err = s.GetPeer(ctx, key) + if err != nil { + countError(err, "Error in GetPeer") + err = errors.Wrapf(err, "Error in GetPeer, looking up peer that owns rate limit '%s'", key) + resp.Responses[i] = &RateLimitResp{ + Error: err.Error(), } + continue + } - peer, err = s.GetPeer(ctx, key) + 
// If our server instance is the owner of this rate limit + if peer.Info().IsOwner { + // Apply our rate limit algorithm to the request + metricGetRateLimitCounter.WithLabelValues("local").Add(1) + funcTimer1 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getLocalRateLimit (local)")) + resp.Responses[i], err = s.getLocalRateLimit(ctx, req) + funcTimer1.ObserveDuration() if err != nil { - countError(err, "Error in GetPeer") - err = errors.Wrapf(err, "Error in GetPeer, looking up peer that owns rate limit '%s'", key) - resp.Responses[i] = &RateLimitResp{ - Error: err.Error(), - } - break + err = errors.Wrapf(err, "Error while apply rate limit for '%s'", key) + span.RecordError(err) + resp.Responses[i] = &RateLimitResp{Error: err.Error()} } - - // If our server instance is the owner of this rate limit - if peer.Info().IsOwner { - // Apply our rate limit algorithm to the request - metricGetRateLimitCounter.WithLabelValues("local").Add(1) - funcTimer1 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getLocalRateLimit (local)")) - resp.Responses[i], err = s.getLocalRateLimit(ctx, req) - funcTimer1.ObserveDuration() + } else { + if HasBehavior(req.Behavior, Behavior_GLOBAL) { + resp.Responses[i], err = s.getGlobalRateLimit(ctx, req) if err != nil { - err = errors.Wrapf(err, "Error while apply rate limit for '%s'", key) + err = errors.Wrap(err, "Error in getGlobalRateLimit") span.RecordError(err) resp.Responses[i] = &RateLimitResp{Error: err.Error()} } - } else { - if HasBehavior(req.Behavior, Behavior_GLOBAL) { - resp.Responses[i], err = s.getGlobalRateLimit(ctx, req) - if err != nil { - err = errors.Wrap(err, "Error in getGlobalRateLimit") - span.RecordError(err) - resp.Responses[i] = &RateLimitResp{Error: err.Error()} - } - - // Inform the client of the owner key of the key - resp.Responses[i].Metadata = map[string]string{"owner": peer.Info().GRPCAddress} - break - } - // Request must be forwarded to peer that owns the key. - // Launch remote peer request in goroutine. - wg.Add(1) - go s.asyncRequest(ctx, &AsyncReq{ - AsyncCh: asyncCh, - Peer: peer, - Req: req, - WG: &wg, - Key: key, - Idx: i, - }) + // Inform the client of the owner key of the key + resp.Responses[i].Metadata = map[string]string{"owner": peer.Info().GRPCAddress} + continue } - break + + // Request must be forwarded to peer that owns the key. + // Launch remote peer request in goroutine. + wg.Add(1) + go s.asyncRequest(ctx, &AsyncReq{ + AsyncCh: asyncCh, + Peer: peer, + Req: req, + WG: &wg, + Key: key, + Idx: i, + }) } } diff --git a/peer_client.go b/peer_client.go index 99d15787..6c706f4b 100644 --- a/peer_client.go +++ b/peer_client.go @@ -327,7 +327,7 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq // Enqueue the request to be sent peerAddr := c.Info().GRPCAddress - metricQueueLength.WithLabelValues(peerAddr).Observe(float64(len(c.queue))) + metricQueueLength.WithLabelValues(peerAddr).Set(float64(len(c.queue))) select { case c.queue <- &req: diff --git a/workers.go b/workers.go index e8830a43..f3c40baa 100644 --- a/workers.go +++ b/workers.go @@ -272,7 +272,7 @@ func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) return nil, ctx.Err() } - metricWorkerQueueLength.WithLabelValues("GetRateLimit", worker.name).Observe(float64(len(worker.getRateLimitRequest))) + metricWorkerQueueLength.WithLabelValues("GetRateLimit", worker.name).Set(float64(len(worker.getRateLimitRequest))) // Wait for response. 
select { @@ -534,7 +534,7 @@ func (p *WorkerPool) AddCacheItem(ctx context.Context, key string, item *CacheIt select { case worker.addCacheItemRequest <- req: // Successfully sent request. - metricWorkerQueueLength.WithLabelValues("AddCacheItem", worker.name).Observe(float64(len(worker.addCacheItemRequest))) + metricWorkerQueueLength.WithLabelValues("AddCacheItem", worker.name).Set(float64(len(worker.addCacheItemRequest))) select { case <-respChan: @@ -579,7 +579,7 @@ func (p *WorkerPool) GetCacheItem(ctx context.Context, key string) (item *CacheI select { case worker.getCacheItemRequest <- req: // Successfully sent request. - metricWorkerQueueLength.WithLabelValues("GetCacheItem", worker.name).Observe(float64(len(worker.getCacheItemRequest))) + metricWorkerQueueLength.WithLabelValues("GetCacheItem", worker.name).Set(float64(len(worker.getCacheItemRequest))) select { case resp := <-respChan: From 2c93a90b8b3fb850bef0665dc24184b9d25565c9 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 14:02:03 -0400 Subject: [PATCH 10/32] Fix error metric on timeout. --- go.mod | 1 + go.sum | 2 ++ gubernator.go | 13 ++----------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/go.mod b/go.mod index bb95e40c..53d3075c 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/davecgh/go-spew v1.1.1 github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3 github.com/hashicorp/memberlist v0.5.0 + github.com/mailgun/errors v0.1.5 github.com/mailgun/holster/v4 v4.14.2 github.com/miekg/dns v1.1.50 github.com/pkg/errors v0.9.1 diff --git a/go.sum b/go.sum index 41681302..c5b49c1c 100644 --- a/go.sum +++ b/go.sum @@ -305,6 +305,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailgun/errors v0.1.5 h1:riRpZqfUKTdc8saXvoEg2tYkbRyZESU1KvQ3UxPbdus= +github.com/mailgun/errors v0.1.5/go.mod h1:lw+Nh4r/aoUTz6uK915FdfZJo3yq60gPiflFHNpK4NQ= github.com/mailgun/holster/v4 v4.14.2 h1:wBS+IufPnQM8EBgQtLf7ywuUmmAnaEdRujul/l7xWY4= github.com/mailgun/holster/v4 v4.14.2/go.mod h1:0Gc8+xqL3ElCsceOt7ySj1J3j9s3WLBJaV07yeE61y4= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= diff --git a/gubernator.go b/gubernator.go index 8f24885e..2b4d7671 100644 --- a/gubernator.go +++ b/gubernator.go @@ -23,8 +23,8 @@ import ( "sync" "sync/atomic" + "github.com/mailgun/errors" "github.com/mailgun/holster/v4/ctxutil" - "github.com/mailgun/holster/v4/errors" "github.com/mailgun/holster/v4/setter" "github.com/mailgun/holster/v4/syncutil" "github.com/mailgun/holster/v4/tracing" @@ -763,14 +763,5 @@ func countError(err error, defaultType string) { } func isDeadlineExceeded(err error) bool { - for { - if err == nil { - return false - } - if errors.Is(err, context.DeadlineExceeded) { - return true - } - - err = errors.Unwrap(err) - } + return errors.As(err, context.DeadlineExceeded) } From 83d3f0f8bfdcf906141e82ec68d1cf12b4c66e2e Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 14:17:56 -0400 Subject: [PATCH 11/32] Remove unnecessary buffer from `WorkerPool` command channels. 
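Dropping the 10,000-slot buffers makes every command channel unbuffered, so a send from GetRateLimit/Store/Load now blocks until the owning worker is actually ready to receive it, and the worker-queue metric is updated around each handler instead of sampling channel length. The sketch below only illustrates that unbuffered handoff; the channel and command names are made up and are not code from this repo.

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        cmds := make(chan string) // unbuffered: no hidden queue in front of the worker
        done := make(chan struct{})

        go func() {
            defer close(done)
            for c := range cmds {
                time.Sleep(10 * time.Millisecond) // simulate handling one command
                fmt.Println("handled", c)
            }
        }()

        start := time.Now()
        for i := 0; i < 3; i++ {
            // Each send blocks until the worker is ready, so callers feel
            // backpressure immediately instead of filling a 10,000-slot buffer.
            cmds <- fmt.Sprintf("GetRateLimit #%d", i)
        }
        close(cmds)
        <-done
        fmt.Println("all commands handled after", time.Since(start))
    }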
--- workers.go | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/workers.go b/workers.go index f3c40baa..42e050b7 100644 --- a/workers.go +++ b/workers.go @@ -160,16 +160,14 @@ func (p *WorkerPool) Close() error { // Create a new pool worker instance. func (p *WorkerPool) newWorker() *Worker { - const commandChannelSize = 10000 - worker := &Worker{ conf: p.conf, cache: p.conf.CacheFactory(p.workerCacheSize), - getRateLimitRequest: make(chan *request, commandChannelSize), - storeRequest: make(chan workerStoreRequest, commandChannelSize), - loadRequest: make(chan workerLoadRequest, commandChannelSize), - addCacheItemRequest: make(chan workerAddCacheItemRequest, commandChannelSize), - getCacheItemRequest: make(chan workerGetCacheItemRequest, commandChannelSize), + getRateLimitRequest: make(chan *request), + storeRequest: make(chan workerStoreRequest), + loadRequest: make(chan workerLoadRequest), + addCacheItemRequest: make(chan workerAddCacheItemRequest), + getCacheItemRequest: make(chan workerGetCacheItemRequest), } workerNumber := atomic.AddInt64(&workerCounter, 1) - 1 worker.name = strconv.FormatInt(workerNumber, 10) @@ -199,8 +197,12 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } + lenMetric := metricWorkerQueueLength.WithLabelValues("GetRateLimit", worker.name) + lenMetric.Inc() + resp := new(response) resp.rl, resp.err = worker.handleGetRateLimit(req.ctx, req.request, worker.cache) + lenMetric.Dec() select { case req.resp <- resp: @@ -218,7 +220,11 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } + lenMetric := metricWorkerQueueLength.WithLabelValues("Store", worker.name) + lenMetric.Inc() + worker.handleStore(req, worker.cache) + lenMetric.Dec() case req, ok := <-worker.loadRequest: if !ok { @@ -227,7 +233,11 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } + lenMetric := metricWorkerQueueLength.WithLabelValues("Load", worker.name) + lenMetric.Inc() + worker.handleLoad(req, worker.cache) + lenMetric.Dec() case req, ok := <-worker.addCacheItemRequest: if !ok { @@ -236,7 +246,11 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } + lenMetric := metricWorkerQueueLength.WithLabelValues("AddCacheItem", worker.name) + lenMetric.Inc() + worker.handleAddCacheItem(req, worker.cache) + lenMetric.Dec() case req, ok := <-worker.getCacheItemRequest: if !ok { @@ -245,7 +259,11 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } + lenMetric := metricWorkerQueueLength.WithLabelValues("GetCacheItem", worker.name) + lenMetric.Inc() + worker.handleGetCacheItem(req, worker.cache) + lenMetric.Dec() case <-p.done: // Clean up. @@ -272,8 +290,6 @@ func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) return nil, ctx.Err() } - metricWorkerQueueLength.WithLabelValues("GetRateLimit", worker.name).Set(float64(len(worker.getRateLimitRequest))) - // Wait for response. select { case handlerResponse := <-handlerRequest.resp: From 3b8e8825c907f66cfd139d77993bd04626fa652b Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 14:18:27 -0400 Subject: [PATCH 12/32] Allocation optimization. 
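Switching getRateLimitRequest from chan *request to chan request sends the request struct by value, which avoids forcing a fresh heap object onto the garbage collector for every call; the struct is small enough that the copy is cheap. A minimal illustration of the pattern, using made-up names rather than the repo's own types:

    package main

    import "fmt"

    // job stands in for the worker request struct; it is small enough that
    // copying it on each channel send is cheaper than heap-allocating a *job.
    type job struct {
        key  string
        hits int64
    }

    func main() {
        ch := make(chan job) // a channel of values, not pointers
        done := make(chan struct{})

        go func() {
            defer close(done)
            for j := range ch { // each receive gets its own copy of the struct
                fmt.Println("processing", j.key, j.hits)
            }
        }()

        ch <- job{key: "requests_per_sec_account:1234", hits: 1} // sent by value
        close(ch)
        <-done
    }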
--- workers.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workers.go b/workers.go index 42e050b7..db2b8367 100644 --- a/workers.go +++ b/workers.go @@ -63,7 +63,7 @@ type Worker struct { name string conf *Config cache Cache - getRateLimitRequest chan *request + getRateLimitRequest chan request storeRequest chan workerStoreRequest loadRequest chan workerLoadRequest addCacheItemRequest chan workerAddCacheItemRequest @@ -163,7 +163,7 @@ func (p *WorkerPool) newWorker() *Worker { worker := &Worker{ conf: p.conf, cache: p.conf.CacheFactory(p.workerCacheSize), - getRateLimitRequest: make(chan *request), + getRateLimitRequest: make(chan request), storeRequest: make(chan workerStoreRequest), loadRequest: make(chan workerLoadRequest), addCacheItemRequest: make(chan workerAddCacheItemRequest), @@ -276,7 +276,7 @@ func (p *WorkerPool) dispatch(worker *Worker) { func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (retval *RateLimitResp, reterr error) { // Delegate request to assigned channel based on request key. worker := p.getWorker(rlRequest.HashKey()) - handlerRequest := &request{ + handlerRequest := request{ ctx: ctx, resp: make(chan *response, 1), request: rlRequest, From 873e541895828315f0d98e175fc4946cc543e986 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 15:06:56 -0400 Subject: [PATCH 13/32] Add metric around `handleGetRateLimit`. --- gubernator.go | 2 ++ workers.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/gubernator.go b/gubernator.go index 2b4d7671..65deda73 100644 --- a/gubernator.go +++ b/gubernator.go @@ -64,7 +64,9 @@ var metricFuncTimeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ Name: "gubernator_func_duration", Help: "The timings of key functions in Gubernator in seconds.", Objectives: map[float64]float64{ + 1: 0.001, 0.99: 0.001, + 0.5: 0.01, }, }, []string{"name"}) var metricAsyncRequestRetriesCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ diff --git a/workers.go b/workers.go index db2b8367..c89e481a 100644 --- a/workers.go +++ b/workers.go @@ -46,6 +46,7 @@ import ( "github.com/OneOfOne/xxhash" "github.com/mailgun/holster/v4/errors" "github.com/mailgun/holster/v4/setter" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "go.opentelemetry.io/otel/trace" ) @@ -302,6 +303,7 @@ func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) // Handle request received by worker. func (worker *Worker) handleGetRateLimit(ctx context.Context, req *RateLimitReq, cache Cache) (*RateLimitResp, error) { + defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("handleGetRateLimit")).ObserveDuration() var rlResponse *RateLimitResp var err error From 66e8943a10ddd29a2534780df3e30460203ed8e4 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 15:07:12 -0400 Subject: [PATCH 14/32] Fix nil reference panic. --- gubernator.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gubernator.go b/gubernator.go index 65deda73..7457d5a2 100644 --- a/gubernator.go +++ b/gubernator.go @@ -765,5 +765,8 @@ func countError(err error, defaultType string) { } func isDeadlineExceeded(err error) bool { - return errors.As(err, context.DeadlineExceeded) + if err == nil { + return false + } + return errors.Is(err, context.DeadlineExceeded) } From 3bad7e88a29d311a8df7eb303cf0bb4a0f15fc51 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 29 Sep 2023 17:01:08 -0400 Subject: [PATCH 15/32] Fix unit test. 
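The test now relies on Config.SetDefaults() to supply CacheSize and a Logger instead of setting them explicitly; setter.SetDefault only fills a field that still holds its zero value, so callers that do set CacheSize are unaffected. A stripped-down sketch of that zero-value-default pattern follows; the struct and the Workers default are illustrative only, while the 50_000 CacheSize default comes from this change.

    package main

    import "fmt"

    // Config mirrors just the fields touched here; the real struct lives in config.go.
    type Config struct {
        CacheSize int
        Workers   int
    }

    // setDefault assigns def only when dst still holds its zero value, mimicking
    // the behavior this change relies on from holster's setter.SetDefault.
    func setDefault(dst *int, def int) {
        if *dst == 0 {
            *dst = def
        }
    }

    func (c *Config) SetDefaults() {
        setDefault(&c.CacheSize, 50_000)
        setDefault(&c.Workers, 4) // the real code defaults to runtime.NumCPU()
    }

    func main() {
        conf := Config{Workers: 32} // a test only sets what it cares about
        conf.SetDefaults()
        fmt.Printf("%+v\n", conf) // CacheSize picks up the 50_000 default
    }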
--- config.go | 2 ++ gubernator.go | 3 --- workers_internal_test.go | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/config.go b/config.go index 438eac4c..33f7b868 100644 --- a/config.go +++ b/config.go @@ -127,7 +127,9 @@ func (c *Config) SetDefaults() error { setter.SetDefault(&c.LocalPicker, NewReplicatedConsistentHash(nil, defaultReplicas)) setter.SetDefault(&c.RegionPicker, NewRegionPicker(nil)) + setter.SetDefault(&c.CacheSize, 50_000) setter.SetDefault(&c.Workers, runtime.NumCPU()) + setter.SetDefault(&c.Logger, logrus.New().WithField("category", "gubernator")) if c.CacheFactory == nil { c.CacheFactory = func(maxSize int) Cache { diff --git a/gubernator.go b/gubernator.go index 7457d5a2..b33e0414 100644 --- a/gubernator.go +++ b/gubernator.go @@ -25,11 +25,9 @@ import ( "github.com/mailgun/errors" "github.com/mailgun/holster/v4/ctxutil" - "github.com/mailgun/holster/v4/setter" "github.com/mailgun/holster/v4/syncutil" "github.com/mailgun/holster/v4/tracing" "github.com/prometheus/client_golang/prometheus" - "github.com/sirupsen/logrus" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/propagation" "go.opentelemetry.io/otel/trace" @@ -116,7 +114,6 @@ func NewV1Instance(conf Config) (s *V1Instance, err error) { return nil, err } - setter.SetDefault(&conf.Logger, logrus.WithField("category", "gubernator")) s = &V1Instance{ log: conf.Logger, conf: conf, diff --git a/workers_internal_test.go b/workers_internal_test.go index 05743368..9b15c73e 100644 --- a/workers_internal_test.go +++ b/workers_internal_test.go @@ -39,7 +39,6 @@ func TestWorkersInternal(t *testing.T) { const concurrency = 32 conf := &Config{ Workers: concurrency, - CacheSize: 1000, } require.NoError(t, conf.SetDefaults()) From 2573caf71224f23e08b451985f58c3ece020c45d Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Mon, 2 Oct 2023 11:05:27 -0400 Subject: [PATCH 16/32] Add metric `gubernator_command_counter` to track worker activity in `WorkerPool`. --- gubernator.go | 12 ++++++------ workers.go | 36 +++++++++++------------------------- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/gubernator.go b/gubernator.go index b33e0414..a74eec34 100644 --- a/gubernator.go +++ b/gubernator.go @@ -91,10 +91,6 @@ var metricCheckErrorCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "gubernator_check_error_counter", Help: "The number of errors while checking rate limits.", }, []string{"error"}) -var metricWorkerQueueLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "gubernator_pool_queue_length", - Help: "The number of GetRateLimit requests queued up in Gubernator workers.", -}, []string{"method", "worker"}) var metricBatchSendDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ Name: "gubernator_batch_send_duration", Help: "The timings of batch send operations to a remote peer.", @@ -102,6 +98,10 @@ var metricBatchSendDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ 0.99: 0.001, }, }, []string{"peerAddr"}) +var metricCommandCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_command_counter", + Help: "The count of commands processed by each worker in WorkerPool.", +}, []string{"worker", "method"}) // NewV1Instance instantiate a single instance of a gubernator peer and register this // instance with the provided GRPCServer. 
@@ -709,8 +709,8 @@ func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) { metricCheckErrorCounter.Describe(ch) metricOverLimitCounter.Describe(ch) metricCheckCounter.Describe(ch) - metricWorkerQueueLength.Describe(ch) metricBatchSendDuration.Describe(ch) + metricCommandCounter.Describe(ch) } // Collect fetches metrics from the server for use by prometheus @@ -725,8 +725,8 @@ func (s *V1Instance) Collect(ch chan<- prometheus.Metric) { metricCheckErrorCounter.Collect(ch) metricOverLimitCounter.Collect(ch) metricCheckCounter.Collect(ch) - metricWorkerQueueLength.Collect(ch) metricBatchSendDuration.Collect(ch) + metricCommandCounter.Collect(ch) } // HasBehavior returns true if the provided behavior is set diff --git a/workers.go b/workers.go index c89e481a..0e793c88 100644 --- a/workers.go +++ b/workers.go @@ -198,13 +198,8 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } - lenMetric := metricWorkerQueueLength.WithLabelValues("GetRateLimit", worker.name) - lenMetric.Inc() - resp := new(response) resp.rl, resp.err = worker.handleGetRateLimit(req.ctx, req.request, worker.cache) - lenMetric.Dec() - select { case req.resp <- resp: // Success. @@ -213,6 +208,7 @@ func (p *WorkerPool) dispatch(worker *Worker) { // Context canceled. trace.SpanFromContext(req.ctx).RecordError(resp.err) } + metricCommandCounter.WithLabelValues(worker.name, "GetRateLimit").Inc() case req, ok := <-worker.storeRequest: if !ok { @@ -221,11 +217,8 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } - lenMetric := metricWorkerQueueLength.WithLabelValues("Store", worker.name) - lenMetric.Inc() - worker.handleStore(req, worker.cache) - lenMetric.Dec() + metricCommandCounter.WithLabelValues(worker.name, "Store").Inc() case req, ok := <-worker.loadRequest: if !ok { @@ -234,11 +227,8 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } - lenMetric := metricWorkerQueueLength.WithLabelValues("Load", worker.name) - lenMetric.Inc() - worker.handleLoad(req, worker.cache) - lenMetric.Dec() + metricCommandCounter.WithLabelValues(worker.name, "Load").Inc() case req, ok := <-worker.addCacheItemRequest: if !ok { @@ -247,11 +237,8 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } - lenMetric := metricWorkerQueueLength.WithLabelValues("AddCacheItem", worker.name) - lenMetric.Inc() - worker.handleAddCacheItem(req, worker.cache) - lenMetric.Dec() + metricCommandCounter.WithLabelValues(worker.name, "AddCacheItem").Inc() case req, ok := <-worker.getCacheItemRequest: if !ok { @@ -260,11 +247,8 @@ func (p *WorkerPool) dispatch(worker *Worker) { return } - lenMetric := metricWorkerQueueLength.WithLabelValues("GetCacheItem", worker.name) - lenMetric.Inc() - worker.handleGetCacheItem(req, worker.cache) - lenMetric.Dec() + metricCommandCounter.WithLabelValues(worker.name, "GetCacheItem").Inc() case <-p.done: // Clean up. @@ -276,7 +260,11 @@ func (p *WorkerPool) dispatch(worker *Worker) { // GetRateLimit sends a GetRateLimit request to worker pool. func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (retval *RateLimitResp, reterr error) { // Delegate request to assigned channel based on request key. 
+ timer1 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool.GetRateLimit_1")) worker := p.getWorker(rlRequest.HashKey()) + timer1.ObserveDuration() + + timer2 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool.GetRateLimit_2")) handlerRequest := request{ ctx: ctx, resp: make(chan *response, 1), @@ -290,8 +278,10 @@ func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) case <-ctx.Done(): return nil, ctx.Err() } + timer2.ObserveDuration() // Wait for response. + defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool.GetRateLimit_3")).ObserveDuration() select { case handlerResponse := <-handlerRequest.resp: // Successfully read response. @@ -552,8 +542,6 @@ func (p *WorkerPool) AddCacheItem(ctx context.Context, key string, item *CacheIt select { case worker.addCacheItemRequest <- req: // Successfully sent request. - metricWorkerQueueLength.WithLabelValues("AddCacheItem", worker.name).Set(float64(len(worker.addCacheItemRequest))) - select { case <-respChan: // Successfully received response. @@ -597,8 +585,6 @@ func (p *WorkerPool) GetCacheItem(ctx context.Context, key string) (item *CacheI select { case worker.getCacheItemRequest <- req: // Successfully sent request. - metricWorkerQueueLength.WithLabelValues("GetCacheItem", worker.name).Set(float64(len(worker.getCacheItemRequest))) - select { case resp := <-respChan: // Successfully received response. From d7bc3a93541876c0f6c8c7649fabaa2d3cce0742 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Mon, 2 Oct 2023 13:50:26 -0400 Subject: [PATCH 17/32] Try `WorkerPool2` that uses multiple workers with a single cache for more even workload distribution. --- gubernator.go | 4 +- workers.go | 2 +- workers2.go | 492 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 495 insertions(+), 3 deletions(-) create mode 100644 workers2.go diff --git a/gubernator.go b/gubernator.go index a74eec34..76e62e1b 100644 --- a/gubernator.go +++ b/gubernator.go @@ -51,7 +51,7 @@ type V1Instance struct { conf Config isClosed bool getRateLimitsCounter int64 - workerPool *WorkerPool + workerPool *WorkerPool2 } var metricGetRateLimitCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ @@ -119,7 +119,7 @@ func NewV1Instance(conf Config) (s *V1Instance, err error) { conf: conf, } - s.workerPool = NewWorkerPool(&conf) + s.workerPool = NewWorkerPool2(&conf) s.global = newGlobalManager(conf.Behaviors, s) // Register our instance with all GRPC servers diff --git a/workers.go b/workers.go index 0e793c88..6481f957 100644 --- a/workers.go +++ b/workers.go @@ -293,7 +293,7 @@ func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) // Handle request received by worker. 
func (worker *Worker) handleGetRateLimit(ctx context.Context, req *RateLimitReq, cache Cache) (*RateLimitResp, error) { - defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("handleGetRateLimit")).ObserveDuration() + defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("Worker.handleGetRateLimit")).ObserveDuration() var rlResponse *RateLimitResp var err error diff --git a/workers2.go b/workers2.go new file mode 100644 index 00000000..ace3f4c6 --- /dev/null +++ b/workers2.go @@ -0,0 +1,492 @@ +package gubernator + +import ( + "context" + "io" + "strconv" + "sync" + "sync/atomic" + + "github.com/mailgun/holster/v4/setter" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" + "go.opentelemetry.io/otel/trace" +) + +type WorkerPool2 struct { + conf *Config + cache Cache + cacheMu sync.Mutex + done chan struct{} + getRateLimitRequest chan request + storeRequest chan workerStoreRequest + loadRequest chan workerLoadRequest + addCacheItemRequest chan workerAddCacheItemRequest + getCacheItemRequest chan workerGetCacheItemRequest + workerCounter atomic.Int64 +} + +type Worker2 struct { + name string + conf *Config +} + +var _ io.Closer = &WorkerPool2{} + +func NewWorkerPool2(conf *Config) *WorkerPool2 { + setter.SetDefault(&conf.CacheSize, 50_000) + + // Compute hashRingStep as interval between workers' 63-bit hash ranges. + // 64th bit is used here as a max value that is just out of range of 63-bit space to calculate the step. + chp := &WorkerPool2{ + conf: conf, + cache: conf.CacheFactory(conf.CacheSize), + done: make(chan struct{}), + getRateLimitRequest: make(chan request), + storeRequest: make(chan workerStoreRequest), + loadRequest: make(chan workerLoadRequest), + addCacheItemRequest: make(chan workerAddCacheItemRequest), + getCacheItemRequest: make(chan workerGetCacheItemRequest), + } + + // Create workers. + conf.Logger.Infof("Starting %d Gubernator workers...", conf.Workers) + for i := 0; i < conf.Workers; i++ { + worker := chp.newWorker() + go worker.dispatch(chp) + } + + return chp +} + +func (p *WorkerPool2) Close() error { + close(p.done) + return nil +} + +// Create a new pool worker instance. +func (p *WorkerPool2) newWorker() *Worker2 { + workerNumber := p.workerCounter.Add(1) - 1 + worker := &Worker2{ + name: strconv.FormatInt(workerNumber, 10), + conf: p.conf, + } + return worker +} + +// Pool worker for processing Gubernator requests. +// Each worker maintains its own state. +// A hash ring will distribute requests to an assigned worker by key. +// See: getWorker() +func (worker *Worker2) dispatch(p *WorkerPool2) { + for { + // Dispatch requests from each channel. + select { + case req, ok := <-p.getRateLimitRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("workerPool worker stopped because channel closed") + return + } + + resp := new(response) + p.cacheMu.Lock() + resp.rl, resp.err = worker.handleGetRateLimit(req.ctx, req.request, p.cache) + p.cacheMu.Unlock() + select { + case req.resp <- resp: + // Success. + + case <-req.ctx.Done(): + // Context canceled. + trace.SpanFromContext(req.ctx).RecordError(resp.err) + } + metricCommandCounter.WithLabelValues(worker.name, "GetRateLimit").Inc() + + case req, ok := <-p.storeRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. 
+ logrus.Error("workerPool worker stopped because channel closed") + return + } + + p.cacheMu.Lock() + worker.handleStore(req, p.cache) + p.cacheMu.Unlock() + metricCommandCounter.WithLabelValues(worker.name, "Store").Inc() + + case req, ok := <-p.loadRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("workerPool worker stopped because channel closed") + return + } + + p.cacheMu.Lock() + worker.handleLoad(req, p.cache) + p.cacheMu.Unlock() + metricCommandCounter.WithLabelValues(worker.name, "Load").Inc() + + case req, ok := <-p.addCacheItemRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("workerPool worker stopped because channel closed") + return + } + + p.cacheMu.Lock() + worker.handleAddCacheItem(req, p.cache) + p.cacheMu.Unlock() + metricCommandCounter.WithLabelValues(worker.name, "AddCacheItem").Inc() + + case req, ok := <-p.getCacheItemRequest: + if !ok { + // Channel closed. Unexpected, but should be handled. + logrus.Error("workerPool worker stopped because channel closed") + return + } + + p.cacheMu.Lock() + worker.handleGetCacheItem(req, p.cache) + p.cacheMu.Unlock() + metricCommandCounter.WithLabelValues(worker.name, "GetCacheItem").Inc() + + case <-p.done: + // Clean up. + return + } + } +} + +// GetRateLimit sends a GetRateLimit request to worker pool. +func (p *WorkerPool2) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (retval *RateLimitResp, reterr error) { + // Delegate request to assigned channel based on request key. + timer2 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool2.GetRateLimit_2")) + handlerRequest := request{ + ctx: ctx, + resp: make(chan *response, 1), + request: rlRequest, + } + + // Send request. + select { + case p.getRateLimitRequest <- handlerRequest: + // Successfully sent request. + case <-ctx.Done(): + return nil, ctx.Err() + } + timer2.ObserveDuration() + + // Wait for response. + defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool2.GetRateLimit_3")).ObserveDuration() + select { + case handlerResponse := <-handlerRequest.resp: + // Successfully read response. + return handlerResponse.rl, handlerResponse.err + case <-ctx.Done(): + return nil, ctx.Err() + } +} + +// Handle request received by worker. +func (worker *Worker2) handleGetRateLimit(ctx context.Context, req *RateLimitReq, cache Cache) (*RateLimitResp, error) { + defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("Worker2.handleGetRateLimit")).ObserveDuration() + var rlResponse *RateLimitResp + var err error + + switch req.Algorithm { + case Algorithm_TOKEN_BUCKET: + rlResponse, err = tokenBucket(ctx, worker.conf.Store, cache, req) + if err != nil { + msg := "Error in tokenBucket" + countError(err, msg) + err = errors.Wrap(err, msg) + trace.SpanFromContext(ctx).RecordError(err) + } + + case Algorithm_LEAKY_BUCKET: + rlResponse, err = leakyBucket(ctx, worker.conf.Store, cache, req) + if err != nil { + msg := "Error in leakyBucket" + countError(err, msg) + err = errors.Wrap(err, msg) + trace.SpanFromContext(ctx).RecordError(err) + } + + default: + err = errors.Errorf("Invalid rate limit algorithm '%d'", req.Algorithm) + trace.SpanFromContext(ctx).RecordError(err) + metricCheckErrorCounter.WithLabelValues("Invalid algorithm").Add(1) + } + + return rlResponse, err +} + +// Load atomically loads cache from persistent storage. +// Read from persistent storage. Load into each appropriate worker's cache. 
+// Workers are locked during this load operation to prevent race conditions. +func (p *WorkerPool2) Load(ctx context.Context) (err error) { + ch, err := p.conf.Loader.Load() + if err != nil { + return errors.Wrap(err, "Error in loader.Load") + } + + type loadChannel struct { + ch chan *CacheItem + respChan chan workerLoadResponse + } + + loadCh := loadChannel{ + ch: make(chan *CacheItem), + respChan: make(chan workerLoadResponse), + } + p.loadRequest <- workerLoadRequest{ + ctx: ctx, + response: loadCh.respChan, + in: loadCh.ch, + } + + // Send each item. +MAIN: + for { + var item *CacheItem + var ok bool + + select { + case item, ok = <-ch: + if !ok { + break MAIN + } + // Successfully received item. + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } + + // Send item to worker's load channel. + select { + case loadCh.ch <- item: + // Successfully sent item. + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } + } + + // Clean up. + close(loadCh.ch) + + select { + case <-loadCh.respChan: + // Successfully received response. + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } + + return nil +} + +func (worker *Worker2) handleLoad(request workerLoadRequest, cache Cache) { +MAIN: + for { + var item *CacheItem + var ok bool + + select { + case item, ok = <-request.in: + if !ok { + break MAIN + } + // Successfully received item. + + case <-request.ctx.Done(): + // Context canceled. + return + } + + cache.Add(item) + } + + response := workerLoadResponse{} + + select { + case request.response <- response: + // Successfully sent response. + + case <-request.ctx.Done(): + // Context canceled. + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) + } +} + +// Store atomically stores cache to persistent storage. +// Save all workers' caches to persistent storage. +// Workers are locked during this store operation to prevent race conditions. +func (p *WorkerPool2) Store(ctx context.Context) (err error) { + var wg sync.WaitGroup + out := make(chan *CacheItem, 500) + + // Iterate each worker's cache to `out` channel. + wg.Add(1) + + go func() { + defer wg.Done() + + respChan := make(chan workerStoreResponse) + req := workerStoreRequest{ + ctx: ctx, + response: respChan, + out: out, + } + + select { + case p.storeRequest <- req: + // Successfully sent request. + select { + case <-respChan: + // Successfully received response. + return + + case <-ctx.Done(): + // Context canceled. + trace.SpanFromContext(ctx).RecordError(ctx.Err()) + return + } + + case <-ctx.Done(): + // Context canceled. + trace.SpanFromContext(ctx).RecordError(ctx.Err()) + return + } + }() + + // When all iterators are done, close `out` channel. + go func() { + wg.Wait() + close(out) + }() + + if ctx.Err() != nil { + return ctx.Err() + } + + if err = p.conf.Loader.Save(out); err != nil { + return errors.Wrap(err, "while calling p.conf.Loader.Save()") + } + + return nil +} + +func (worker *Worker2) handleStore(request workerStoreRequest, cache Cache) { + for item := range cache.Each() { + select { + case request.out <- item: + // Successfully sent item. + + case <-request.ctx.Done(): + // Context canceled. + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) + return + } + } + + response := workerStoreResponse{} + + select { + case request.response <- response: + // Successfully sent response. + + case <-request.ctx.Done(): + // Context canceled. 
+ trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) + } +} + +// AddCacheItem adds an item to the worker's cache. +func (p *WorkerPool2) AddCacheItem(ctx context.Context, key string, item *CacheItem) (err error) { + respChan := make(chan workerAddCacheItemResponse) + req := workerAddCacheItemRequest{ + ctx: ctx, + response: respChan, + item: item, + } + + select { + case p.addCacheItemRequest <- req: + // Successfully sent request. + select { + case <-respChan: + // Successfully received response. + return nil + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } + + case <-ctx.Done(): + // Context canceled. + return ctx.Err() + } +} + +func (worker *Worker2) handleAddCacheItem(request workerAddCacheItemRequest, cache Cache) { + exists := cache.Add(request.item) + response := workerAddCacheItemResponse{exists} + + select { + case request.response <- response: + // Successfully sent response. + + case <-request.ctx.Done(): + // Context canceled. + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) + } +} + +// GetCacheItem gets item from worker's cache. +func (p *WorkerPool2) GetCacheItem(ctx context.Context, key string) (item *CacheItem, found bool, err error) { + respChan := make(chan workerGetCacheItemResponse) + req := workerGetCacheItemRequest{ + ctx: ctx, + response: respChan, + key: key, + } + + select { + case p.getCacheItemRequest <- req: + // Successfully sent request. + select { + case resp := <-respChan: + // Successfully received response. + return resp.item, resp.ok, nil + + case <-ctx.Done(): + // Context canceled. + return nil, false, ctx.Err() + } + + case <-ctx.Done(): + // Context canceled. + return nil, false, ctx.Err() + } +} + +func (worker *Worker2) handleGetCacheItem(request workerGetCacheItemRequest, cache Cache) { + item, ok := cache.GetItem(request.key) + response := workerGetCacheItemResponse{item, ok} + + select { + case request.response <- response: + // Successfully sent response. + + case <-request.ctx.Done(): + // Context canceled. + trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) + } +} From ab872fe01f074f5de316bc3475b635e2e35204a4 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 3 Oct 2023 09:21:58 -0400 Subject: [PATCH 18/32] PIP-2675: Add metric `gubernator_worker_queue` to track queuing in `WorkerPool2`. --- gubernator.go | 6 ++++++ workers2.go | 15 +++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/gubernator.go b/gubernator.go index 76e62e1b..fcfe91b4 100644 --- a/gubernator.go +++ b/gubernator.go @@ -102,6 +102,10 @@ var metricCommandCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "gubernator_command_counter", Help: "The count of commands processed by each worker in WorkerPool.", }, []string{"worker", "method"}) +var metricWorkerQueue = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "gubernator_worker_queue", + Help: "The count of requests queued up in WorkerPool.", +}, []string{"method"}) // NewV1Instance instantiate a single instance of a gubernator peer and register this // instance with the provided GRPCServer. 
@@ -711,6 +715,7 @@ func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) { metricCheckCounter.Describe(ch) metricBatchSendDuration.Describe(ch) metricCommandCounter.Describe(ch) + metricWorkerQueue.Describe(ch) } // Collect fetches metrics from the server for use by prometheus @@ -727,6 +732,7 @@ func (s *V1Instance) Collect(ch chan<- prometheus.Metric) { metricCheckCounter.Collect(ch) metricBatchSendDuration.Collect(ch) metricCommandCounter.Collect(ch) + metricWorkerQueue.Collect(ch) } // HasBehavior returns true if the provided behavior is set diff --git a/workers2.go b/workers2.go index ace3f4c6..b105692f 100644 --- a/workers2.go +++ b/workers2.go @@ -162,6 +162,9 @@ func (worker *Worker2) dispatch(p *WorkerPool2) { // GetRateLimit sends a GetRateLimit request to worker pool. func (p *WorkerPool2) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (retval *RateLimitResp, reterr error) { // Delegate request to assigned channel based on request key. + queueGauge := metricWorkerQueue.WithLabelValues("GetRateLimit") + queueGauge.Inc() + defer queueGauge.Dec() timer2 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool2.GetRateLimit_2")) handlerRequest := request{ ctx: ctx, @@ -227,6 +230,9 @@ func (worker *Worker2) handleGetRateLimit(ctx context.Context, req *RateLimitReq // Read from persistent storage. Load into each appropriate worker's cache. // Workers are locked during this load operation to prevent race conditions. func (p *WorkerPool2) Load(ctx context.Context) (err error) { + queueGauge := metricWorkerQueue.WithLabelValues("Load") + queueGauge.Inc() + defer queueGauge.Dec() ch, err := p.conf.Loader.Load() if err != nil { return errors.Wrap(err, "Error in loader.Load") @@ -328,6 +334,9 @@ MAIN: // Save all workers' caches to persistent storage. // Workers are locked during this store operation to prevent race conditions. func (p *WorkerPool2) Store(ctx context.Context) (err error) { + queueGauge := metricWorkerQueue.WithLabelValues("Store") + queueGauge.Inc() + defer queueGauge.Dec() var wg sync.WaitGroup out := make(chan *CacheItem, 500) @@ -409,6 +418,9 @@ func (worker *Worker2) handleStore(request workerStoreRequest, cache Cache) { // AddCacheItem adds an item to the worker's cache. func (p *WorkerPool2) AddCacheItem(ctx context.Context, key string, item *CacheItem) (err error) { + queueGauge := metricWorkerQueue.WithLabelValues("AddCacheItem") + queueGauge.Inc() + defer queueGauge.Dec() respChan := make(chan workerAddCacheItemResponse) req := workerAddCacheItemRequest{ ctx: ctx, @@ -451,6 +463,9 @@ func (worker *Worker2) handleAddCacheItem(request workerAddCacheItemRequest, cac // GetCacheItem gets item from worker's cache. func (p *WorkerPool2) GetCacheItem(ctx context.Context, key string) (item *CacheItem, found bool, err error) { + queueGauge := metricWorkerQueue.WithLabelValues("GetCacheItem") + queueGauge.Inc() + defer queueGauge.Dec() respChan := make(chan workerGetCacheItemResponse) req := workerGetCacheItemRequest{ ctx: ctx, From f8ace5cc52bbe9aec00756f8feeacfe7213cf0e0 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 3 Oct 2023 10:32:08 -0400 Subject: [PATCH 19/32] Simplify metric `gubernator_concurrent_checks_counter`. 
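Instead of tracking concurrency with a separate `atomic.AddInt64` counter, recording it as a span attribute, and calling `Set` on the gauge, the gauge is simply incremented on entry and decremented on exit of `GetRateLimits`. A short sketch of that Inc/defer-Dec pattern follows; the gauge is re-declared here for the sketch (options taken from the docs table) and the handler body is illustrative only:

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
    )

    var metricConcurrentChecks = prometheus.NewGauge(prometheus.GaugeOpts{
        Name: "gubernator_concurrent_checks_counter",
        Help: "The number of concurrent GetRateLimits API calls.",
    })

    // getRateLimits shows the Inc/defer-Dec pairing: the gauge tracks
    // in-flight calls without any extra atomic counter or Set call.
    func getRateLimits() {
        metricConcurrentChecks.Inc()
        defer metricConcurrentChecks.Dec()
        // ... check rate limits ...
    }

    func main() {
        getRateLimits()
        fmt.Println("handled one request")
    }
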
--- gubernator.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gubernator.go b/gubernator.go index fcfe91b4..841d754c 100644 --- a/gubernator.go +++ b/gubernator.go @@ -21,7 +21,6 @@ import ( "fmt" "strings" "sync" - "sync/atomic" "github.com/mailgun/errors" "github.com/mailgun/holster/v4/ctxutil" @@ -50,7 +49,6 @@ type V1Instance struct { log FieldLogger conf Config isClosed bool - getRateLimitsCounter int64 workerPool *WorkerPool2 } @@ -180,15 +178,12 @@ func (s *V1Instance) Close() (err error) { // rate limit `Name` and `UniqueKey` is not owned by this instance, then we forward the request to the // peer that does. func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*GetRateLimitsResp, error) { - span := trace.SpanFromContext(ctx) funcTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.GetRateLimits")) defer funcTimer.ObserveDuration() - concurrentCounter := atomic.AddInt64(&s.getRateLimitsCounter, 1) - defer atomic.AddInt64(&s.getRateLimitsCounter, -1) - span.SetAttributes(attribute.Int64("concurrentCounter", concurrentCounter)) - metricConcurrentChecks.Set(float64(concurrentCounter)) + metricConcurrentChecks.Inc() + defer metricConcurrentChecks.Dec() if len(r.Requests) > maxBatchSize { metricCheckErrorCounter.WithLabelValues("Request too large").Add(1) @@ -223,6 +218,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G if ctx.Err() != nil { err = errors.Wrap(ctx.Err(), "Error while iterating request items") + span := trace.SpanFromContext(ctx) span.RecordError(err) resp.Responses[i] = &RateLimitResp{ Error: err.Error(), @@ -249,6 +245,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G funcTimer1.ObserveDuration() if err != nil { err = errors.Wrapf(err, "Error while apply rate limit for '%s'", key) + span := trace.SpanFromContext(ctx) span.RecordError(err) resp.Responses[i] = &RateLimitResp{Error: err.Error()} } @@ -257,6 +254,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G resp.Responses[i], err = s.getGlobalRateLimit(ctx, req) if err != nil { err = errors.Wrap(err, "Error in getGlobalRateLimit") + span := trace.SpanFromContext(ctx) span.RecordError(err) resp.Responses[i] = &RateLimitResp{Error: err.Error()} } From 701d4dd95985f148c032ae7b0c3c35fa4b1f665d Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 3 Oct 2023 13:01:27 -0400 Subject: [PATCH 20/32] Add `ForceGlobal` config option. --- config.go | 2 ++ gubernator.go | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/config.go b/config.go index 33f7b868..2c76082d 100644 --- a/config.go +++ b/config.go @@ -62,6 +62,8 @@ type BehaviorConfig struct { GlobalTimeout time.Duration // The max number of global updates we can batch into a single peer request GlobalBatchLimit int + // ForceGlobal forces global mode on all rate limit checks. 
+ ForceGlobal bool } // Config for a gubernator instance diff --git a/gubernator.go b/gubernator.go index 841d754c..4421bf76 100644 --- a/gubernator.go +++ b/gubernator.go @@ -226,6 +226,10 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G continue } + if s.conf.Behaviors.ForceGlobal { + req.Behavior |= Behavior_GLOBAL + } + peer, err = s.GetPeer(ctx, key) if err != nil { countError(err, "Error in GetPeer") @@ -389,7 +393,7 @@ func (s *V1Instance) asyncRequest(ctx context.Context, req *AsyncReq) { // getGlobalRateLimit handles rate limits that are marked as `Behavior = GLOBAL`. Rate limit responses // are returned from the local cache and the hits are queued to be sent to the owning peer. func (s *V1Instance) getGlobalRateLimit(ctx context.Context, req *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScopeDebug(ctx, "getGlobalRateLimit") + ctx = tracing.StartNamedScope(ctx, "V1Instance.getGlobalRateLimit") defer func() { tracing.EndScope(ctx, err) }() funcTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getGlobalRateLimit")) From 01f04416f657df45fbdd4d2aa79d7b3cf6e6d787 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 3 Oct 2023 15:23:36 -0400 Subject: [PATCH 21/32] Add metric for tracking global queue length. --- global.go | 5 +++++ gubernator.go | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/global.go b/global.go index 067933e8..3e4bd049 100644 --- a/global.go +++ b/global.go @@ -187,6 +187,8 @@ func (gm *globalManager) runBroadcasts() { if len(updates) != 0 { gm.broadcastPeers(context.Background(), updates) updates = make(map[string]*RateLimitReq) + } else { + metricGlobalQueueLength.Set(0) } case <-done: return false @@ -197,9 +199,12 @@ func (gm *globalManager) runBroadcasts() { // broadcastPeers broadcasts global rate limit statuses to all other peers func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string]*RateLimitReq) { + defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("globalManager.broadcastPeers")).ObserveDuration() var req UpdatePeerGlobalsReq start := clock.Now() + metricGlobalQueueLength.Set(float64(len(updates))) + for _, r := range updates { // Copy the original since we are removing the GLOBAL behavior rl := proto.Clone(r).(*RateLimitReq) diff --git a/gubernator.go b/gubernator.go index 4421bf76..b8bf7ad0 100644 --- a/gubernator.go +++ b/gubernator.go @@ -104,6 +104,10 @@ var metricWorkerQueue = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "gubernator_worker_queue", Help: "The count of requests queued up in WorkerPool.", }, []string{"method"}) +var metricGlobalQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "gubernator_global_queue_length", + Help: "The count of requests queued up for global broadcast. This is only used for GetRateLimit requests using global behavior.", +}) // NewV1Instance instantiate a single instance of a gubernator peer and register this // instance with the provided GRPCServer. 
@@ -227,7 +231,7 @@ func (s *V1Instance) GetRateLimits(ctx context.Context, r *GetRateLimitsReq) (*G } if s.conf.Behaviors.ForceGlobal { - req.Behavior |= Behavior_GLOBAL + SetBehavior(&req.Behavior, Behavior_GLOBAL, true) } peer, err = s.GetPeer(ctx, key) @@ -718,6 +722,7 @@ func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) { metricBatchSendDuration.Describe(ch) metricCommandCounter.Describe(ch) metricWorkerQueue.Describe(ch) + metricGlobalQueueLength.Describe(ch) } // Collect fetches metrics from the server for use by prometheus @@ -735,6 +740,7 @@ func (s *V1Instance) Collect(ch chan<- prometheus.Metric) { metricBatchSendDuration.Collect(ch) metricCommandCounter.Collect(ch) metricWorkerQueue.Collect(ch) + metricGlobalQueueLength.Collect(ch) } // HasBehavior returns true if the provided behavior is set From e1e74fe30c6d1ef7c2f29e97ecda2f639cda5d48 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 6 Oct 2023 09:31:11 -0400 Subject: [PATCH 22/32] Tidy trace attributes in `getLocalRateLimit`/`getGlobalRateLimit`. --- gubernator.go | 15 ++++++++------- peer_client.go | 13 ++----------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/gubernator.go b/gubernator.go index b8bf7ad0..5966e64c 100644 --- a/gubernator.go +++ b/gubernator.go @@ -397,7 +397,10 @@ func (s *V1Instance) asyncRequest(ctx context.Context, req *AsyncReq) { // getGlobalRateLimit handles rate limits that are marked as `Behavior = GLOBAL`. Rate limit responses // are returned from the local cache and the hits are queued to be sent to the owning peer. func (s *V1Instance) getGlobalRateLimit(ctx context.Context, req *RateLimitReq) (resp *RateLimitResp, err error) { - ctx = tracing.StartNamedScope(ctx, "V1Instance.getGlobalRateLimit") + ctx = tracing.StartNamedScope(ctx, "V1Instance.getGlobalRateLimit", trace.WithAttributes( + attribute.String("ratelimit.key", req.UniqueKey), + attribute.String("ratelimit.name", req.Name), + )) defer func() { tracing.EndScope(ctx, err) }() funcTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getGlobalRateLimit")) @@ -566,12 +569,10 @@ func (s *V1Instance) HealthCheck(ctx context.Context, r *HealthCheckReq) (health } func (s *V1Instance) getLocalRateLimit(ctx context.Context, r *RateLimitReq) (*RateLimitResp, error) { - ctx = tracing.StartNamedScope(ctx, "V1Instance.getLocalRateLimit") - span := trace.SpanFromContext(ctx) - span.SetAttributes( - attribute.String("request.key", r.UniqueKey), - attribute.String("request.name", r.Name), - ) + ctx = tracing.StartNamedScope(ctx, "V1Instance.getLocalRateLimit", trace.WithAttributes( + attribute.String("ratelimit.key", r.UniqueKey), + attribute.String("ratelimit.name", r.Name), + )) defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getLocalRateLimit")).ObserveDuration() metricCheckCounter.Add(1) diff --git a/peer_client.go b/peer_client.go index 6c706f4b..18a5efc8 100644 --- a/peer_client.go +++ b/peer_client.go @@ -171,17 +171,8 @@ func (c *PeerClient) Info() PeerInfo { func (c *PeerClient) GetPeerRateLimit(ctx context.Context, r *RateLimitReq) (resp *RateLimitResp, err error) { span := trace.SpanFromContext(ctx) span.SetAttributes( - // attribute.String("peer.GRPCAddress", c.conf.Info.GRPCAddress), - // attribute.String("peer.HTTPAddress", c.conf.Info.HTTPAddress), - // attribute.String("peer.Datacenter", c.conf.Info.DataCenter), - attribute.String("request.key", r.UniqueKey), - attribute.String("request.name", r.Name), - // attribute.Int64("request.algorithm", 
int64(r.Algorithm)), - // attribute.Int64("request.behavior", int64(r.Behavior)), - // attribute.Int64("request.duration", r.Duration), - // attribute.Int64("request.limit", r.Limit), - // attribute.Int64("request.hits", r.Hits), - // attribute.Int64("request.burst", r.Burst), + attribute.String("ratelimit.key", r.UniqueKey), + attribute.String("ratelimit.name", r.Name), ) // If config asked for no batching From fdbca472f67b079d52ba3e1350513e0e0341293c Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Fri, 6 Oct 2023 17:29:11 -0400 Subject: [PATCH 23/32] Add metric for global broadcast counts. --- global.go | 21 ++++++++++++++------- gubernator.go | 10 ++++++---- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/global.go b/global.go index 3e4bd049..1fc619e1 100644 --- a/global.go +++ b/global.go @@ -37,23 +37,28 @@ type globalManager struct { log FieldLogger instance *V1Instance - asyncMetrics prometheus.Summary - broadcastMetrics prometheus.Summary + metricAsyncDuration prometheus.Summary + metricBroadcastDuration prometheus.Summary + metricBroadcastCounter *prometheus.CounterVec } func newGlobalManager(conf BehaviorConfig, instance *V1Instance) *globalManager { gm := globalManager{ log: instance.log, - asyncMetrics: prometheus.NewSummary(prometheus.SummaryOpts{ + metricAsyncDuration: prometheus.NewSummary(prometheus.SummaryOpts{ Help: "The duration of GLOBAL async sends in seconds.", Name: "gubernator_async_durations", Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, }), - broadcastMetrics: prometheus.NewSummary(prometheus.SummaryOpts{ + metricBroadcastDuration: prometheus.NewSummary(prometheus.SummaryOpts{ Help: "The duration of GLOBAL broadcasts to peers in seconds.", Name: "gubernator_broadcast_durations", Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, }), + metricBroadcastCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ + Help: "The count of broadcasts.", + Name: "gubernator_broadcast_counter", + }, []string{"condition"}), asyncQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), broadcastQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), instance: instance, @@ -157,7 +162,7 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) { continue } } - gm.asyncMetrics.Observe(time.Since(start).Seconds()) + gm.metricAsyncDuration.Observe(time.Since(start).Seconds()) } // runBroadcasts collects status changes for global rate limits and broadcasts the changes to each peer in the cluster. 
@@ -171,7 +176,8 @@ func (gm *globalManager) runBroadcasts() { updates[r.HashKey()] = r // Send the hits if we reached our batch limit - if len(updates) == gm.conf.GlobalBatchLimit { + if len(updates) >= gm.conf.GlobalBatchLimit { + gm.metricBroadcastCounter.WithLabelValues("queue_full").Inc() gm.broadcastPeers(context.Background(), updates) updates = make(map[string]*RateLimitReq) return true @@ -185,6 +191,7 @@ func (gm *globalManager) runBroadcasts() { case <-interval.C: if len(updates) != 0 { + gm.metricBroadcastCounter.WithLabelValues("timer").Inc() gm.broadcastPeers(context.Background(), updates) updates = make(map[string]*RateLimitReq) } else { @@ -245,7 +252,7 @@ func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string] } } - gm.broadcastMetrics.Observe(time.Since(start).Seconds()) + gm.metricBroadcastDuration.Observe(time.Since(start).Seconds()) } func (gm *globalManager) Close() { diff --git a/gubernator.go b/gubernator.go index 5966e64c..5cc0b3ad 100644 --- a/gubernator.go +++ b/gubernator.go @@ -710,8 +710,9 @@ func (s *V1Instance) GetRegionPickers() map[string]PeerPicker { // Describe fetches prometheus metrics to be registered func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) { - ch <- s.global.asyncMetrics.Desc() - ch <- s.global.broadcastMetrics.Desc() + ch <- s.global.metricAsyncDuration.Desc() + ch <- s.global.metricBroadcastDuration.Desc() + s.global.metricBroadcastCounter.Describe(ch) metricGetRateLimitCounter.Describe(ch) metricFuncTimeDuration.Describe(ch) metricAsyncRequestRetriesCounter.Describe(ch) @@ -728,8 +729,9 @@ func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) { // Collect fetches metrics from the server for use by prometheus func (s *V1Instance) Collect(ch chan<- prometheus.Metric) { - ch <- s.global.asyncMetrics - ch <- s.global.broadcastMetrics + ch <- s.global.metricAsyncDuration + ch <- s.global.metricBroadcastDuration + s.global.metricBroadcastCounter.Collect(ch) metricGetRateLimitCounter.Collect(ch) metricFuncTimeDuration.Collect(ch) metricAsyncRequestRetriesCounter.Collect(ch) From 5aaf36e652c9a55816c0f8575353a23dbcde84cb Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Mon, 9 Oct 2023 10:17:14 -0400 Subject: [PATCH 24/32] Configuration by env vars. --- config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config.go b/config.go index 2c76082d..49f53042 100644 --- a/config.go +++ b/config.go @@ -339,10 +339,12 @@ func SetupDaemonConfig(logger *logrus.Logger, configFile string) (DaemonConfig, setter.SetDefault(&conf.Behaviors.BatchTimeout, getEnvDuration(log, "GUBER_BATCH_TIMEOUT")) setter.SetDefault(&conf.Behaviors.BatchLimit, getEnvInteger(log, "GUBER_BATCH_LIMIT")) setter.SetDefault(&conf.Behaviors.BatchWait, getEnvDuration(log, "GUBER_BATCH_WAIT")) + setter.SetDefault(&conf.Behaviors.DisableBatching, getEnvBool(log, "GUBER_DISABLE_BATCHING")) setter.SetDefault(&conf.Behaviors.GlobalTimeout, getEnvDuration(log, "GUBER_GLOBAL_TIMEOUT")) setter.SetDefault(&conf.Behaviors.GlobalBatchLimit, getEnvInteger(log, "GUBER_GLOBAL_BATCH_LIMIT")) setter.SetDefault(&conf.Behaviors.GlobalSyncWait, getEnvDuration(log, "GUBER_GLOBAL_SYNC_WAIT")) + setter.SetDefault(&conf.Behaviors.ForceGlobal, getEnvBool(log, "GUBER_FORCE_GLOBAL")) // TLS Config if anyHasPrefix("GUBER_TLS_", os.Environ()) { From a963c28219b8f6f93bdeb6484100b975a446d680 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 10 Oct 2023 10:00:50 -0400 Subject: [PATCH 25/32] Try switching back to original `WorkerPool`. 
--- gubernator.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gubernator.go b/gubernator.go index 5cc0b3ad..3476f8da 100644 --- a/gubernator.go +++ b/gubernator.go @@ -49,7 +49,7 @@ type V1Instance struct { log FieldLogger conf Config isClosed bool - workerPool *WorkerPool2 + workerPool *WorkerPool } var metricGetRateLimitCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ @@ -125,7 +125,7 @@ func NewV1Instance(conf Config) (s *V1Instance, err error) { conf: conf, } - s.workerPool = NewWorkerPool2(&conf) + s.workerPool = NewWorkerPool(&conf) s.global = newGlobalManager(conf.Behaviors, s) // Register our instance with all GRPC servers From 69f7e5a5cfe3ca6d31842479b9a73a56fa47e27a Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 10 Oct 2023 10:52:46 -0400 Subject: [PATCH 26/32] Tidy code. --- global.go | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/global.go b/global.go index 1fc619e1..a10a7cf4 100644 --- a/global.go +++ b/global.go @@ -18,9 +18,7 @@ package gubernator import ( "context" - "time" - "github.com/mailgun/holster/v4/clock" "github.com/mailgun/holster/v4/ctxutil" "github.com/mailgun/holster/v4/syncutil" "github.com/prometheus/client_golang/prometheus" @@ -46,18 +44,18 @@ func newGlobalManager(conf BehaviorConfig, instance *V1Instance) *globalManager gm := globalManager{ log: instance.log, metricAsyncDuration: prometheus.NewSummary(prometheus.SummaryOpts{ - Help: "The duration of GLOBAL async sends in seconds.", Name: "gubernator_async_durations", + Help: "The duration of GLOBAL async sends in seconds.", Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, }), metricBroadcastDuration: prometheus.NewSummary(prometheus.SummaryOpts{ - Help: "The duration of GLOBAL broadcasts to peers in seconds.", Name: "gubernator_broadcast_durations", + Help: "The duration of GLOBAL broadcasts to peers in seconds.", Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, }), metricBroadcastCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ - Help: "The count of broadcasts.", Name: "gubernator_broadcast_counter", + Help: "The count of broadcasts.", }, []string{"condition"}), asyncQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), broadcastQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), @@ -128,8 +126,8 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) { client *PeerClient req GetPeerRateLimitsReq } + defer prometheus.NewTimer(gm.metricAsyncDuration).ObserveDuration() peerRequests := make(map[string]*pair) - start := clock.Now() // Assign each request to a peer for _, r := range hits { @@ -162,7 +160,6 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) { continue } } - gm.metricAsyncDuration.Observe(time.Since(start).Seconds()) } // runBroadcasts collects status changes for global rate limits and broadcasts the changes to each peer in the cluster. 
@@ -206,9 +203,8 @@ func (gm *globalManager) runBroadcasts() { // broadcastPeers broadcasts global rate limit statuses to all other peers func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string]*RateLimitReq) { - defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("globalManager.broadcastPeers")).ObserveDuration() + defer prometheus.NewTimer(gm.metricBroadcastDuration).ObserveDuration() var req UpdatePeerGlobalsReq - start := clock.Now() metricGlobalQueueLength.Set(float64(len(updates))) @@ -239,7 +235,7 @@ func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string] continue } - ctx, cancel := ctxutil.WithTimeout(context.Background(), gm.conf.GlobalTimeout) + ctx, cancel := ctxutil.WithTimeout(ctx, gm.conf.GlobalTimeout) _, err := peer.UpdatePeerGlobals(ctx, &req) cancel() @@ -251,8 +247,6 @@ func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string] continue } } - - gm.metricBroadcastDuration.Observe(time.Since(start).Seconds()) } func (gm *globalManager) Close() { From 348e44b78c6389af21b214d278988f29147664b0 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 10 Oct 2023 11:03:50 -0400 Subject: [PATCH 27/32] Remove experimental `WorkerPool2`. --- workers2.go | 507 ---------------------------------------------------- 1 file changed, 507 deletions(-) delete mode 100644 workers2.go diff --git a/workers2.go b/workers2.go deleted file mode 100644 index b105692f..00000000 --- a/workers2.go +++ /dev/null @@ -1,507 +0,0 @@ -package gubernator - -import ( - "context" - "io" - "strconv" - "sync" - "sync/atomic" - - "github.com/mailgun/holster/v4/setter" - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" - "github.com/sirupsen/logrus" - "go.opentelemetry.io/otel/trace" -) - -type WorkerPool2 struct { - conf *Config - cache Cache - cacheMu sync.Mutex - done chan struct{} - getRateLimitRequest chan request - storeRequest chan workerStoreRequest - loadRequest chan workerLoadRequest - addCacheItemRequest chan workerAddCacheItemRequest - getCacheItemRequest chan workerGetCacheItemRequest - workerCounter atomic.Int64 -} - -type Worker2 struct { - name string - conf *Config -} - -var _ io.Closer = &WorkerPool2{} - -func NewWorkerPool2(conf *Config) *WorkerPool2 { - setter.SetDefault(&conf.CacheSize, 50_000) - - // Compute hashRingStep as interval between workers' 63-bit hash ranges. - // 64th bit is used here as a max value that is just out of range of 63-bit space to calculate the step. - chp := &WorkerPool2{ - conf: conf, - cache: conf.CacheFactory(conf.CacheSize), - done: make(chan struct{}), - getRateLimitRequest: make(chan request), - storeRequest: make(chan workerStoreRequest), - loadRequest: make(chan workerLoadRequest), - addCacheItemRequest: make(chan workerAddCacheItemRequest), - getCacheItemRequest: make(chan workerGetCacheItemRequest), - } - - // Create workers. - conf.Logger.Infof("Starting %d Gubernator workers...", conf.Workers) - for i := 0; i < conf.Workers; i++ { - worker := chp.newWorker() - go worker.dispatch(chp) - } - - return chp -} - -func (p *WorkerPool2) Close() error { - close(p.done) - return nil -} - -// Create a new pool worker instance. -func (p *WorkerPool2) newWorker() *Worker2 { - workerNumber := p.workerCounter.Add(1) - 1 - worker := &Worker2{ - name: strconv.FormatInt(workerNumber, 10), - conf: p.conf, - } - return worker -} - -// Pool worker for processing Gubernator requests. -// Each worker maintains its own state. 
-// A hash ring will distribute requests to an assigned worker by key. -// See: getWorker() -func (worker *Worker2) dispatch(p *WorkerPool2) { - for { - // Dispatch requests from each channel. - select { - case req, ok := <-p.getRateLimitRequest: - if !ok { - // Channel closed. Unexpected, but should be handled. - logrus.Error("workerPool worker stopped because channel closed") - return - } - - resp := new(response) - p.cacheMu.Lock() - resp.rl, resp.err = worker.handleGetRateLimit(req.ctx, req.request, p.cache) - p.cacheMu.Unlock() - select { - case req.resp <- resp: - // Success. - - case <-req.ctx.Done(): - // Context canceled. - trace.SpanFromContext(req.ctx).RecordError(resp.err) - } - metricCommandCounter.WithLabelValues(worker.name, "GetRateLimit").Inc() - - case req, ok := <-p.storeRequest: - if !ok { - // Channel closed. Unexpected, but should be handled. - logrus.Error("workerPool worker stopped because channel closed") - return - } - - p.cacheMu.Lock() - worker.handleStore(req, p.cache) - p.cacheMu.Unlock() - metricCommandCounter.WithLabelValues(worker.name, "Store").Inc() - - case req, ok := <-p.loadRequest: - if !ok { - // Channel closed. Unexpected, but should be handled. - logrus.Error("workerPool worker stopped because channel closed") - return - } - - p.cacheMu.Lock() - worker.handleLoad(req, p.cache) - p.cacheMu.Unlock() - metricCommandCounter.WithLabelValues(worker.name, "Load").Inc() - - case req, ok := <-p.addCacheItemRequest: - if !ok { - // Channel closed. Unexpected, but should be handled. - logrus.Error("workerPool worker stopped because channel closed") - return - } - - p.cacheMu.Lock() - worker.handleAddCacheItem(req, p.cache) - p.cacheMu.Unlock() - metricCommandCounter.WithLabelValues(worker.name, "AddCacheItem").Inc() - - case req, ok := <-p.getCacheItemRequest: - if !ok { - // Channel closed. Unexpected, but should be handled. - logrus.Error("workerPool worker stopped because channel closed") - return - } - - p.cacheMu.Lock() - worker.handleGetCacheItem(req, p.cache) - p.cacheMu.Unlock() - metricCommandCounter.WithLabelValues(worker.name, "GetCacheItem").Inc() - - case <-p.done: - // Clean up. - return - } - } -} - -// GetRateLimit sends a GetRateLimit request to worker pool. -func (p *WorkerPool2) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (retval *RateLimitResp, reterr error) { - // Delegate request to assigned channel based on request key. - queueGauge := metricWorkerQueue.WithLabelValues("GetRateLimit") - queueGauge.Inc() - defer queueGauge.Dec() - timer2 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool2.GetRateLimit_2")) - handlerRequest := request{ - ctx: ctx, - resp: make(chan *response, 1), - request: rlRequest, - } - - // Send request. - select { - case p.getRateLimitRequest <- handlerRequest: - // Successfully sent request. - case <-ctx.Done(): - return nil, ctx.Err() - } - timer2.ObserveDuration() - - // Wait for response. - defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool2.GetRateLimit_3")).ObserveDuration() - select { - case handlerResponse := <-handlerRequest.resp: - // Successfully read response. - return handlerResponse.rl, handlerResponse.err - case <-ctx.Done(): - return nil, ctx.Err() - } -} - -// Handle request received by worker. 
-func (worker *Worker2) handleGetRateLimit(ctx context.Context, req *RateLimitReq, cache Cache) (*RateLimitResp, error) { - defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("Worker2.handleGetRateLimit")).ObserveDuration() - var rlResponse *RateLimitResp - var err error - - switch req.Algorithm { - case Algorithm_TOKEN_BUCKET: - rlResponse, err = tokenBucket(ctx, worker.conf.Store, cache, req) - if err != nil { - msg := "Error in tokenBucket" - countError(err, msg) - err = errors.Wrap(err, msg) - trace.SpanFromContext(ctx).RecordError(err) - } - - case Algorithm_LEAKY_BUCKET: - rlResponse, err = leakyBucket(ctx, worker.conf.Store, cache, req) - if err != nil { - msg := "Error in leakyBucket" - countError(err, msg) - err = errors.Wrap(err, msg) - trace.SpanFromContext(ctx).RecordError(err) - } - - default: - err = errors.Errorf("Invalid rate limit algorithm '%d'", req.Algorithm) - trace.SpanFromContext(ctx).RecordError(err) - metricCheckErrorCounter.WithLabelValues("Invalid algorithm").Add(1) - } - - return rlResponse, err -} - -// Load atomically loads cache from persistent storage. -// Read from persistent storage. Load into each appropriate worker's cache. -// Workers are locked during this load operation to prevent race conditions. -func (p *WorkerPool2) Load(ctx context.Context) (err error) { - queueGauge := metricWorkerQueue.WithLabelValues("Load") - queueGauge.Inc() - defer queueGauge.Dec() - ch, err := p.conf.Loader.Load() - if err != nil { - return errors.Wrap(err, "Error in loader.Load") - } - - type loadChannel struct { - ch chan *CacheItem - respChan chan workerLoadResponse - } - - loadCh := loadChannel{ - ch: make(chan *CacheItem), - respChan: make(chan workerLoadResponse), - } - p.loadRequest <- workerLoadRequest{ - ctx: ctx, - response: loadCh.respChan, - in: loadCh.ch, - } - - // Send each item. -MAIN: - for { - var item *CacheItem - var ok bool - - select { - case item, ok = <-ch: - if !ok { - break MAIN - } - // Successfully received item. - - case <-ctx.Done(): - // Context canceled. - return ctx.Err() - } - - // Send item to worker's load channel. - select { - case loadCh.ch <- item: - // Successfully sent item. - - case <-ctx.Done(): - // Context canceled. - return ctx.Err() - } - } - - // Clean up. - close(loadCh.ch) - - select { - case <-loadCh.respChan: - // Successfully received response. - - case <-ctx.Done(): - // Context canceled. - return ctx.Err() - } - - return nil -} - -func (worker *Worker2) handleLoad(request workerLoadRequest, cache Cache) { -MAIN: - for { - var item *CacheItem - var ok bool - - select { - case item, ok = <-request.in: - if !ok { - break MAIN - } - // Successfully received item. - - case <-request.ctx.Done(): - // Context canceled. - return - } - - cache.Add(item) - } - - response := workerLoadResponse{} - - select { - case request.response <- response: - // Successfully sent response. - - case <-request.ctx.Done(): - // Context canceled. - trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) - } -} - -// Store atomically stores cache to persistent storage. -// Save all workers' caches to persistent storage. -// Workers are locked during this store operation to prevent race conditions. -func (p *WorkerPool2) Store(ctx context.Context) (err error) { - queueGauge := metricWorkerQueue.WithLabelValues("Store") - queueGauge.Inc() - defer queueGauge.Dec() - var wg sync.WaitGroup - out := make(chan *CacheItem, 500) - - // Iterate each worker's cache to `out` channel. 
- wg.Add(1) - - go func() { - defer wg.Done() - - respChan := make(chan workerStoreResponse) - req := workerStoreRequest{ - ctx: ctx, - response: respChan, - out: out, - } - - select { - case p.storeRequest <- req: - // Successfully sent request. - select { - case <-respChan: - // Successfully received response. - return - - case <-ctx.Done(): - // Context canceled. - trace.SpanFromContext(ctx).RecordError(ctx.Err()) - return - } - - case <-ctx.Done(): - // Context canceled. - trace.SpanFromContext(ctx).RecordError(ctx.Err()) - return - } - }() - - // When all iterators are done, close `out` channel. - go func() { - wg.Wait() - close(out) - }() - - if ctx.Err() != nil { - return ctx.Err() - } - - if err = p.conf.Loader.Save(out); err != nil { - return errors.Wrap(err, "while calling p.conf.Loader.Save()") - } - - return nil -} - -func (worker *Worker2) handleStore(request workerStoreRequest, cache Cache) { - for item := range cache.Each() { - select { - case request.out <- item: - // Successfully sent item. - - case <-request.ctx.Done(): - // Context canceled. - trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) - return - } - } - - response := workerStoreResponse{} - - select { - case request.response <- response: - // Successfully sent response. - - case <-request.ctx.Done(): - // Context canceled. - trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) - } -} - -// AddCacheItem adds an item to the worker's cache. -func (p *WorkerPool2) AddCacheItem(ctx context.Context, key string, item *CacheItem) (err error) { - queueGauge := metricWorkerQueue.WithLabelValues("AddCacheItem") - queueGauge.Inc() - defer queueGauge.Dec() - respChan := make(chan workerAddCacheItemResponse) - req := workerAddCacheItemRequest{ - ctx: ctx, - response: respChan, - item: item, - } - - select { - case p.addCacheItemRequest <- req: - // Successfully sent request. - select { - case <-respChan: - // Successfully received response. - return nil - - case <-ctx.Done(): - // Context canceled. - return ctx.Err() - } - - case <-ctx.Done(): - // Context canceled. - return ctx.Err() - } -} - -func (worker *Worker2) handleAddCacheItem(request workerAddCacheItemRequest, cache Cache) { - exists := cache.Add(request.item) - response := workerAddCacheItemResponse{exists} - - select { - case request.response <- response: - // Successfully sent response. - - case <-request.ctx.Done(): - // Context canceled. - trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) - } -} - -// GetCacheItem gets item from worker's cache. -func (p *WorkerPool2) GetCacheItem(ctx context.Context, key string) (item *CacheItem, found bool, err error) { - queueGauge := metricWorkerQueue.WithLabelValues("GetCacheItem") - queueGauge.Inc() - defer queueGauge.Dec() - respChan := make(chan workerGetCacheItemResponse) - req := workerGetCacheItemRequest{ - ctx: ctx, - response: respChan, - key: key, - } - - select { - case p.getCacheItemRequest <- req: - // Successfully sent request. - select { - case resp := <-respChan: - // Successfully received response. - return resp.item, resp.ok, nil - - case <-ctx.Done(): - // Context canceled. - return nil, false, ctx.Err() - } - - case <-ctx.Done(): - // Context canceled. - return nil, false, ctx.Err() - } -} - -func (worker *Worker2) handleGetCacheItem(request workerGetCacheItemRequest, cache Cache) { - item, ok := cache.GetItem(request.key) - response := workerGetCacheItemResponse{item, ok} - - select { - case request.response <- response: - // Successfully sent response. 
- - case <-request.ctx.Done(): - // Context canceled. - trace.SpanFromContext(request.ctx).RecordError(request.ctx.Err()) - } -} From 2d13aa2714368bdea24578a6c7a80f88f3856bc9 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 10 Oct 2023 14:49:19 -0400 Subject: [PATCH 28/32] Clean up metrics. Rename, remove, add. Document in prometheus.md. --- docs/prometheus.md | 31 +++++--- global.go | 26 +----- gubernator.go | 192 ++++++++++++++++++++++++--------------------- peer_client.go | 2 +- workers.go | 25 ++++-- 5 files changed, 143 insertions(+), 133 deletions(-) diff --git a/docs/prometheus.md b/docs/prometheus.md index 411b8c2d..bcdaf2be 100644 --- a/docs/prometheus.md +++ b/docs/prometheus.md @@ -16,19 +16,28 @@ Finally, configure a Prometheus job to scrape the server's `/metrics` URI. | Metric | Type | Description | | -------------------------------------- | ------- | ----------- | -| `gubernator_async_durations` | Summary | The timings of GLOBAL async sends in seconds. | -| `gubernator_asyncrequest_retries` | Counter | The count of retries occurred in asyncRequests() forwarding a request to another peer. | -| `gubernator_batch_send_duration` | Summary | The timings of batch send operations to a remote peer. | -| `gubernator_broadcast_durations` | Summary | The timings of GLOBAL broadcasts to peers in seconds. | | `gubernator_cache_access_count` | Counter | The count of LRUCache accesses during rate checks. | | `gubernator_cache_size` | Gauge | The number of items in LRU Cache which holds the rate limits. | -| `gubernator_check_counter` | Counter | The number of rate limits checked. | | `gubernator_check_error_counter` | Counter | The number of errors while checking rate limits. | -| `gubernator_concurrent_checks_counter` | Summary | 99th quantile of concurrent rate checks. This includes rate checks processed locally and forwarded to other peers. | -| `gubernator_func_duration` | Summary | The 99th quantile of key function timings in seconds. | -| `gubernator_getratelimit_counter` | Counter | The count of getRateLimit() calls. Label \"calltype\" may be \"local\" for calls handled by the same peer, \"forward\" for calls forwarded to another peer, or \"global\" for global rate limits. | +| `gubernator_command_counter` | Counter | The count of commands processed by each worker in WorkerPool. | +| `gubernator_concurrent_checks_counter` | Gauge | The number of concurrent GetRateLimits API calls. | +| `gubernator_func_duration` | Summary | The timings of key functions in Gubernator in seconds. | +| `gubernator_getratelimit_counter` | Counter | The count of getLocalRateLimit() calls. Label \"calltype\" may be \"local\" for calls handled by the same peer, \"forward\" for calls forwarded to another peer, or \"global\" for global rate limits. | | `gubernator_grpc_request_counts` | Counter | The count of gRPC requests. | -| `gubernator_grpc_request_duration` | Summary | The 99th quantile timings of gRPC requests in seconds. | +| `gubernator_grpc_request_duration` | Summary | The timings of gRPC requests in seconds. | | `gubernator_over_limit_counter` | Counter | The number of rate limit checks that are over the limit. | -| `gubernator_pool_queue_length` | Summary | The 99th quantile of rate check requests queued up in GubernatorPool. The is the work queue for local rate checks. | -| `gubernator_queue_length` | Summary | The 99th quantile of rate check requests queued up for batching to other peers by getPeerRateLimitsBatch(). This is the work queue for remote rate checks. 
Label "peerAddr" indicates queued requests to that peer. | +| `gubernator_worker_queue_length` | Gauge | The count of requests queued up in WorkerPool. | + +### Global Behavior +| Metric | Type | Description | +| -------------------------------------- | ------- | ----------- | +| `gubernator_broadcast_counter` | Counter | The count of broadcasts. | +| `gubernator_broadcast_duration` | Summary | The timings of GLOBAL broadcasts to peers in seconds. | +| `gubernator_global_queue_length` | Gauge | The count of requests queued up for global broadcast. This is only used for GetRateLimit requests using global behavior. | + +### Batch Behavior +| Metric | Type | Description | +| -------------------------------------- | ------- | ----------- | +| `gubernator_batch_queue_length` | Gauge | The getRateLimitsBatch() queue length in PeerClient. This represents rate checks queued by for batching to a remote peer. | +| `gubernator_batch_send_duration` | Summary | The timings of batch send operations to a remote peer. | +| `gubernator_batch_send_retries` | Counter | The count of retries occurred in asyncRequests() forwarding a request to another peer. | diff --git a/global.go b/global.go index a10a7cf4..72a82aaf 100644 --- a/global.go +++ b/global.go @@ -34,29 +34,11 @@ type globalManager struct { conf BehaviorConfig log FieldLogger instance *V1Instance - - metricAsyncDuration prometheus.Summary - metricBroadcastDuration prometheus.Summary - metricBroadcastCounter *prometheus.CounterVec } func newGlobalManager(conf BehaviorConfig, instance *V1Instance) *globalManager { gm := globalManager{ log: instance.log, - metricAsyncDuration: prometheus.NewSummary(prometheus.SummaryOpts{ - Name: "gubernator_async_durations", - Help: "The duration of GLOBAL async sends in seconds.", - Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, - }), - metricBroadcastDuration: prometheus.NewSummary(prometheus.SummaryOpts{ - Name: "gubernator_broadcast_durations", - Help: "The duration of GLOBAL broadcasts to peers in seconds.", - Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, - }), - metricBroadcastCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "gubernator_broadcast_counter", - Help: "The count of broadcasts.", - }, []string{"condition"}), asyncQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), broadcastQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), instance: instance, @@ -126,7 +108,7 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) { client *PeerClient req GetPeerRateLimitsReq } - defer prometheus.NewTimer(gm.metricAsyncDuration).ObserveDuration() + defer prometheus.NewTimer(metricGlobalSendDuration).ObserveDuration() peerRequests := make(map[string]*pair) // Assign each request to a peer @@ -174,7 +156,7 @@ func (gm *globalManager) runBroadcasts() { // Send the hits if we reached our batch limit if len(updates) >= gm.conf.GlobalBatchLimit { - gm.metricBroadcastCounter.WithLabelValues("queue_full").Inc() + metricBroadcastCounter.WithLabelValues("queue_full").Inc() gm.broadcastPeers(context.Background(), updates) updates = make(map[string]*RateLimitReq) return true @@ -188,7 +170,7 @@ func (gm *globalManager) runBroadcasts() { case <-interval.C: if len(updates) != 0 { - gm.metricBroadcastCounter.WithLabelValues("timer").Inc() + metricBroadcastCounter.WithLabelValues("timer").Inc() gm.broadcastPeers(context.Background(), updates) updates = make(map[string]*RateLimitReq) } else { @@ -203,7 +185,7 @@ func (gm *globalManager) runBroadcasts() { // broadcastPeers 
broadcasts global rate limit statuses to all other peers func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string]*RateLimitReq) { - defer prometheus.NewTimer(gm.metricBroadcastDuration).ObserveDuration() + defer prometheus.NewTimer(metricBroadcastDuration).ObserveDuration() var req UpdatePeerGlobalsReq metricGlobalQueueLength.Set(float64(len(updates))) diff --git a/gubernator.go b/gubernator.go index 3476f8da..41aa5fea 100644 --- a/gubernator.go +++ b/gubernator.go @@ -44,70 +44,86 @@ const ( type V1Instance struct { UnimplementedV1Server UnimplementedPeersV1Server - global *globalManager - peerMutex sync.RWMutex - log FieldLogger - conf Config - isClosed bool - workerPool *WorkerPool + global *globalManager + peerMutex sync.RWMutex + log FieldLogger + conf Config + isClosed bool + workerPool *WorkerPool } -var metricGetRateLimitCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "gubernator_getratelimit_counter", - Help: "The count of getLocalRateLimit() calls. Label \"calltype\" may be \"local\" for calls handled by the same peer, \"forward\" for calls forwarded to another peer, or \"global\" for global rate limits.", -}, []string{"calltype"}) -var metricFuncTimeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ - Name: "gubernator_func_duration", - Help: "The timings of key functions in Gubernator in seconds.", - Objectives: map[float64]float64{ - 1: 0.001, - 0.99: 0.001, - 0.5: 0.01, - }, -}, []string{"name"}) -var metricAsyncRequestRetriesCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "gubernator_asyncrequest_retries", - Help: "The count of retries occurred in asyncRequest() forwarding a request to another peer.", -}, []string{"name"}) -var metricQueueLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "gubernator_queue_length", - Help: "The getRateLimitsBatch() queue length in PeerClient. 
This represents rate checks queued by for batching to a remote peer.", -}, []string{"peerAddr"}) -var metricCheckCounter = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "gubernator_check_counter", - Help: "The number of rate limits checked.", -}) -var metricOverLimitCounter = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "gubernator_over_limit_counter", - Help: "The number of rate limit checks that are over the limit.", -}) -var metricConcurrentChecks = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "gubernator_concurrent_checks_counter", - Help: "The number of concurrent GetRateLimits API calls.", -}) -var metricCheckErrorCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "gubernator_check_error_counter", - Help: "The number of errors while checking rate limits.", -}, []string{"error"}) -var metricBatchSendDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ - Name: "gubernator_batch_send_duration", - Help: "The timings of batch send operations to a remote peer.", - Objectives: map[float64]float64{ - 0.99: 0.001, - }, -}, []string{"peerAddr"}) -var metricCommandCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ - Name: "gubernator_command_counter", - Help: "The count of commands processed by each worker in WorkerPool.", -}, []string{"worker", "method"}) -var metricWorkerQueue = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "gubernator_worker_queue", - Help: "The count of requests queued up in WorkerPool.", -}, []string{"method"}) -var metricGlobalQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "gubernator_global_queue_length", - Help: "The count of requests queued up for global broadcast. This is only used for GetRateLimit requests using global behavior.", -}) +var ( + metricGetRateLimitCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_getratelimit_counter", + Help: "The count of getLocalRateLimit() calls. Label \"calltype\" may be \"local\" for calls handled by the same peer, \"forward\" for calls forwarded to another peer, or \"global\" for global rate limits.", + }, []string{"calltype"}) + metricFuncTimeDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "gubernator_func_duration", + Help: "The timings of key functions in Gubernator in seconds.", + Objectives: map[float64]float64{ + 1: 0.001, + 0.99: 0.001, + 0.5: 0.01, + }, + }, []string{"name"}) + metricOverLimitCounter = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gubernator_over_limit_counter", + Help: "The number of rate limit checks that are over the limit.", + }) + metricConcurrentChecks = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "gubernator_concurrent_checks_counter", + Help: "The number of concurrent GetRateLimits API calls.", + }) + metricCheckErrorCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_check_error_counter", + Help: "The number of errors while checking rate limits.", + }, []string{"error"}) + metricCommandCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_command_counter", + Help: "The count of commands processed by each worker in WorkerPool.", + }, []string{"worker", "method"}) + metricWorkerQueue = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "gubernator_worker_queue_length", + Help: "The count of requests queued up in WorkerPool.", + }, []string{"method", "worker"}) + + // Batch behavior. 
+ metricBatchSendRetries = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_batch_send_retries", + Help: "The count of retries occurred in asyncRequest() forwarding a request to another peer.", + }, []string{"name"}) + metricBatchQueueLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "gubernator_batch_queue_length", + Help: "The getRateLimitsBatch() queue length in PeerClient. This represents rate checks queued by for batching to a remote peer.", + }, []string{"peerAddr"}) + metricBatchSendDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ + Name: "gubernator_batch_send_duration", + Help: "The timings of batch send operations to a remote peer.", + Objectives: map[float64]float64{ + 0.99: 0.001, + }, + }, []string{"peerAddr"}) + + // Global behavior. + metricGlobalSendDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "gubernator_global_send_duration", + Help: "The duration of GLOBAL async sends in seconds.", + Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, + }) + metricBroadcastDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Name: "gubernator_broadcast_duration", + Help: "The duration of GLOBAL broadcasts to peers in seconds.", + Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001}, + }) + metricBroadcastCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "gubernator_broadcast_counter", + Help: "The count of broadcasts.", + }, []string{"condition"}) + metricGlobalQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "gubernator_global_queue_length", + Help: "The count of requests queued up for global broadcast. This is only used for GetRateLimit requests using global behavior.", + }) +) // NewV1Instance instantiate a single instance of a gubernator peer and register this // instance with the provided GRPCServer. 
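For context, the consolidated `var (...)` block above is made of ordinary client_golang collector vectors. Below is a minimal, self-contained sketch (not taken from this patch — the registry, the `:8080` address, and the `doWork` helper are illustrative stand-ins) of how such package-level collectors are registered and how the new per-method/per-worker queue gauge plus function timer pattern is typically used:

```go
package main

import (
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Stand-ins for the package-level collectors declared in gubernator.go.
var (
	workerQueue = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "gubernator_worker_queue_length",
		Help: "The count of requests queued up in WorkerPool.",
	}, []string{"method", "worker"})
	funcDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Name:       "gubernator_func_duration",
		Help:       "The timings of key functions in Gubernator in seconds.",
		Objectives: map[float64]float64{0.5: 0.01, 0.99: 0.001},
	}, []string{"name"})
)

// doWork mirrors the gauge/timer pattern the patch applies in workers.go:
// bump the per-method queue gauge for the lifetime of the call and time it.
func doWork(worker string) {
	gauge := workerQueue.WithLabelValues("GetRateLimit", worker)
	gauge.Inc()
	defer gauge.Dec()
	defer prometheus.NewTimer(funcDuration.WithLabelValues("doWork")).ObserveDuration()

	time.Sleep(10 * time.Millisecond) // placeholder for real work
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(workerQueue, funcDuration)

	doWork("worker_0")

	// Expose the registry for the Prometheus scrape job described in docs/prometheus.md.
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", nil)
}
```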
@@ -360,7 +376,7 @@ func (s *V1Instance) asyncRequest(ctx context.Context, req *AsyncReq) { if err != nil { if IsNotReady(err) { attempts++ - metricAsyncRequestRetriesCounter.WithLabelValues(req.Req.Name).Add(1) + metricBatchSendRetries.WithLabelValues(req.Req.Name).Add(1) req.Peer, err = s.GetPeer(ctx, req.Key) if err != nil { errPart := fmt.Sprintf("Error finding peer that owns rate limit '%s'", req.Key) @@ -575,16 +591,12 @@ func (s *V1Instance) getLocalRateLimit(ctx context.Context, r *RateLimitReq) (*R )) defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getLocalRateLimit")).ObserveDuration() - metricCheckCounter.Add(1) if HasBehavior(r.Behavior, Behavior_GLOBAL) { s.global.QueueUpdate(r) } resp, err := s.workerPool.GetRateLimit(ctx, r) - if isDeadlineExceeded(err) { - metricCheckErrorCounter.WithLabelValues("Timeout").Add(1) - } tracing.EndScope(ctx, err) return resp, err @@ -710,40 +722,38 @@ func (s *V1Instance) GetRegionPickers() map[string]PeerPicker { // Describe fetches prometheus metrics to be registered func (s *V1Instance) Describe(ch chan<- *prometheus.Desc) { - ch <- s.global.metricAsyncDuration.Desc() - ch <- s.global.metricBroadcastDuration.Desc() - s.global.metricBroadcastCounter.Describe(ch) - metricGetRateLimitCounter.Describe(ch) - metricFuncTimeDuration.Describe(ch) - metricAsyncRequestRetriesCounter.Describe(ch) - metricQueueLength.Describe(ch) - metricConcurrentChecks.Describe(ch) - metricCheckErrorCounter.Describe(ch) - metricOverLimitCounter.Describe(ch) - metricCheckCounter.Describe(ch) + metricBatchQueueLength.Describe(ch) metricBatchSendDuration.Describe(ch) + metricBatchSendRetries.Describe(ch) + metricBroadcastCounter.Describe(ch) + metricBroadcastDuration.Describe(ch) + metricCheckErrorCounter.Describe(ch) metricCommandCounter.Describe(ch) - metricWorkerQueue.Describe(ch) + metricConcurrentChecks.Describe(ch) + metricFuncTimeDuration.Describe(ch) + metricGetRateLimitCounter.Describe(ch) metricGlobalQueueLength.Describe(ch) + metricGlobalSendDuration.Describe(ch) + metricOverLimitCounter.Describe(ch) + metricWorkerQueue.Describe(ch) } // Collect fetches metrics from the server for use by prometheus func (s *V1Instance) Collect(ch chan<- prometheus.Metric) { - ch <- s.global.metricAsyncDuration - ch <- s.global.metricBroadcastDuration - s.global.metricBroadcastCounter.Collect(ch) - metricGetRateLimitCounter.Collect(ch) - metricFuncTimeDuration.Collect(ch) - metricAsyncRequestRetriesCounter.Collect(ch) - metricQueueLength.Collect(ch) - metricConcurrentChecks.Collect(ch) - metricCheckErrorCounter.Collect(ch) - metricOverLimitCounter.Collect(ch) - metricCheckCounter.Collect(ch) + metricBatchQueueLength.Collect(ch) metricBatchSendDuration.Collect(ch) + metricBatchSendRetries.Collect(ch) + metricBroadcastCounter.Collect(ch) + metricBroadcastDuration.Collect(ch) + metricCheckErrorCounter.Collect(ch) metricCommandCounter.Collect(ch) - metricWorkerQueue.Collect(ch) + metricConcurrentChecks.Collect(ch) + metricFuncTimeDuration.Collect(ch) + metricGetRateLimitCounter.Collect(ch) metricGlobalQueueLength.Collect(ch) + metricGlobalSendDuration.Collect(ch) + metricOverLimitCounter.Collect(ch) + metricWorkerQueue.Collect(ch) } // HasBehavior returns true if the provided behavior is set diff --git a/peer_client.go b/peer_client.go index 18a5efc8..ebb48a90 100644 --- a/peer_client.go +++ b/peer_client.go @@ -318,7 +318,7 @@ func (c *PeerClient) getPeerRateLimitsBatch(ctx context.Context, r *RateLimitReq // Enqueue the request to be sent peerAddr := 
c.Info().GRPCAddress - metricQueueLength.WithLabelValues(peerAddr).Set(float64(len(c.queue))) + metricBatchQueueLength.WithLabelValues(peerAddr).Set(float64(len(c.queue))) select { case c.queue <- &req: diff --git a/workers.go b/workers.go index 6481f957..07ba177f 100644 --- a/workers.go +++ b/workers.go @@ -260,11 +260,10 @@ func (p *WorkerPool) dispatch(worker *Worker) { // GetRateLimit sends a GetRateLimit request to worker pool. func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) (retval *RateLimitResp, reterr error) { // Delegate request to assigned channel based on request key. - timer1 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool.GetRateLimit_1")) worker := p.getWorker(rlRequest.HashKey()) - timer1.ObserveDuration() - - timer2 := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool.GetRateLimit_2")) + queueGauge := metricWorkerQueue.WithLabelValues("GetRateLimit", worker.name) + queueGauge.Inc() + defer queueGauge.Dec() handlerRequest := request{ ctx: ctx, resp: make(chan *response, 1), @@ -278,10 +277,8 @@ func (p *WorkerPool) GetRateLimit(ctx context.Context, rlRequest *RateLimitReq) case <-ctx.Done(): return nil, ctx.Err() } - timer2.ObserveDuration() // Wait for response. - defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("WorkerPool.GetRateLimit_3")).ObserveDuration() select { case handlerResponse := <-handlerRequest.resp: // Successfully read response. @@ -329,6 +326,9 @@ func (worker *Worker) handleGetRateLimit(ctx context.Context, req *RateLimitReq, // Read from persistent storage. Load into each appropriate worker's cache. // Workers are locked during this load operation to prevent race conditions. func (p *WorkerPool) Load(ctx context.Context) (err error) { + queueGauge := metricWorkerQueue.WithLabelValues("Load", "") + queueGauge.Inc() + defer queueGauge.Dec() ch, err := p.conf.Loader.Load() if err != nil { return errors.Wrap(err, "Error in loader.Load") @@ -448,6 +448,9 @@ MAIN: // Save all workers' caches to persistent storage. // Workers are locked during this store operation to prevent race conditions. func (p *WorkerPool) Store(ctx context.Context) (err error) { + queueGauge := metricWorkerQueue.WithLabelValues("Store", "") + queueGauge.Inc() + defer queueGauge.Dec() var wg sync.WaitGroup out := make(chan *CacheItem, 500) @@ -531,8 +534,11 @@ func (worker *Worker) handleStore(request workerStoreRequest, cache Cache) { // AddCacheItem adds an item to the worker's cache. func (p *WorkerPool) AddCacheItem(ctx context.Context, key string, item *CacheItem) (err error) { - respChan := make(chan workerAddCacheItemResponse) worker := p.getWorker(key) + queueGauge := metricWorkerQueue.WithLabelValues("AddCacheItem", worker.name) + queueGauge.Inc() + defer queueGauge.Dec() + respChan := make(chan workerAddCacheItemResponse) req := workerAddCacheItemRequest{ ctx: ctx, response: respChan, @@ -574,8 +580,11 @@ func (worker *Worker) handleAddCacheItem(request workerAddCacheItemRequest, cach // GetCacheItem gets item from worker's cache. 
func (p *WorkerPool) GetCacheItem(ctx context.Context, key string) (item *CacheItem, found bool, err error) { - respChan := make(chan workerGetCacheItemResponse) worker := p.getWorker(key) + queueGauge := metricWorkerQueue.WithLabelValues("GetCacheItem", worker.name) + queueGauge.Inc() + defer queueGauge.Dec() + respChan := make(chan workerGetCacheItemResponse) req := workerGetCacheItemRequest{ ctx: ctx, response: respChan, From 1202908a89d4faa6724866ebbcbed143673c50c6 Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 10 Oct 2023 14:53:11 -0400 Subject: [PATCH 29/32] Update golangci-lint config, fix lint errors. --- .golangci.yml | 112 +++++++++++++++++++++++++++++++++++++++ cmd/healthcheck/main.go | 1 + functional_test.go | 16 +++--- global.go | 2 +- tls.go | 2 +- tls_test.go | 9 ++-- workers_internal_test.go | 2 +- 7 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 .golangci.yml diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 00000000..844e1358 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,112 @@ +linters: + # Please, do not use `enable-all`: it's deprecated and will be removed soon. + # Inverted configuration with `enable-all` and `disable` is not scalable during updates of golangci-lint. + # Full list of linters - https://golangci-lint.run/usage/linters + disable-all: true + enable: + - bodyclose # https://github.com/timakin/bodyclose + # - gomodguard + - errcheck # Mandatory. Do not disable. + # - gocritic + - goimports + # - gosec + - gosimple + - govet + # - noctx + - nolintlint + - ineffassign # Mandatory. Do not disable. + - staticcheck # Mandatory. Do not disable. + - stylecheck + - typecheck + - unused + +# Other linters: +# - dogsled +# - dupl +# - exportloopref +# - exhaustive # e.g. missing cases in switch of type +# - funlen +# - gochecknoinits +# - gocognit +# - goconst +# - gocyclo +# - goerr113 +# - gofmt +# - goprintffuncname +# - lll +# - misspell +# - nakedret +# - nlreturn +# - prealloc +# - revive +# - rowserrcheck +# - stylecheck +# - unconvert +# - unparam + +linters-settings: + gocritic: + enabled-tags: + - diagnostic + - experimental + - opinionated + - performance + - style + disabled-checks: + - dupImport # https://github.com/go-critic/go-critic/issues/845 + - whyNoLint # checked by nolintlint linter + - hugeParam # TODO(vtopc): configure(80 bytes is probably not so much) and enable. + - rangeValCopy # TODO(vtopc): configure(disable for tests) and enable. + + errcheck: + # List of functions to exclude from checking, where each entry is a single function to exclude. + # See https://github.com/kisielk/errcheck#excluding-functions for details. + exclude-functions: + - (io.Closer).Close + - (io.ReadCloser).Close + + govet: + enable-all: true + disable: + - shadow + - fieldalignment + + gomodguard: + blocked: + # List of blocked modules. + # Default: [] + modules: + - github.com/golang/protobuf: + recommendations: + - google.golang.org/protobuf + reason: "see https://developers.google.com/protocol-buffers/docs/reference/go/faq#modules" + - github.com/pkg/errors: + recommendations: + - errors + - github.com/mailgun/errors + reason: "Deprecated" + + stylecheck: + # Select the Go version to target. + # Default: 1.13 + go: "1.19" + # https://staticcheck.io/docs/options#checks + checks: ["all"] + +issues: + # Maximum issues count per one linter. Set to 0 to disable. Default is 50. + max-issues-per-linter: 0 + + # Maximum count of issues with the same text. Set to 0 to disable. Default is 3. 
+ max-same-issues: 50 + +run: + # include test files or not, default is true + tests: true + + # Timeout for analysis, e.g. 30s, 5m. + # Default: 1m + timeout: 5m + + skip-dirs: + - googleapis diff --git a/cmd/healthcheck/main.go b/cmd/healthcheck/main.go index 4a26fb7c..a98259d9 100644 --- a/cmd/healthcheck/main.go +++ b/cmd/healthcheck/main.go @@ -35,6 +35,7 @@ func main() { if err != nil { panic(err) } + defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { diff --git a/functional_test.go b/functional_test.go index 20b64cb3..23ac62a2 100644 --- a/functional_test.go +++ b/functional_test.go @@ -26,7 +26,6 @@ import ( "strings" "testing" - "github.com/mailgun/gubernator/v2" guber "github.com/mailgun/gubernator/v2" "github.com/mailgun/gubernator/v2/cluster" "github.com/mailgun/holster/v4/clock" @@ -1156,6 +1155,7 @@ func TestGRPCGateway(t *testing.T) { address := cluster.GetRandomPeer(cluster.DataCenterNone).HTTPAddress resp, err := http.DefaultClient.Get("http://" + address + "/v1/HealthCheck") require.NoError(t, err) + defer resp.Body.Close() assert.Equal(t, http.StatusOK, resp.StatusCode) b, err := io.ReadAll(resp.Body) @@ -1196,12 +1196,12 @@ func TestGRPCGateway(t *testing.T) { // error. require.NoError(t, json.Unmarshal(b, &r)) require.Equal(t, 1, len(r.Responses)) - assert.Equal(t, gubernator.Status_UNDER_LIMIT, r.Responses[0].Status) + assert.Equal(t, guber.Status_UNDER_LIMIT, r.Responses[0].Status) } func TestGetPeerRateLimits(t *testing.T) { ctx := context.Background() - peerClient := gubernator.NewPeerClient(gubernator.PeerConfig{ + peerClient := guber.NewPeerClient(guber.PeerConfig{ Info: cluster.GetRandomPeer(cluster.DataCenterNone), }) @@ -1213,18 +1213,18 @@ func TestGetPeerRateLimits(t *testing.T) { for _, n := range testCases { t.Run(fmt.Sprintf("Batch size %d", n), func(t *testing.T) { // Build request. - req := &gubernator.GetPeerRateLimitsReq{ - Requests: make([]*gubernator.RateLimitReq, n), + req := &guber.GetPeerRateLimitsReq{ + Requests: make([]*guber.RateLimitReq, n), } for i := 0; i < n; i++ { - req.Requests[i] = &gubernator.RateLimitReq{ + req.Requests[i] = &guber.RateLimitReq{ Name: "Foobar", UniqueKey: fmt.Sprintf("%08x", i), Hits: 0, Limit: 1000 + int64(i), Duration: 1000, - Algorithm: gubernator.Algorithm_TOKEN_BUCKET, - Behavior: gubernator.Behavior_BATCHING, + Algorithm: guber.Algorithm_TOKEN_BUCKET, + Behavior: guber.Behavior_BATCHING, } } diff --git a/global.go b/global.go index 72a82aaf..630a4a14 100644 --- a/global.go +++ b/global.go @@ -38,7 +38,7 @@ type globalManager struct { func newGlobalManager(conf BehaviorConfig, instance *V1Instance) *globalManager { gm := globalManager{ - log: instance.log, + log: instance.log, asyncQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), broadcastQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit), instance: instance, diff --git a/tls.go b/tls.go index 7345e3d3..5a050479 100644 --- a/tls.go +++ b/tls.go @@ -269,7 +269,7 @@ func SetupTLS(conf *TLSConfig) error { } // error if neither was provided - // nolint:staticcheck // ignoring tlsCert.RootCAs.Subjects is deprecated because cert does not come from SystemCertPool. + //nolint:staticcheck // ignoring tlsCert.RootCAs.Subjects is deprecated because cert does not come from SystemCertPool. 
if len(clientPool.Subjects()) == 0 { return errors.New("client auth enabled, but no CA's provided") } diff --git a/tls_test.go b/tls_test.go index 0748c1de..38b203fa 100644 --- a/tls_test.go +++ b/tls_test.go @@ -323,18 +323,21 @@ func TestHTTPSClientAuth(t *testing.T) { // Test that a client without a cert can access /v1/HealthCheck at status address resp, err := clientWithoutCert.Do(reqNoClientCertRequired) require.NoError(t, err) + defer resp.Body.Close() b, err := io.ReadAll(resp.Body) require.NoError(t, err) assert.Equal(t, `{"status":"healthy","message":"","peer_count":1}`, strings.ReplaceAll(string(b), " ", "")) // Verify we get an error when we try to access existing HTTPListenAddress without cert - _, err = clientWithoutCert.Do(reqCertRequired) + resp2, err := clientWithoutCert.Do(reqCertRequired) assert.Error(t, err) + defer resp2.Body.Close() // Check that with a valid client cert we can access /v1/HealthCheck at existing HTTPListenAddress - resp, err = clientWithCert.Do(reqCertRequired) + resp3, err := clientWithCert.Do(reqCertRequired) require.NoError(t, err) - b, err = io.ReadAll(resp.Body) + defer resp3.Body.Close() + b, err = io.ReadAll(resp3.Body) require.NoError(t, err) assert.Equal(t, `{"status":"healthy","message":"","peer_count":1}`, strings.ReplaceAll(string(b), " ", "")) } diff --git a/workers_internal_test.go b/workers_internal_test.go index 9b15c73e..d7a2a6bf 100644 --- a/workers_internal_test.go +++ b/workers_internal_test.go @@ -38,7 +38,7 @@ func TestWorkersInternal(t *testing.T) { t.Run("getWorker()", func(t *testing.T) { const concurrency = 32 conf := &Config{ - Workers: concurrency, + Workers: concurrency, } require.NoError(t, conf.SetDefaults()) From f5350678e6e5cff9b10a0c51167f13284d3c7148 Mon Sep 17 00:00:00 2001 From: Shawn Poulson <92753637+Baliedge@users.noreply.github.com> Date: Tue, 10 Oct 2023 14:54:22 -0400 Subject: [PATCH 30/32] Update config.go Co-authored-by: Derrick J. Wippler --- config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.go b/config.go index 49f53042..a87d90a8 100644 --- a/config.go +++ b/config.go @@ -53,7 +53,7 @@ type BehaviorConfig struct { BatchWait time.Duration // The max number of requests we can batch into a single peer request BatchLimit int - // DisableBatching disables batching behavior. + // DisableBatching disables batching behavior for all ratelimits. DisableBatching bool // How long a non-owning peer should wait before syncing hits to the owning peer From 2d5782287689cb20ece4ecf62753a9bd40cb68bd Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 10 Oct 2023 15:01:32 -0400 Subject: [PATCH 31/32] Tidy tracing code. 
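This tidy-up collapses the two-step `span := trace.SpanFromContext(ctx); span.AddEvent(...)` calls into single expressions. A minimal, self-contained sketch of the resulting call style follows; the function name and attribute values are illustrative only, not taken from the diff:

```go
package main

import (
	"context"

	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
)

// recordMismatch shows the inlined form: fetch the span from the context and
// add the event in a single expression, with attributes attached, instead of
// binding an intermediate span variable first.
func recordMismatch(ctx context.Context, itemKey, hashKey string) {
	trace.SpanFromContext(ctx).AddEvent("Invalid cache item; key mismatch", trace.WithAttributes(
		attribute.String("itemKey", itemKey),
		attribute.String("hashKey", hashKey),
	))
}

func main() {
	// With no active span on the context this is a no-op, which is the same
	// graceful degradation the production code relies on.
	recordMismatch(context.Background(), "item-123", "hash-456")
}
```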
--- algorithms.go | 27 +++++++++------------------ gubernator.go | 2 ++ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/algorithms.go b/algorithms.go index 42826c70..c7e6315c 100644 --- a/algorithms.go +++ b/algorithms.go @@ -48,8 +48,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * if ok { if item.Value == nil { msgPart := "tokenBucket: Invalid cache item; Value is nil" - span := trace.SpanFromContext(ctx) - span.AddEvent(msgPart, trace.WithAttributes( + trace.SpanFromContext(ctx).AddEvent(msgPart, trace.WithAttributes( attribute.String("hashKey", hashKey), attribute.String("key", r.UniqueKey), attribute.String("name", r.Name), @@ -58,8 +57,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * ok = false } else if item.Key != hashKey { msgPart := "tokenBucket: Invalid cache item; key mismatch" - span := trace.SpanFromContext(ctx) - span.AddEvent(msgPart, trace.WithAttributes( + trace.SpanFromContext(ctx).AddEvent(msgPart, trace.WithAttributes( attribute.String("itemKey", item.Key), attribute.String("hashKey", hashKey), attribute.String("name", r.Name), @@ -93,8 +91,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * t, ok := item.Value.(*TokenBucketItem) if !ok { // Client switched algorithms; perhaps due to a migration? - span := trace.SpanFromContext(ctx) - span.AddEvent("Client switched algorithms; perhaps due to a migration?") + trace.SpanFromContext(ctx).AddEvent("Client switched algorithms; perhaps due to a migration?") c.Remove(hashKey) @@ -163,8 +160,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // If we are already at the limit. if rl.Remaining == 0 && r.Hits > 0 { - span := trace.SpanFromContext(ctx) - span.AddEvent("Already over the limit") + trace.SpanFromContext(ctx).AddEvent("Already over the limit") metricOverLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT t.Status = rl.Status @@ -173,8 +169,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // If requested hits takes the remainder. if t.Remaining == r.Hits { - span := trace.SpanFromContext(ctx) - span.AddEvent("At the limit") + trace.SpanFromContext(ctx).AddEvent("At the limit") t.Remaining = 0 rl.Remaining = 0 return rl, nil @@ -183,8 +178,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * // If requested is more than available, then return over the limit // without updating the cache. if r.Hits > t.Remaining { - span := trace.SpanFromContext(ctx) - span.AddEvent("Over the limit") + trace.SpanFromContext(ctx).AddEvent("Over the limit") metricOverLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT return rl, nil @@ -235,8 +229,7 @@ func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) // Client could be requesting that we always return OVER_LIMIT. 
if r.Hits > r.Limit { - span := trace.SpanFromContext(ctx) - span.AddEvent("Over the limit") + trace.SpanFromContext(ctx).AddEvent("Over the limit") metricOverLimitCounter.Add(1) rl.Status = Status_OVER_LIMIT rl.Remaining = r.Limit @@ -279,8 +272,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * if ok { if item.Value == nil { msgPart := "leakyBucket: Invalid cache item; Value is nil" - span := trace.SpanFromContext(ctx) - span.AddEvent(msgPart, trace.WithAttributes( + trace.SpanFromContext(ctx).AddEvent(msgPart, trace.WithAttributes( attribute.String("hashKey", hashKey), attribute.String("key", r.UniqueKey), attribute.String("name", r.Name), @@ -289,8 +281,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp * ok = false } else if item.Key != hashKey { msgPart := "leakyBucket: Invalid cache item; key mismatch" - span := trace.SpanFromContext(ctx) - span.AddEvent(msgPart, trace.WithAttributes( + trace.SpanFromContext(ctx).AddEvent(msgPart, trace.WithAttributes( attribute.String("itemKey", item.Key), attribute.String("hashKey", hashKey), attribute.String("name", r.Name), diff --git a/gubernator.go b/gubernator.go index 41aa5fea..9bb46807 100644 --- a/gubernator.go +++ b/gubernator.go @@ -588,6 +588,8 @@ func (s *V1Instance) getLocalRateLimit(ctx context.Context, r *RateLimitReq) (*R ctx = tracing.StartNamedScope(ctx, "V1Instance.getLocalRateLimit", trace.WithAttributes( attribute.String("ratelimit.key", r.UniqueKey), attribute.String("ratelimit.name", r.Name), + attribute.Int64("ratelimit.limit", r.Limit), + attribute.Int64("ratelimit.hits", r.Hits), )) defer prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getLocalRateLimit")).ObserveDuration() From d08fa23e7fb5299b8db52ac67da8d03245650b7a Mon Sep 17 00:00:00 2001 From: Shawn Poulson Date: Tue, 10 Oct 2023 15:10:19 -0400 Subject: [PATCH 32/32] Increase `GlobalSyncWait` to 500ms. --- config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.go b/config.go index a87d90a8..811bd1e9 100644 --- a/config.go +++ b/config.go @@ -124,7 +124,7 @@ func (c *Config) SetDefaults() error { setter.SetDefault(&c.Behaviors.GlobalTimeout, time.Millisecond*500) setter.SetDefault(&c.Behaviors.GlobalBatchLimit, maxBatchSize) - setter.SetDefault(&c.Behaviors.GlobalSyncWait, time.Microsecond*500) + setter.SetDefault(&c.Behaviors.GlobalSyncWait, time.Millisecond*500) setter.SetDefault(&c.LocalPicker, NewReplicatedConsistentHash(nil, defaultReplicas)) setter.SetDefault(&c.RegionPicker, NewRegionPicker(nil))
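For reference, the bumped `GlobalSyncWait` default only takes effect when the field is left unset by the operator. Below is a minimal sketch of that behavior, using a trimmed stand-in for `BehaviorConfig` and assuming the zero-value-only semantics of holster's `setter.SetDefault` as it is used throughout config.go:

```go
package main

import (
	"fmt"
	"time"

	"github.com/mailgun/holster/v4/setter"
)

// behaviorConfig is a pared-down stand-in for gubernator's BehaviorConfig,
// holding only the field this patch touches.
type behaviorConfig struct {
	GlobalSyncWait time.Duration
}

func main() {
	var conf behaviorConfig

	// SetDefault only assigns when the destination is its zero value, so an
	// unset GlobalSyncWait now becomes 500ms rather than 500µs.
	setter.SetDefault(&conf.GlobalSyncWait, 500*time.Millisecond)
	fmt.Println(conf.GlobalSyncWait) // 500ms

	// An operator-supplied value is left untouched.
	conf.GlobalSyncWait = 2 * time.Second
	setter.SetDefault(&conf.GlobalSyncWait, 500*time.Millisecond)
	fmt.Println(conf.GlobalSyncWait) // 2s — explicit setting wins
}
```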