This repository has been archived by the owner on Apr 19, 2024. It is now read-only.

PIP-2675: Disable Batching and Force Global configuration #190

Merged · 32 commits · Oct 10, 2023
Commits (32)
2c53654
Remove extraneous tracing detail. Tidy code.
Baliedge Sep 27, 2023
b6c7af4
Add `DisableBatching` config. Disable goroutines used for batching w…
Baliedge Sep 27, 2023
dbcdc2e
Log worker count.
Baliedge Sep 28, 2023
4f927d2
Fix logging.
Baliedge Sep 28, 2023
595eb67
PIP-2675: Update golangci-lint.
Baliedge Sep 28, 2023
d2e07af
PIP-2675: Better `WorkerPool` request hashing.
Baliedge Sep 28, 2023
f6046d1
Fix logging.
Baliedge Sep 28, 2023
9758b98
Reduce tracing detail.
Baliedge Sep 29, 2023
831e4cf
Convert some metrics from summary to gauge.
Baliedge Sep 29, 2023
2c93a90
Fix error metric on timeout.
Baliedge Sep 29, 2023
83d3f0f
Remove unnecessary buffer from `WorkerPool` command channels.
Baliedge Sep 29, 2023
3b8e882
Allocation optimization.
Baliedge Sep 29, 2023
873e541
Add metric around `handleGetRateLimit`.
Baliedge Sep 29, 2023
66e8943
Fix nil reference panic.
Baliedge Sep 29, 2023
3bad7e8
Fix unit test.
Baliedge Sep 29, 2023
2573caf
Add metric `gubernator_command_counter` to track worker activity in `…
Baliedge Oct 2, 2023
d7bc3a9
Try `WorkerPool2` that uses multiple workers with a single cache for …
Baliedge Oct 2, 2023
ab872fe
PIP-2675: Add metric `gubernator_worker_queue` to track queuing in `W…
Baliedge Oct 3, 2023
f8ace5c
Simplify metric `gubernator_concurrent_checks_counter`.
Baliedge Oct 3, 2023
701d4dd
Add `ForceGlobal` config option.
Baliedge Oct 3, 2023
01f0441
Add metric for tracking global queue length.
Baliedge Oct 3, 2023
e1e74fe
Tidy trace attributes in `getLocalRateLimit`/`getGlobalRateLimit`.
Baliedge Oct 6, 2023
fdbca47
Add metric for global broadcast counts.
Baliedge Oct 6, 2023
5aaf36e
Configuration by env vars.
Baliedge Oct 9, 2023
a963c28
Try switching back to original `WorkerPool`.
Baliedge Oct 10, 2023
69f7e5a
Tidy code.
Baliedge Oct 10, 2023
348e44b
Remove experimental `WorkerPool2`.
Baliedge Oct 10, 2023
2d13aa2
Clean up metrics. Rename, remove, add. Document in prometheus.md.
Baliedge Oct 10, 2023
1202908
Update golangci-lint config, fix lint errors.
Baliedge Oct 10, 2023
f535067
Update config.go
Baliedge Oct 10, 2023
2d57822
Tidy tracing code.
Baliedge Oct 10, 2023
d08fa23
Increase `GlobalSyncWait` to 500ms.
Baliedge Oct 10, 2023
11 changes: 6 additions & 5 deletions Makefile
@@ -1,14 +1,15 @@
.DEFAULT_GOAL := release
VERSION=$(shell cat version)
LDFLAGS="-X main.Version=$(VERSION)"
GOLINT = $(GOPATH)/bin/golangci-lint
GOLANGCI_LINT = $(GOPATH)/bin/golangci-lint
GOLANGCI_LINT_VERSION = 1.54.2

$(GOLINT):
curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GOPATH)/bin v1.54.1
$(GOLANGCI_LINT):
curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(GOPATH)/bin $(GOLANGCI_LINT_VERSION)

.PHONY: lint
lint: $(GOLINT)
$(GOLINT) run --out-format tab --path-prefix `pwd`
lint: $(GOLANGCI_LINT)
$(GOLANGCI_LINT) run

.PHONY: test
test:
26 changes: 10 additions & 16 deletions algorithms.go
@@ -20,7 +20,6 @@ import (
"context"

"github.com/mailgun/holster/v4/clock"
"github.com/mailgun/holster/v4/tracing"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"go.opentelemetry.io/otel/attribute"
@@ -29,9 +28,6 @@

// Implements token bucket algorithm for rate limiting. https://en.wikipedia.org/wiki/Token_bucket
func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) {
ctx = tracing.StartNamedScopeDebug(ctx, "tokenBucket")
defer func() { tracing.EndScope(ctx, err) }()
span := trace.SpanFromContext(ctx)

tokenBucketTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("tokenBucket"))
defer tokenBucketTimer.ObserveDuration()
@@ -52,6 +48,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
if ok {
if item.Value == nil {
msgPart := "tokenBucket: Invalid cache item; Value is nil"
span := trace.SpanFromContext(ctx)
span.AddEvent(msgPart, trace.WithAttributes(
attribute.String("hashKey", hashKey),
attribute.String("key", r.UniqueKey),
@@ -61,6 +58,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
ok = false
} else if item.Key != hashKey {
msgPart := "tokenBucket: Invalid cache item; key mismatch"
span := trace.SpanFromContext(ctx)
span.AddEvent(msgPart, trace.WithAttributes(
attribute.String("itemKey", item.Key),
attribute.String("hashKey", hashKey),
@@ -95,6 +93,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
t, ok := item.Value.(*TokenBucketItem)
if !ok {
// Client switched algorithms; perhaps due to a migration?
span := trace.SpanFromContext(ctx)
span.AddEvent("Client switched algorithms; perhaps due to a migration?")

c.Remove(hashKey)
@@ -125,6 +124,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *

// If the duration config changed, update the new ExpireAt.
if t.Duration != r.Duration {
span := trace.SpanFromContext(ctx)
span.AddEvent("Duration changed")
expire := t.CreatedAt + r.Duration
if HasBehavior(r.Behavior, Behavior_DURATION_IS_GREGORIAN) {
@@ -163,6 +163,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *

// If we are already at the limit.
if rl.Remaining == 0 && r.Hits > 0 {
span := trace.SpanFromContext(ctx)
span.AddEvent("Already over the limit")
metricOverLimitCounter.Add(1)
rl.Status = Status_OVER_LIMIT
@@ -172,6 +173,7 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *

// If requested hits takes the remainder.
if t.Remaining == r.Hits {
span := trace.SpanFromContext(ctx)
span.AddEvent("At the limit")
t.Remaining = 0
rl.Remaining = 0
@@ -181,13 +183,13 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
// If requested is more than available, then return over the limit
// without updating the cache.
if r.Hits > t.Remaining {
span := trace.SpanFromContext(ctx)
span.AddEvent("Over the limit")
metricOverLimitCounter.Add(1)
rl.Status = Status_OVER_LIMIT
return rl, nil
}

span.AddEvent("Under the limit")
t.Remaining -= r.Hits
rl.Remaining = t.Remaining
return rl, nil
@@ -199,10 +201,6 @@ func tokenBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *

// Called by tokenBucket() when adding a new item in the store.
func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) {
ctx = tracing.StartNamedScopeDebug(ctx, "tokenBucketNewItem")
defer func() { tracing.EndScope(ctx, err) }()
span := trace.SpanFromContext(ctx)

now := MillisecondNow()
expire := now + r.Duration

@@ -237,6 +235,7 @@ func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq)

// Client could be requesting that we always return OVER_LIMIT.
if r.Hits > r.Limit {
span := trace.SpanFromContext(ctx)
span.AddEvent("Over the limit")
metricOverLimitCounter.Add(1)
rl.Status = Status_OVER_LIMIT
@@ -255,10 +254,6 @@ func tokenBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq)

// Implements leaky bucket algorithm for rate limiting https://en.wikipedia.org/wiki/Leaky_bucket
func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) {
ctx = tracing.StartNamedScopeDebug(ctx, "leakyBucket")
defer func() { tracing.EndScope(ctx, err) }()
span := trace.SpanFromContext(ctx)

leakyBucketTimer := prometheus.NewTimer(metricFuncTimeDuration.WithLabelValues("V1Instance.getRateLimit_leakyBucket"))
defer leakyBucketTimer.ObserveDuration()

@@ -284,6 +279,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
if ok {
if item.Value == nil {
msgPart := "leakyBucket: Invalid cache item; Value is nil"
span := trace.SpanFromContext(ctx)
span.AddEvent(msgPart, trace.WithAttributes(
attribute.String("hashKey", hashKey),
attribute.String("key", r.UniqueKey),
@@ -293,6 +289,7 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *
ok = false
} else if item.Key != hashKey {
msgPart := "leakyBucket: Invalid cache item; key mismatch"
span := trace.SpanFromContext(ctx)
span.AddEvent(msgPart, trace.WithAttributes(
attribute.String("itemKey", item.Key),
attribute.String("hashKey", hashKey),
@@ -425,9 +422,6 @@ func leakyBucket(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *

// Called by leakyBucket() when adding a new item in the store.
func leakyBucketNewItem(ctx context.Context, s Store, c Cache, r *RateLimitReq) (resp *RateLimitResp, err error) {
ctx = tracing.StartNamedScopeDebug(ctx, "leakyBucketNewItem")
defer func() { tracing.EndScope(ctx, err) }()

now := MillisecondNow()
duration := r.Duration
rate := float64(duration) / float64(r.Limit)
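The thrust of the algorithms.go changes above: the per-call tracing scopes (`tracing.StartNamedScopeDebug` / `tracing.EndScope`) are removed, and the caller's span is now fetched from the context only on the branches that actually record an event. A minimal, self-contained sketch of that pattern — the names here (`checkLimit`) are illustrative and not part of gubernator:

```go
package main

import (
	"context"

	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
)

// checkLimit is a toy stand-in for tokenBucket/leakyBucket: no span scope is
// started here; the existing span is only looked up on the path that actually
// records an event.
func checkLimit(ctx context.Context, remaining, hits int64) bool {
	if hits > remaining {
		// Rare path: fetch the caller's span from the context and annotate it.
		span := trace.SpanFromContext(ctx)
		span.AddEvent("Over the limit", trace.WithAttributes(
			attribute.Int64("hits", hits),
			attribute.Int64("remaining", remaining),
		))
		return false
	}
	return true // Hot path: no tracing work at all.
}

func main() {
	_ = checkLimit(context.Background(), 10, 3)
}
```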
8 changes: 8 additions & 0 deletions config.go
@@ -53,13 +53,17 @@ type BehaviorConfig struct {
BatchWait time.Duration
// The max number of requests we can batch into a single peer request
BatchLimit int
// DisableBatching disables batching behavior.
DisableBatching bool

// How long a non-owning peer should wait before syncing hits to the owning peer
GlobalSyncWait time.Duration
// How long we should wait for global sync responses from peers
GlobalTimeout time.Duration
// The max number of global updates we can batch into a single peer request
GlobalBatchLimit int
// ForceGlobal forces global mode on all rate limit checks.
ForceGlobal bool
}

// Config for a gubernator instance
@@ -125,7 +129,9 @@ func (c *Config) SetDefaults() error {
setter.SetDefault(&c.LocalPicker, NewReplicatedConsistentHash(nil, defaultReplicas))
setter.SetDefault(&c.RegionPicker, NewRegionPicker(nil))

setter.SetDefault(&c.CacheSize, 50_000)
setter.SetDefault(&c.Workers, runtime.NumCPU())
setter.SetDefault(&c.Logger, logrus.New().WithField("category", "gubernator"))

if c.CacheFactory == nil {
c.CacheFactory = func(maxSize int) Cache {
@@ -333,10 +339,12 @@ func SetupDaemonConfig(logger *logrus.Logger, configFile string) (DaemonConfig,
setter.SetDefault(&conf.Behaviors.BatchTimeout, getEnvDuration(log, "GUBER_BATCH_TIMEOUT"))
setter.SetDefault(&conf.Behaviors.BatchLimit, getEnvInteger(log, "GUBER_BATCH_LIMIT"))
setter.SetDefault(&conf.Behaviors.BatchWait, getEnvDuration(log, "GUBER_BATCH_WAIT"))
setter.SetDefault(&conf.Behaviors.DisableBatching, getEnvBool(log, "GUBER_DISABLE_BATCHING"))

setter.SetDefault(&conf.Behaviors.GlobalTimeout, getEnvDuration(log, "GUBER_GLOBAL_TIMEOUT"))
setter.SetDefault(&conf.Behaviors.GlobalBatchLimit, getEnvInteger(log, "GUBER_GLOBAL_BATCH_LIMIT"))
setter.SetDefault(&conf.Behaviors.GlobalSyncWait, getEnvDuration(log, "GUBER_GLOBAL_SYNC_WAIT"))
setter.SetDefault(&conf.Behaviors.ForceGlobal, getEnvBool(log, "GUBER_FORCE_GLOBAL"))

// TLS Config
if anyHasPrefix("GUBER_TLS_", os.Environ()) {
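config.go gains two new `BehaviorConfig` knobs, each also settable through an environment variable (`GUBER_DISABLE_BATCHING`, `GUBER_FORCE_GLOBAL`). A minimal sketch of enabling them programmatically; the import path and the placement of `Behaviors` on `Config` are assumptions inferred from this diff, not verified against the full tree:

```go
package main

import (
	"time"

	gubernator "github.com/mailgun/gubernator/v2" // import path assumed
)

func main() {
	conf := gubernator.Config{
		Behaviors: gubernator.BehaviorConfig{
			// New in this PR: skip the batching goroutines and forward each
			// peer request individually.
			DisableBatching: true,
			// New in this PR: treat every rate limit check as GLOBAL.
			ForceGlobal: true,
			// The final commit in this PR bumps GlobalSyncWait to 500ms.
			GlobalSyncWait: 500 * time.Millisecond,
		},
	}
	if err := conf.SetDefaults(); err != nil { // SetDefaults appears in this diff
		panic(err)
	}
	_ = conf // hand off to the V1Instance / daemon constructor as usual
}
```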
36 changes: 21 additions & 15 deletions global.go
@@ -18,9 +18,7 @@ package gubernator

import (
"context"
"time"

"github.com/mailgun/holster/v4/clock"
"github.com/mailgun/holster/v4/ctxutil"
"github.com/mailgun/holster/v4/syncutil"
"github.com/prometheus/client_golang/prometheus"
@@ -37,23 +35,28 @@ type globalManager struct {
log FieldLogger
instance *V1Instance

asyncMetrics prometheus.Summary
broadcastMetrics prometheus.Summary
metricAsyncDuration prometheus.Summary
metricBroadcastDuration prometheus.Summary
metricBroadcastCounter *prometheus.CounterVec
}

func newGlobalManager(conf BehaviorConfig, instance *V1Instance) *globalManager {
gm := globalManager{
log: instance.log,
asyncMetrics: prometheus.NewSummary(prometheus.SummaryOpts{
Help: "The duration of GLOBAL async sends in seconds.",
metricAsyncDuration: prometheus.NewSummary(prometheus.SummaryOpts{
Name: "gubernator_async_durations",
Help: "The duration of GLOBAL async sends in seconds.",
Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001},
}),
broadcastMetrics: prometheus.NewSummary(prometheus.SummaryOpts{
Help: "The duration of GLOBAL broadcasts to peers in seconds.",
metricBroadcastDuration: prometheus.NewSummary(prometheus.SummaryOpts{
Name: "gubernator_broadcast_durations",
Help: "The duration of GLOBAL broadcasts to peers in seconds.",
Objectives: map[float64]float64{0.5: 0.05, 0.99: 0.001},
}),
metricBroadcastCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "gubernator_broadcast_counter",
Help: "The count of broadcasts.",
}, []string{"condition"}),
asyncQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit),
broadcastQueue: make(chan *RateLimitReq, conf.GlobalBatchLimit),
instance: instance,
@@ -123,8 +126,8 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) {
client *PeerClient
req GetPeerRateLimitsReq
}
defer prometheus.NewTimer(gm.metricAsyncDuration).ObserveDuration()
peerRequests := make(map[string]*pair)
start := clock.Now()

// Assign each request to a peer
for _, r := range hits {
@@ -157,7 +160,6 @@ func (gm *globalManager) sendHits(hits map[string]*RateLimitReq) {
continue
}
}
gm.asyncMetrics.Observe(time.Since(start).Seconds())
}

// runBroadcasts collects status changes for global rate limits and broadcasts the changes to each peer in the cluster.
@@ -171,7 +173,8 @@ func (gm *globalManager) runBroadcasts() {
updates[r.HashKey()] = r

// Send the hits if we reached our batch limit
if len(updates) == gm.conf.GlobalBatchLimit {
if len(updates) >= gm.conf.GlobalBatchLimit {
gm.metricBroadcastCounter.WithLabelValues("queue_full").Inc()
gm.broadcastPeers(context.Background(), updates)
updates = make(map[string]*RateLimitReq)
return true
@@ -185,8 +188,11 @@ func (gm *globalManager) runBroadcasts() {

case <-interval.C:
if len(updates) != 0 {
gm.metricBroadcastCounter.WithLabelValues("timer").Inc()
gm.broadcastPeers(context.Background(), updates)
updates = make(map[string]*RateLimitReq)
} else {
metricGlobalQueueLength.Set(0)
}
case <-done:
return false
@@ -197,8 +203,10 @@ func (gm *globalManager) runBroadcasts() {

// broadcastPeers broadcasts global rate limit statuses to all other peers
func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string]*RateLimitReq) {
defer prometheus.NewTimer(gm.metricBroadcastDuration).ObserveDuration()
var req UpdatePeerGlobalsReq
start := clock.Now()

metricGlobalQueueLength.Set(float64(len(updates)))

for _, r := range updates {
// Copy the original since we are removing the GLOBAL behavior
@@ -227,7 +235,7 @@ func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string]
continue
}

ctx, cancel := ctxutil.WithTimeout(context.Background(), gm.conf.GlobalTimeout)
ctx, cancel := ctxutil.WithTimeout(ctx, gm.conf.GlobalTimeout)
_, err := peer.UpdatePeerGlobals(ctx, &req)
cancel()

@@ -239,8 +247,6 @@ func (gm *globalManager) broadcastPeers(ctx context.Context, updates map[string]
continue
}
}

gm.broadcastMetrics.Observe(time.Since(start).Seconds())
}

func (gm *globalManager) Close() {
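One behavioral detail in broadcastPeers worth calling out: the per-peer timeout is now derived from the ctx argument rather than from context.Background(), so cancelling the broadcast context also cancels in-flight UpdatePeerGlobals calls. A stand-alone sketch of that pattern using plain context.WithTimeout (holster's ctxutil.WithTimeout behaves the same way for cancellation purposes); all names below are illustrative, not gubernator's:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// updatePeer stands in for PeerClient.UpdatePeerGlobals: it respects ctx cancellation.
func updatePeer(ctx context.Context, peer string) error {
	select {
	case <-time.After(50 * time.Millisecond): // pretend RPC latency
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func broadcast(ctx context.Context, peers []string, timeout time.Duration) {
	for _, p := range peers {
		// Derive from the caller's ctx (as this PR now does), not
		// context.Background(), so cancelling the broadcast also cancels
		// the remaining per-peer calls.
		pctx, cancel := context.WithTimeout(ctx, timeout)
		err := updatePeer(pctx, p)
		cancel()
		if err != nil {
			fmt.Printf("peer %s: %v\n", p, err)
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() { time.Sleep(60 * time.Millisecond); cancel() }()
	broadcast(ctx, []string{"a", "b", "c"}, 200*time.Millisecond)
}
```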
9 changes: 9 additions & 0 deletions go.mod
@@ -7,6 +7,7 @@ require (
github.com/davecgh/go-spew v1.1.1
github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3
github.com/hashicorp/memberlist v0.5.0
github.com/mailgun/errors v0.1.5
github.com/mailgun/holster/v4 v4.14.2
github.com/miekg/dns v1.1.50
github.com/pkg/errors v0.9.1
@@ -23,6 +24,7 @@
go.opentelemetry.io/otel/trace v1.16.0
golang.org/x/net v0.10.0
golang.org/x/time v0.3.0
google.golang.org/api v0.108.0
google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4
google.golang.org/grpc v1.55.0
google.golang.org/protobuf v1.30.0
@@ -33,6 +35,8 @@
)

require (
cloud.google.com/go/compute v1.18.0 // indirect
cloud.google.com/go/compute/metadata v0.2.3 // indirect
github.com/armon/go-metrics v0.4.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
@@ -43,10 +47,14 @@ require (
github.com/go-logr/logr v1.2.4 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/btree v1.1.1 // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/gofuzz v1.1.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.2.1 // indirect
github.com/googleapis/gax-go/v2 v2.7.0 // indirect
github.com/googleapis/gnostic v0.5.5 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-immutable-radix v1.3.1 // indirect
@@ -68,6 +76,7 @@ require (
github.com/uptrace/opentelemetry-go-extra/otelutil v0.2.1 // indirect
go.etcd.io/etcd/api/v3 v3.5.5 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.5 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.42.0 // indirect
go.opentelemetry.io/otel/exporters/jaeger v1.16.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.16.0 // indirect