Skip to content

Commit 770b234

Browse files
committed
[BUGFIX] add Health Check for Range over gRPC Connection Loop
Signed-off-by: kpango <[email protected]>
1 parent 6065fd9 commit 770b234

File tree

8 files changed

+51
-31
lines changed

8 files changed

+51
-31
lines changed

internal/net/grpc/client.go

+35-19
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ type Client interface {
9090
GetCallOption() []CallOption
9191
GetBackoff() backoff.Backoff
9292
SetDisableResolveDNSAddr(addr string, disabled bool)
93-
ConnectedAddrs() []string
93+
ConnectedAddrs(context.Context) []string
9494
Close(ctx context.Context) error
9595
}
9696

@@ -249,7 +249,7 @@ func (g *gRPCClient) StartConnectionMonitor(ctx context.Context) (<-chan error,
249249
return ctx.Err()
250250
case <-prTick.C:
251251
if g.enablePoolRebalance {
252-
err = g.rangeConns(func(addr string, p pool.Conn) bool {
252+
err = g.rangeConns(ctx, func(addr string, p pool.Conn) bool {
253253
// if addr or pool is nil or empty the registration of conns is invalid let's disconnect them
254254
if addr == "" || p == nil {
255255
disconnectTargets = append(disconnectTargets, addr)
@@ -286,7 +286,7 @@ func (g *gRPCClient) StartConnectionMonitor(ctx context.Context) (<-chan error,
286286
})
287287
}
288288
case <-hcTick.C:
289-
err = g.rangeConns(func(addr string, p pool.Conn) bool {
289+
err = g.rangeConns(ctx, func(addr string, p pool.Conn) bool {
290290
// if addr or pool is nil or empty the registration of conns is invalid let's disconnect them
291291
if addr == "" || p == nil {
292292
disconnectTargets = append(disconnectTargets, addr)
@@ -415,7 +415,7 @@ func (g *gRPCClient) Range(
415415
if g.conns.Len() == 0 {
416416
return errors.ErrGRPCClientConnNotFound("*")
417417
}
418-
err = g.rangeConns(func(addr string, p pool.Conn) bool {
418+
err = g.rangeConns(ctx, func(addr string, p pool.Conn) bool {
419419
ssctx, sspan := trace.StartSpan(sctx, apiName+"/Client.Range/"+addr)
420420
defer func() {
421421
if sspan != nil {
@@ -478,7 +478,7 @@ func (g *gRPCClient) RangeConcurrent(
478478
if g.conns.Len() == 0 {
479479
return errors.ErrGRPCClientConnNotFound("*")
480480
}
481-
err = g.rangeConns(func(addr string, p pool.Conn) bool {
481+
err = g.rangeConns(ctx, func(addr string, p pool.Conn) bool {
482482
eg.Go(safety.RecoverFunc(func() (err error) {
483483
ssctx, sspan := trace.StartSpan(egctx, apiName+"/Client.RangeConcurrent/"+addr)
484484
defer func() {
@@ -565,7 +565,7 @@ func (g *gRPCClient) OrderedRange(
565565
return nil
566566
default:
567567
p, ok := g.conns.Load(addr)
568-
if !ok || p == nil {
568+
if !ok || p == nil || !p.IsHealthy(sctx) {
569569
g.crl.Store(addr, true)
570570
log.Warnf("gRPCClient.OrderedRange operation failed, gRPC connection pool for %s is invalid,\terror: %v", addr, errors.ErrGRPCClientConnNotFound(addr))
571571
continue
@@ -634,7 +634,7 @@ func (g *gRPCClient) OrderedRangeConcurrent(
634634
addr := order
635635
eg.Go(safety.RecoverFunc(func() (err error) {
636636
p, ok := g.conns.Load(addr)
637-
if !ok || p == nil {
637+
if !ok || p == nil || !p.IsHealthy(sctx) {
638638
g.crl.Store(addr, true)
639639
log.Warnf("gRPCClient.OrderedRangeConcurrent operation failed, gRPC connection pool for %s is invalid,\terror: %v", addr, errors.ErrGRPCClientConnNotFound(addr))
640640
return nil
@@ -701,7 +701,7 @@ func (g *gRPCClient) RoundRobin(
701701
}
702702

703703
do := func() (data any, err error) {
704-
cerr := g.rangeConns(func(addr string, p pool.Conn) bool {
704+
cerr := g.rangeConns(ctx, func(addr string, p pool.Conn) bool {
705705
select {
706706
case <-ctx.Done():
707707
err = ctx.Err()
@@ -879,14 +879,14 @@ func (g *gRPCClient) connectWithBackoff(
879879
errors.Is(err, context.DeadlineExceeded) {
880880
return nil, false, err
881881
}
882-
return nil, err != nil, err
882+
return nil, p.IsHealthy(ctx), err
883883
}
884884
status.Log(st.Code(), err)
885885
switch st.Code() {
886886
case codes.Internal,
887887
codes.Unavailable,
888888
codes.ResourceExhausted:
889-
return nil, err != nil, err
889+
return nil, p.IsHealthy(ctx), err
890890
}
891891
return nil, false, err
892892
}
@@ -1066,7 +1066,7 @@ func (g *gRPCClient) Disconnect(ctx context.Context, addr string) error {
10661066
atomic.AddUint64(&g.clientCount, ^uint64(0))
10671067
if p != nil {
10681068
log.Debugf("gRPC client connection pool addr = %s will disconnect soon...", addr)
1069-
return nil, p.Disconnect()
1069+
return nil, p.Disconnect(ctx)
10701070
}
10711071
return nil, nil
10721072
})
@@ -1085,10 +1085,10 @@ func (g *gRPCClient) Disconnect(ctx context.Context, addr string) error {
10851085
return nil
10861086
}
10871087

1088-
func (g *gRPCClient) ConnectedAddrs() (addrs []string) {
1088+
func (g *gRPCClient) ConnectedAddrs(ctx context.Context) (addrs []string) {
10891089
addrs = make([]string, 0, g.conns.Len())
1090-
err := g.rangeConns(func(addr string, p pool.Conn) bool {
1091-
if p != nil && p.IsHealthy(context.Background()) {
1090+
err := g.rangeConns(ctx, func(addr string, p pool.Conn) bool {
1091+
if p != nil && p.IsHealthy(ctx) {
10921092
addrs = append(addrs, addr)
10931093
}
10941094
return true
@@ -1104,18 +1104,34 @@ func (g *gRPCClient) Close(ctx context.Context) (err error) {
11041104
g.stopMonitor()
11051105
}
11061106
g.conns.Range(func(addr string, p pool.Conn) bool {
1107-
derr := g.Disconnect(ctx, addr)
1108-
if derr != nil && !errors.Is(derr, errors.ErrGRPCClientConnNotFound(addr)) {
1109-
err = errors.Join(err, derr)
1107+
select {
1108+
case <-ctx.Done():
1109+
return false
1110+
default:
1111+
derr := g.Disconnect(ctx, addr)
1112+
if derr != nil && !errors.Is(derr, errors.ErrGRPCClientConnNotFound(addr)) {
1113+
err = errors.Join(err, derr)
1114+
}
1115+
return true
11101116
}
1111-
return true
11121117
})
11131118
return err
11141119
}
11151120

1116-
func (g *gRPCClient) rangeConns(fn func(addr string, p pool.Conn) bool) error {
1121+
func (g *gRPCClient) rangeConns(ctx context.Context, fn func(addr string, p pool.Conn) bool) error {
11171122
var cnt int
11181123
g.conns.Range(func(addr string, p pool.Conn) bool {
1124+
if p == nil || !p.IsHealthy(ctx) {
1125+
pc, err := p.Connect(ctx)
1126+
if pc == nil || err != nil || !pc.IsHealthy(ctx) {
1127+
if pc != nil {
1128+
pc.Disconnect(ctx)
1129+
}
1130+
log.Debugf("Unhealthy connection detected for %s during gRPC Connection Range over Loop:\t%s", addr, p.String())
1131+
return true
1132+
}
1133+
p = pc
1134+
}
11191135
cnt++
11201136
return fn(addr, p)
11211137
})

internal/net/grpc/client_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -3107,7 +3107,7 @@ package grpc
31073107
// stopMonitor: test.fields.stopMonitor,
31083108
// }
31093109
//
3110-
// gotAddrs := g.ConnectedAddrs()
3110+
// gotAddrs := g.ConnectedAddrs(context.Background)
31113111
// if err := checkFunc(test.want, gotAddrs); err != nil {
31123112
// tt.Errorf("error = %v", err)
31133113
// }
@@ -3303,6 +3303,7 @@ package grpc
33033303
//
33043304
// func Test_gRPCClient_rangeConns(t *testing.T) {
33053305
// type args struct {
3306+
// ctx context.Context
33063307
// fn func(addr string, p pool.Conn) bool
33073308
// }
33083309
// type fields struct {

internal/net/grpc/pool/pool.go

+6-7
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ type (
4545

4646
type Conn interface {
4747
Connect(context.Context) (Conn, error)
48-
Disconnect() error
48+
Disconnect(context.Context) error
4949
Do(ctx context.Context, f func(*ClientConn) error) error
50-
Get(ctx context.Context) (conn *ClientConn, ok bool)
50+
Get(context.Context) (conn *ClientConn, ok bool)
5151
IsHealthy(context.Context) bool
5252
IsIPConn() bool
5353
Len() uint64
@@ -437,8 +437,7 @@ func (p *pool) singleTargetConnect(ctx context.Context) (c Conn, err error) {
437437
return p, nil
438438
}
439439

440-
func (p *pool) Disconnect() (err error) {
441-
ctx := context.Background()
440+
func (p *pool) Disconnect(ctx context.Context) (err error) {
442441
p.closing.Store(true)
443442
defer p.closing.Store(false)
444443
emap := make(map[string]error, p.len())
@@ -618,7 +617,7 @@ func (p *pool) getHealthyConn(
618617
if retry <= 0 || retry > math.MaxUint64-pl || pl <= 0 {
619618
if p.isIP {
620619
log.Warnf("failed to find gRPC IP connection pool for %s.\tlen(pool): %d,\tretried: %d,\tseems IP %s is unhealthy will going to disconnect...", p.addr, pl, cnt, p.addr)
621-
if err := p.Disconnect(); err != nil {
620+
if err := p.Disconnect(ctx); err != nil {
622621
log.Debugf("failed to disconnect gRPC IP direct connection for %s,\terr: %v", p.addr, err)
623622
}
624623
return 0, nil, false
@@ -757,8 +756,8 @@ func (p *pool) String() (str string) {
757756

758757
func (pc *poolConn) Close(ctx context.Context, delay time.Duration) error {
759758
tdelay := delay / 10
760-
if tdelay < time.Millisecond*200 {
761-
tdelay = time.Millisecond * 200
759+
if tdelay < time.Millisecond*5 {
760+
tdelay = time.Millisecond * 5
762761
} else if tdelay > time.Minute {
763762
tdelay = time.Second * 5
764763
}

internal/net/grpc/pool/pool_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2344,7 +2344,7 @@ package pool
23442344
// reconnectHash: test.fields.reconnectHash,
23452345
// }
23462346
//
2347-
// err := p.Disconnect()
2347+
// err := p.Disconnect(context.Background)
23482348
// if err := checkFunc(test.want, err); err != nil {
23492349
// tt.Errorf("error = %v", err)
23502350
// }

internal/test/mock/grpc/grpc_client_mock.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ func (gc *GRPCClientMock) OrderedRangeConcurrent(
5151
}
5252

5353
// ConnectedAddrs calls the ConnectedAddrsFunc object.
54-
func (gc *GRPCClientMock) ConnectedAddrs() []string {
54+
func (gc *GRPCClientMock) ConnectedAddrs(_ context.Context) []string {
5555
return gc.ConnectedAddrsFunc()
5656
}
5757

internal/test/mock/grpc_testify_mock.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func (c *ClientInternal) GetBackoff() backoff.Backoff {
199199
return v
200200
}
201201

202-
func (c *ClientInternal) ConnectedAddrs() []string {
202+
func (c *ClientInternal) ConnectedAddrs(ctx context.Context) []string {
203203
args := c.Called()
204204
v, ok := args.Get(0).([]string)
205205
if !ok {

pkg/gateway/lb/handler/grpc/aggregation.go

+4
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ func (s *server) aggregationSearch(
101101
target + " canceled: " + err.Error())...)
102102
sspan.SetStatus(trace.StatusError, err.Error())
103103
}
104+
log.Debug(err)
104105
return nil
105106
case errors.Is(err, context.DeadlineExceeded),
106107
errors.Is(err, errors.ErrRPCCallFailed(target, context.DeadlineExceeded)):
@@ -112,6 +113,7 @@ func (s *server) aggregationSearch(
112113
target + " deadline_exceeded: " + err.Error())...)
113114
sspan.SetStatus(trace.StatusError, err.Error())
114115
}
116+
log.Debug(err)
115117
return nil
116118
default:
117119
st, msg, err := status.ParseError(err, codes.Unknown, "failed to parse search gRPC error response",
@@ -168,6 +170,7 @@ func (s *server) aggregationSearch(
168170
target + " canceled: " + err.Error())...)
169171
sspan.SetStatus(trace.StatusError, err.Error())
170172
}
173+
log.Debug(err)
171174
return nil
172175
case errors.Is(err, context.DeadlineExceeded),
173176
errors.Is(err, errors.ErrRPCCallFailed(target, context.DeadlineExceeded)):
@@ -179,6 +182,7 @@ func (s *server) aggregationSearch(
179182
target + " deadline_exceeded: " + err.Error())...)
180183
sspan.SetStatus(trace.StatusError, err.Error())
181184
}
185+
log.Debug(err)
182186
return nil
183187
default:
184188
st, msg, err := status.ParseError(err, codes.Unknown, "failed to parse search gRPC error response",

pkg/gateway/mirror/service/mirror.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ func (m *mirr) Start(ctx context.Context) <-chan error { // skipcq: GO-R1005
160160
}
161161
}
162162
}
163-
log.Debugf("[mirror]: connected mirror gateway targets: %v", m.gateway.GRPCClient().ConnectedAddrs())
163+
log.Debugf("[mirror]: connected mirror gateway targets: %v", m.gateway.GRPCClient().ConnectedAddrs(ctx))
164164
}
165165
}
166166
})

0 commit comments

Comments
 (0)