diff --git a/plugins/ipam/dhcp/lease.go b/plugins/ipam/dhcp/lease.go index ffc379570..693b07670 100644 --- a/plugins/ipam/dhcp/lease.go +++ b/plugins/ipam/dhcp/lease.go @@ -16,10 +16,12 @@ package main import ( "context" + "errors" "fmt" "log" "math/rand" "net" + "reflect" "strings" "sync" "sync/atomic" @@ -55,6 +57,13 @@ const ( leaseStateRebinding ) +// Timing for retrying link existence check +const ( + linkCheckDelay0 = 1 * time.Second + linkCheckRetryMax = 10 * time.Second + linkCheckTotalTimeout = 30 * time.Second +) + // This implementation uses 1 OS thread per lease. This is because // all the network operations have to be done in network namespace // of the interface. This can be improved by switching to the proper @@ -65,6 +74,7 @@ type DHCPLease struct { clientID string latestLease *nclient4.Lease link netlink.Link + linkexists bool renewalTime time.Time rebindingTime time.Time expireTime time.Time @@ -292,6 +302,14 @@ func (l *DHCPLease) maintain() { for { var sleepDur time.Duration + linkCheckCtx, cancel := context.WithTimeoutCause(l.ctx, l.resendTimeout, errNoMoreTries) + defer cancel() + linkExists, _ := l.checkLinkExistsWithBackoff(linkCheckCtx) + if !linkExists { + log.Printf("%v: interface %s no longer exists or link check failed, terminating lease maintenance", l.clientID, l.link.Attrs().Name) + return + } + switch state { case leaseStateBound: sleepDur = time.Until(l.renewalTime) @@ -344,6 +362,50 @@ func (l *DHCPLease) maintain() { } } +func (l *DHCPLease) checkLinkExistsWithBackoff(ctx context.Context) (bool, error) { + checkFunc := func() (bool, error) { + _, err := netlink.LinkByName(l.link.Attrs().Name) + if err != nil { + log.Printf("!bang: Error checking network link '%s': %v, type: %v", l.link.Attrs().Name, err, reflect.TypeOf(err)) + var linkNotFoundErr *netlink.LinkNotFoundError + if errors.As(err, &linkNotFoundErr) { + log.Printf("!bang Link not found error detected! this is what I want to see.") + return false, nil + } + log.Printf("!bang Unexpected error type: %v, retrying...", reflect.TypeOf(err)) + return false, err + } + return true, nil + } + + ctx, cancel := context.WithTimeout(ctx, linkCheckTotalTimeout) + defer cancel() + + exists, err := backoffRetryLinkExists(ctx, linkCheckRetryMax, checkFunc) + return exists, err +} + +func backoffRetryLinkExists(ctx context.Context, maxDelay time.Duration, f func() (bool, error)) (bool, error) { + var baseDelay time.Duration = linkCheckDelay0 + for { + exists, err := f() + if err == nil { + return exists, nil + } + + log.Printf("!bang: Retrying due to error: %v. Next retry in %v seconds", err, baseDelay) + + select { + case <-ctx.Done(): + return false, ctx.Err() // Context's done, return with its error + case <-time.After(baseDelay): + if baseDelay < maxDelay { + baseDelay *= 2 + } + } + } +} + func (l *DHCPLease) downIface() { if err := netlink.LinkSetDown(l.link); err != nil { log.Printf("%v: failed to bring %v interface DOWN: %v", l.clientID, l.link.Attrs().Name, err)