From 3af5fcdddab56b9dd8fce60cb8cb26135f5591b5 Mon Sep 17 00:00:00 2001 From: wwestgarth Date: Fri, 10 May 2024 16:15:01 +0100 Subject: [PATCH] fix: fix restart counter and add new config for connection attempts after the first binary start --- visor/config/visor_config.go | 33 ++++++++++++++++++++++++++------- visor/visor.go | 18 +++++++++++++----- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/visor/config/visor_config.go b/visor/config/visor_config.go index 9fa176cb6ef..ed25c404f81 100644 --- a/visor/config/visor_config.go +++ b/visor/config/visor_config.go @@ -128,9 +128,20 @@ type VisorConfigFile struct { The `maxNumberOfFirstConnectionRetries` is only taken into account during the first start up of the Core node process - not restarts. note: | There is a 2 second delay between each attempt. Setting the max retry number to 5 means Visor will try to establish a connection 5 times in 10 seconds. - default: 10 + default: 175000 */ MaxNumberOfFirstConnectionRetries int `toml:"maxNumberOfFirstConnectionRetries,optional"` + /* + description: | + Visor communicates with the core node via RPC API. + This variable allows a validator to specify how many times Visor should try to establish a connection to the core node before the Visor process fails. + The `maxNumberOfRestartConnectionRetries` is only taken into account after the first start up of the Core node process, where it is expected that the + time to restart will be much shorter than when originally started. + note: | + There is a 2 second delay between each attempt. Setting the max retry number to 5 means Visor will try to establish a connection 5 times in 10 seconds. 
+ default: 10 + */ + MaxNumberOfRestartConnectionRetries int `toml:"maxNumberOfRestartConnectionRetries,optional"` /* description: | Defines the maximum number of restarts in case any of @@ -220,12 +231,13 @@ func DefaultVisorConfig(log *logging.Logger, homePath string) *VisorConfig { homePath: homePath, configPath: path.Join(homePath, configFileName), data: &VisorConfigFile{ - UpgradeFolders: map[string]string{"vX.X.X": "vX.X.X"}, - MaxNumberOfRestarts: 3, - MaxNumberOfFirstConnectionRetries: 175000, - RestartsDelaySeconds: 5, - StopDelaySeconds: 0, - StopSignalTimeoutSeconds: 15, + UpgradeFolders: map[string]string{"vX.X.X": "vX.X.X"}, + MaxNumberOfRestarts: 3, + MaxNumberOfFirstConnectionRetries: 175000, + MaxNumberOfRestartConnectionRetries: 10, + RestartsDelaySeconds: 5, + StopDelaySeconds: 0, + StopSignalTimeoutSeconds: 15, AutoInstall: AutoInstallConfig{ Enabled: true, GithubRepositoryOwner: "vegaprotocol", @@ -367,6 +379,13 @@ func (pc *VisorConfig) MaxNumberOfFirstConnectionRetries() int { return pc.data.MaxNumberOfFirstConnectionRetries } +func (pc *VisorConfig) MaxNumberOfRestartConnectionRetries() int { + pc.mut.RLock() + defer pc.mut.RUnlock() + + return pc.data.MaxNumberOfRestartConnectionRetries +} + func (pc *VisorConfig) RestartsDelaySeconds() int { pc.mut.RLock() defer pc.mut.RUnlock() diff --git a/visor/visor.go b/visor/visor.go index eb0cfb89187..7a5f845e7bb 100644 --- a/visor/visor.go +++ b/visor/visor.go @@ -29,7 +29,6 @@ import ( const ( upgradeAPICallTickerDuration = time.Second * 2 - maxUpgradeStatusErrs = 10 namedLogger = "visor" ) @@ -114,8 +113,11 @@ func (v *Visor) Run(ctx context.Context) error { runConf.Vega.RCP.HTTPPath, ) + // how many times to try and connect on the first start up of the binaries maxNumberOfFirstConnectionRetries := v.conf.MaxNumberOfFirstConnectionRetries() + // how many times to try and connect in subsequent restarts of the binaries where it is expected to be much quicker + maxUpgradeStatusErrs := 
v.conf.MaxNumberOfRestartConnectionRetries() numOfUpgradeStatusErrs := 0 maxNumRestarts := v.conf.MaxNumberOfRestarts() restartsDelay := time.Second * time.Duration(v.conf.RestartsDelaySeconds()) @@ -162,12 +164,13 @@ func (v *Visor) Run(ctx context.Context) error { upStatus, err := c.UpgradeStatus(ctx) if err != nil { // Binary has not started yet - waiting for first startup - if numOfRestarts == 0 { - if numOfUpgradeStatusErrs > maxNumberOfFirstConnectionRetries { + isFirstStartup := numOfRestarts == 0 + if isFirstStartup { + if numOfUpgradeStatusErrs >= maxNumberOfFirstConnectionRetries { return failedToGetStatusErr(maxNumberOfFirstConnectionRetries, err) } } else { // Binary has been started already. Something has failed after the startup - if numOfUpgradeStatusErrs > maxUpgradeStatusErrs { + if numOfUpgradeStatusErrs >= maxUpgradeStatusErrs { return failedToGetStatusErr(maxUpgradeStatusErrs, err) } } @@ -176,12 +179,17 @@ func (v *Visor) Run(ctx context.Context) error { numOfUpgradeStatusErrs++ - v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", maxUpgradeStatusErrs-numOfUpgradeStatusErrs)) + attemptsLeft := maxUpgradeStatusErrs - numOfUpgradeStatusErrs + if isFirstStartup { + attemptsLeft = maxNumberOfFirstConnectionRetries - numOfUpgradeStatusErrs + } + v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", attemptsLeft)) break } if !upStatus.ReadyToUpgrade { + numOfUpgradeStatusErrs = 0 break }