diff --git a/CHANGELOG.md b/CHANGELOG.md index 32c3c7c82ed..889e5667910 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ ### 🛠 Improvements - [11209](https://github.com/vegaprotocol/vega/issues/11209) - Publish ongoing games data. +- [11242](https://github.com/vegaprotocol/vega/issues/11242) - Add configuration to visor to help control binary retries a bit better. - [11196](https://github.com/vegaprotocol/vega/issues/11196) - Add an active field in the price monitoring bounds payload. - [11211](https://github.com/vegaprotocol/vega/issues/11211) - Liquidation engine includes `vAMM` shapes as available volume. - [11217](https://github.com/vegaprotocol/vega/issues/11217) - Allow market proposals to override risk factors. diff --git a/visor/config/visor_config.go b/visor/config/visor_config.go index 9fa176cb6ef..ed25c404f81 100644 --- a/visor/config/visor_config.go +++ b/visor/config/visor_config.go @@ -128,9 +128,20 @@ type VisorConfigFile struct { The `maxNumberOfFirstConnectionRetries` is only taken into account during the first start up of the Core node process - not restarts. note: | There is a 2 second delay between each attempt. Setting the max retry number to 5 means Visor will try to establish a connection 5 times in 10 seconds. - default: 10 + default: 175000 */ MaxNumberOfFirstConnectionRetries int `toml:"maxNumberOfFirstConnectionRetries,optional"` + /* + description: | + Visor communicates with the core node via RPC API. + This variable allows a validator to specify how many times Visor should try to establish a connection to the core node before the Visor process fails. + The `maxNumberOfRestartConnectionRetries` is only taken into account after the first start up of the Core node process where it is expected that the + time to restart will be much shorter than when originally started. + note: | + There is a 2 second delay between each attempt. 
Setting the max retry number to 5 means Visor will try to establish a connection 5 times in 10 seconds. + default: 10 + */ + MaxNumberOfRestartConnectionRetries int `toml:"maxNumberOfRestartConnectionRetries,optional"` /* description: | Defines the maximum number of restarts in case any of @@ -220,12 +231,13 @@ func DefaultVisorConfig(log *logging.Logger, homePath string) *VisorConfig { homePath: homePath, configPath: path.Join(homePath, configFileName), data: &VisorConfigFile{ - UpgradeFolders: map[string]string{"vX.X.X": "vX.X.X"}, - MaxNumberOfRestarts: 3, - MaxNumberOfFirstConnectionRetries: 175000, - RestartsDelaySeconds: 5, - StopDelaySeconds: 0, - StopSignalTimeoutSeconds: 15, + UpgradeFolders: map[string]string{"vX.X.X": "vX.X.X"}, + MaxNumberOfRestarts: 3, + MaxNumberOfFirstConnectionRetries: 175000, + MaxNumberOfRestartConnectionRetries: 10, + RestartsDelaySeconds: 5, + StopDelaySeconds: 0, + StopSignalTimeoutSeconds: 15, AutoInstall: AutoInstallConfig{ Enabled: true, GithubRepositoryOwner: "vegaprotocol", @@ -367,6 +379,13 @@ func (pc *VisorConfig) MaxNumberOfFirstConnectionRetries() int { return pc.data.MaxNumberOfFirstConnectionRetries } +func (pc *VisorConfig) MaxNumberOfRestartConnectionRetries() int { + pc.mut.RLock() + defer pc.mut.RUnlock() + + return pc.data.MaxNumberOfRestartConnectionRetries +} + func (pc *VisorConfig) RestartsDelaySeconds() int { pc.mut.RLock() defer pc.mut.RUnlock() diff --git a/visor/visor.go b/visor/visor.go index eb0cfb89187..7a5f845e7bb 100644 --- a/visor/visor.go +++ b/visor/visor.go @@ -29,7 +29,6 @@ import ( const ( upgradeAPICallTickerDuration = time.Second * 2 - maxUpgradeStatusErrs = 10 namedLogger = "visor" ) @@ -114,8 +113,11 @@ func (v *Visor) Run(ctx context.Context) error { runConf.Vega.RCP.HTTPPath, ) + // how many times to try and connect on the first start up of the binaries maxNumberOfFirstConnectionRetries := v.conf.MaxNumberOfFirstConnectionRetries() + // how many times to try and connect in subsequent 
restarts of the binaries where it is expected to be much quicker + maxUpgradeStatusErrs := v.conf.MaxNumberOfRestartConnectionRetries() numOfUpgradeStatusErrs := 0 maxNumRestarts := v.conf.MaxNumberOfRestarts() restartsDelay := time.Second * time.Duration(v.conf.RestartsDelaySeconds()) @@ -162,12 +164,13 @@ func (v *Visor) Run(ctx context.Context) error { upStatus, err := c.UpgradeStatus(ctx) if err != nil { // Binary has not started yet - waiting for first startup - if numOfRestarts == 0 { - if numOfUpgradeStatusErrs > maxNumberOfFirstConnectionRetries { + isFirstStartup := numOfRestarts == 0 + if isFirstStartup { + if numOfUpgradeStatusErrs >= maxNumberOfFirstConnectionRetries { return failedToGetStatusErr(maxNumberOfFirstConnectionRetries, err) } } else { // Binary has been started already. Something has failed after the startup - if numOfUpgradeStatusErrs > maxUpgradeStatusErrs { + if numOfUpgradeStatusErrs >= maxUpgradeStatusErrs { return failedToGetStatusErr(maxUpgradeStatusErrs, err) } } @@ -176,12 +179,17 @@ func (v *Visor) Run(ctx context.Context) error { numOfUpgradeStatusErrs++ - v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", maxUpgradeStatusErrs-numOfUpgradeStatusErrs)) + attemptsLeft := maxUpgradeStatusErrs - numOfUpgradeStatusErrs + if isFirstStartup { + attemptsLeft = maxNumberOfFirstConnectionRetries - numOfUpgradeStatusErrs + } + v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", attemptsLeft)) break } if !upStatus.ReadyToUpgrade { + numOfUpgradeStatusErrs = 0 break }