Skip to content

Commit

Permalink
Merge pull request #11260 from vegaprotocol/11242-visor-upgrade-checks
Browse files Browse the repository at this point in the history
fix: reset failed attempts counter when it works, but no upgrade is p…
  • Loading branch information
wwestgarth authored May 15, 2024
2 parents 4a5ca8b + 973b064 commit e0bd34c
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
### 🛠 Improvements

- [11209](https://github.com/vegaprotocol/vega/issues/11209) - Publish ongoing games data.
- [11242](https://github.com/vegaprotocol/vega/issues/11242) - Add configuration to visor to help control binary retries a bit better.
- [11196](https://github.com/vegaprotocol/vega/issues/11196) - Add an active field in the price monitoring bounds payload.
- [11211](https://github.com/vegaprotocol/vega/issues/11211) - Liquidation engine includes `vAMM` shapes as available volume.
- [11217](https://github.com/vegaprotocol/vega/issues/11217) - Allow market proposals to override risk factors.
Expand Down
33 changes: 26 additions & 7 deletions visor/config/visor_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,20 @@ type VisorConfigFile struct {
The `maxNumberOfFirstConnectionRetries` is only taken into account during the first start up of the Core node process - not restarts.
note: |
There is a 2 second delay between each attempt. Setting the max retry number to 5 means Visor will try to establish a connection 5 times in 10 seconds.
default: 10
default: 175000
*/
MaxNumberOfFirstConnectionRetries int `toml:"maxNumberOfFirstConnectionRetries,optional"`
/*
description: |
Visor communicates with the core node via RPC API.
This variable allows a validator to specify how many times Visor should try to establish a connection to the core node before the Visor process fails.
The `maxNumberOfRestartConnectionRetries` is only taken into account after the first start up of the Core node process, where the
time to restart is expected to be much shorter than the original start up.
note: |
There is a 2 second delay between each attempt. Setting the max retry number to 5 means Visor will try to establish a connection 5 times in 10 seconds.
default: 10
*/
MaxNumberOfRestartConnectionRetries int `toml:"maxNumberOfRestartConnectionRetries,optional"`
/*
description: |
Defines the maximum number of restarts in case any of
Expand Down Expand Up @@ -220,12 +231,13 @@ func DefaultVisorConfig(log *logging.Logger, homePath string) *VisorConfig {
homePath: homePath,
configPath: path.Join(homePath, configFileName),
data: &VisorConfigFile{
UpgradeFolders: map[string]string{"vX.X.X": "vX.X.X"},
MaxNumberOfRestarts: 3,
MaxNumberOfFirstConnectionRetries: 175000,
RestartsDelaySeconds: 5,
StopDelaySeconds: 0,
StopSignalTimeoutSeconds: 15,
UpgradeFolders: map[string]string{"vX.X.X": "vX.X.X"},
MaxNumberOfRestarts: 3,
MaxNumberOfFirstConnectionRetries: 175000,
MaxNumberOfRestartConnectionRetries: 10,
RestartsDelaySeconds: 5,
StopDelaySeconds: 0,
StopSignalTimeoutSeconds: 15,
AutoInstall: AutoInstallConfig{
Enabled: true,
GithubRepositoryOwner: "vegaprotocol",
Expand Down Expand Up @@ -367,6 +379,13 @@ func (pc *VisorConfig) MaxNumberOfFirstConnectionRetries() int {
return pc.data.MaxNumberOfFirstConnectionRetries
}

// MaxNumberOfRestartConnectionRetries returns the configured
// MaxNumberOfRestartConnectionRetries value from the loaded config data.
// The value is read while holding the config's read lock.
func (pc *VisorConfig) MaxNumberOfRestartConnectionRetries() int {
	pc.mut.RLock()
	// Deferred so the read lock is released even if the field access panics.
	defer pc.mut.RUnlock()

	retries := pc.data.MaxNumberOfRestartConnectionRetries
	return retries
}

func (pc *VisorConfig) RestartsDelaySeconds() int {
pc.mut.RLock()
defer pc.mut.RUnlock()
Expand Down
18 changes: 13 additions & 5 deletions visor/visor.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (

const (
// upgradeAPICallTickerDuration is a fixed two-second duration.
// Presumably the interval between upgrade status API calls - confirm against the ticker that uses it.
upgradeAPICallTickerDuration = time.Second * 2
// maxUpgradeStatusErrs is a fixed limit of 10.
// NOTE(review): Run also declares a local named maxUpgradeStatusErrs that is read from
// MaxNumberOfRestartConnectionRetries and would shadow this constant - confirm whether
// this constant is still needed.
maxUpgradeStatusErrs = 10
// namedLogger is the string "visor".
// Presumably the name given to this package's logger - confirm at the use site.
namedLogger = "visor"
)

Expand Down Expand Up @@ -114,8 +113,11 @@ func (v *Visor) Run(ctx context.Context) error {
runConf.Vega.RCP.HTTPPath,
)

// how many times to try and connect on the first start up of the binaries
maxNumberOfFirstConnectionRetries := v.conf.MaxNumberOfFirstConnectionRetries()

// how many times to try and connect in subsequent restarts of the binaries where it is expected to be much quicker
maxUpgradeStatusErrs := v.conf.MaxNumberOfRestartConnectionRetries()
numOfUpgradeStatusErrs := 0
maxNumRestarts := v.conf.MaxNumberOfRestarts()
restartsDelay := time.Second * time.Duration(v.conf.RestartsDelaySeconds())
Expand Down Expand Up @@ -162,12 +164,13 @@ func (v *Visor) Run(ctx context.Context) error {
upStatus, err := c.UpgradeStatus(ctx)
if err != nil {
// Binary has not started yet - waiting for first startup
if numOfRestarts == 0 {
if numOfUpgradeStatusErrs > maxNumberOfFirstConnectionRetries {
isFirstStartup := numOfRestarts == 0
if isFirstStartup {
if numOfUpgradeStatusErrs >= maxNumberOfFirstConnectionRetries {
return failedToGetStatusErr(maxNumberOfFirstConnectionRetries, err)
}
} else { // Binary has been started already. Something has failed after the startup
if numOfUpgradeStatusErrs > maxUpgradeStatusErrs {
if numOfUpgradeStatusErrs >= maxUpgradeStatusErrs {
return failedToGetStatusErr(maxUpgradeStatusErrs, err)
}
}
Expand All @@ -176,12 +179,17 @@ func (v *Visor) Run(ctx context.Context) error {

numOfUpgradeStatusErrs++

v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", maxUpgradeStatusErrs-numOfUpgradeStatusErrs))
attemptsLeft := maxUpgradeStatusErrs - numOfUpgradeStatusErrs
if isFirstStartup {
attemptsLeft = maxNumberOfFirstConnectionRetries - numOfUpgradeStatusErrs
}
v.log.Info("Still waiting for vega to start...", logging.Int("attemptLeft", attemptsLeft))

break
}

if !upStatus.ReadyToUpgrade {
numOfUpgradeStatusErrs = 0
break
}

Expand Down

0 comments on commit e0bd34c

Please sign in to comment.