diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index e36c35924b1..247dd240e73 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -17,6 +17,7 @@ vtorc \ Flags: --allow-emergency-reparent Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true) + --allow-recovery Allow recovery actions (default true) --alsologtostderr log to standard error as well as files --audit-file-location string File location where the audit logs are to be stored --audit-purge-duration duration Duration for which audit logs are held before being purged. Should be in multiples of days (default 168h0m0s) diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index e3b55d64c6b..f79caeae08a 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -57,6 +57,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { "--topo_global_root", clusterInfo.ClusterInstance.VtctlProcess.TopoGlobalRoot, } servenv.ParseFlags("vtorc") + config.Config.AllowRecovery = true config.Config.RecoveryPeriodBlockSeconds = 1 config.Config.InstancePollSeconds = 1 config.MarkConfigurationLoaded() diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index ba3c41ddc61..8b36341d318 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -56,6 +56,7 @@ var ( auditToBackend = false auditToSyslog = false auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days + allowRecovery = true recoveryPeriodBlockDuration = 30 * time.Second preventCrossCellFailover = false waitReplicasTimeout = 30 * time.Second @@ -76,6 +77,7 @@ func RegisterFlags(fs *pflag.FlagSet) { fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") + fs.BoolVar(&allowRecovery, "allow-recovery", allowRecovery, "Allow recovery actions") fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery") fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") @@ -104,6 +106,7 @@ type Configuration struct { WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. + AllowRecovery bool // Allow recoveries. RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs } @@ -134,12 +137,13 @@ func UpdateConfigValuesFromFlags() { Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) + Config.AllowRecovery = allowRecovery Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) } // ERSEnabled reports whether VTOrc is allowed to run ERS or not. func ERSEnabled() bool { - return ersEnabled + return allowRecovery && ersEnabled } // SetERSEnabled sets the value for the ersEnabled variable. This should only be used from tests. @@ -172,6 +176,7 @@ func newConfiguration() *Configuration { AuditLogFile: "", AuditToSyslog: false, AuditToBackendDB: false, + AllowRecovery: true, AuditPurgeDays: 7, RecoveryPeriodBlockSeconds: 30, PreventCrossDataCenterPrimaryFailover: false, diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index b9d62fc2982..c44b66b5994 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -335,7 +335,6 @@ func ContinuousDiscovery() { healthTick := time.Tick(config.HealthPollSeconds * time.Second) caretakingTick := time.Tick(time.Minute) - recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) tabletTopoTick := OpenTabletDiscovery() var recoveryEntrance int64 var snapshotTopologiesTick <-chan time.Time @@ -343,6 +342,12 @@ func ContinuousDiscovery() { snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) } + recoveryTicker := time.NewTicker(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) + defer recoveryTicker.Stop() + if !config.Config.AllowRecovery { + recoveryTicker.Stop() + } + runCheckAndRecoverOperationsTimeRipe := func() bool { return time.Since(continuousDiscoveryStartTime) >= checkAndRecoverWaitPeriod } @@ -376,7 +381,7 @@ func ContinuousDiscovery() { go ExpireTopologyRecoveryStepsHistory() } }() - case <-recoveryTick: + case <-recoveryTicker.C: go func() { if IsLeaderOrActive() { go ClearActiveFailureDetections()