diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md
index 96f3cdf8541..61cf6be1cf8 100644
--- a/changelog/18.0/18.0.0/summary.md
+++ b/changelog/18.0/18.0.0/summary.md
@@ -18,6 +18,7 @@
- [Deleted `vtgr`](#deleted-vtgr)
- **[New stats](#new-stats)**
- [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters)
+ - [VTBackup stat `PhaseStatus`](#vtbackup-phasestatus-stat)
- **[VTTablet](#vttablet)**
- [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences)
- **[Docker](#docker)**
@@ -120,6 +121,14 @@ The `vtgr` has been deprecated in Vitess 17, also see https://github.com/vitessi
The VTGate stat `VindexUnknownParameters` gauges unknown Vindex parameters found in the latest VSchema pulled from the topology.
+#### VTBackup `PhaseStatus` stat
+
+`PhaseStatus` reports a 1 (active) or a 0 (inactive) for each of the following phases and statuses:
+
+ * `CatchUpReplication` phase has statuses `Stalled` and `Stopped`.
+   * `Stalled` is set to `1` when replication stops advancing.
+   * `Stopped` is set to `1` when replication stops before `vtbackup` catches up with the primary.
+
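+For illustration, assuming the usual dot-joined rendering of multi-label stats, this may surface in `vtbackup`'s `/debug/vars` roughly as:
+
+```json
+{
+  "PhaseStatus": {
+    "CatchUpReplication.Stalled": 0,
+    "CatchUpReplication.Stopped": 1
+  }
+}
+```
+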
### VTTablet
#### New ResetSequences rpc
diff --git a/go/cmd/vtbackup/vtbackup.go b/go/cmd/vtbackup/vtbackup.go
index 31b7a4dc596..f27a991d35e 100644
--- a/go/cmd/vtbackup/vtbackup.go
+++ b/go/cmd/vtbackup/vtbackup.go
@@ -99,6 +99,10 @@ const (
// place a hard cap on the overall time for a backup, while also not waiting
// forever for things that should be quick.
operationTimeout = 1 * time.Minute
+
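+	// Phase and status names reported as label values of the PhaseStatus stat.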
+ phaseNameCatchUpReplication = "CatchUpReplication"
+ phaseStatusCatchUpReplicationStalled = "Stalled"
+ phaseStatusCatchUpReplicationStopped = "Stopped"
)
var (
@@ -128,6 +132,17 @@ var (
"How long it took vtbackup to perform each phase (in seconds).",
"phase",
)
+ phaseStatus = stats.NewGaugesWithMultiLabels(
+ "PhaseStatus",
+ "Internal state of vtbackup phase.",
+ []string{"phase", "status"},
+ )
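+	// phaseStatuses maps each phase to the status labels it can report, so that
+	// every (phase, status) gauge can be initialized to 0 at startup.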
+ phaseStatuses = map[string][]string{
+ phaseNameCatchUpReplication: {
+ phaseStatusCatchUpReplicationStalled,
+ phaseStatusCatchUpReplicationStopped,
+ },
+ }
)
func registerFlags(fs *pflag.FlagSet) {
@@ -203,6 +218,13 @@ func main() {
topoServer := topo.Open()
defer topoServer.Close()
+	// Initialize the phaseStatus gauges to 0 so every phase/status pair is exported from startup.
+ for phaseName, statuses := range phaseStatuses {
+ for _, status := range statuses {
+ phaseStatus.Set([]string{phaseName, status}, 0)
+ }
+ }
+
// Try to take a backup, if it's been long enough since the last one.
// Skip pruning if backup wasn't fully successful. We don't want to be
// deleting things if the backup process is not healthy.
@@ -433,7 +455,13 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
backupParams.BackupTime = time.Now()
// Wait for replication to catch up.
- waitStartTime := time.Now()
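+	// Track the previous replication status so the loop below can detect when
+	// the position stops advancing (reported as the "Stalled" status).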
+ var (
+ lastStatus replication.ReplicationStatus
+ status replication.ReplicationStatus
+ statusErr error
+
+ waitStartTime = time.Now()
+ )
for {
select {
case <-ctx.Done():
@@ -441,7 +469,8 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
case <-time.After(time.Second):
}
- status, statusErr := mysqld.ReplicationStatus()
+ lastStatus = status
+ status, statusErr = mysqld.ReplicationStatus()
if statusErr != nil {
log.Warningf("Error getting replication status: %v", statusErr)
continue
@@ -453,11 +482,21 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
durationByPhase.Set("CatchUpReplication", int64(time.Since(waitStartTime).Seconds()))
break
}
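+		// If the position has not advanced since the previous poll, report the
+		// CatchUpReplication phase as stalled; clear the gauge once it moves again.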
+ if !lastStatus.Position.IsZero() {
+ if status.Position.Equal(lastStatus.Position) {
+ phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 1)
+ } else {
+ phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 0)
+ }
+ }
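+		// While replication is unhealthy, report the phase as stopped; the gauge
+		// is cleared as soon as a healthy status is observed again.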
if !status.Healthy() {
log.Warning("Replication has stopped before backup could be taken. Trying to restart replication.")
+ phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 1)
if err := startReplication(ctx, mysqld, topoServer); err != nil {
log.Warningf("Failed to restart replication: %v", err)
}
+ } else {
+ phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 0)
}
}
@@ -467,14 +506,16 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
}
// Did we make any progress?
- status, err := mysqld.ReplicationStatus()
- if err != nil {
+ status, statusErr = mysqld.ReplicationStatus()
+ if statusErr != nil {
-		return fmt.Errorf("can't get replication status: %v", err)
+		return fmt.Errorf("can't get replication status: %v", statusErr)
}
log.Infof("Replication caught up to %v", status.Position)
if !status.Position.AtLeast(primaryPos) && status.Position.Equal(restorePos) {
return fmt.Errorf("not taking backup: replication did not make any progress from restore point: %v", restorePos)
}
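+	// Catch-up is done; clear both CatchUpReplication status gauges.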
+ phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStalled}, 0)
+ phaseStatus.Set([]string{phaseNameCatchUpReplication, phaseStatusCatchUpReplicationStopped}, 0)
// Re-enable redo logging.
if disabledRedoLog {