Skip to content

Commit

Permalink
fix: never remove etcd members whose ID has been discovered at least once
Browse files Browse the repository at this point in the history
The logic of the etcd audit became outdated with more recent Talos
versions. `apid` now runs in states where it wasn't available
before, so the check for the etcd member might lead to
false positives.
Also reorder the `auditMember` check sequence: the ephemeral mount check now runs before the etcd directory check.

Fixes: #750

Signed-off-by: Artem Chernyshev <[email protected]>
(cherry picked from commit 82da2f4)
  • Loading branch information
Unix4ever committed Dec 25, 2024
1 parent 219f671 commit 322891c
Showing 1 changed file with 20 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ func NewMachineSetEtcdAuditController(talosClientFactory *talos.ClientFactory, m
qtransform.WithExtraMappedInput(
mappers.MapByMachineSetLabel[*omni.ClusterMachineStatus, *omni.MachineSet](),
),
qtransform.WithExtraMappedInput(
mappers.MapByMachineSetLabelOnlyControlplane[*omni.ClusterMachineIdentity, *omni.MachineSet](),
),
qtransform.WithExtraMappedDestroyReadyInput(
mappers.MapByMachineSetLabel[*omni.ClusterMachine, *omni.MachineSet](),
),
Expand Down Expand Up @@ -335,6 +338,15 @@ func (auditor *etcdAuditor) auditEtcd(ctx context.Context, r controller.Reader,

// auditMember audits the etcd member in the given machine. It returns the member ID if it is ok (not an orphan).
func (auditor *etcdAuditor) auditMember(ctx context.Context, r controller.Reader, machine, clusterName string) (uint64, error) {
clusterMachineIdentity, err := safe.ReaderGetByID[*omni.ClusterMachineIdentity](ctx, r, machine)
if err != nil && !state.IsNotFoundError(err) {
return 0, err
}

if clusterMachineIdentity != nil && clusterMachineIdentity.TypedSpec().Value.EtcdMemberId != 0 {
return clusterMachineIdentity.TypedSpec().Value.EtcdMemberId, nil
}

cli, err := auditor.getNodeClient(ctx, r, clusterName, machine)
if err != nil {
if errors.Is(err, errSkipNode) {
Expand All @@ -356,11 +368,17 @@ func (auditor *etcdAuditor) auditMember(ctx context.Context, r controller.Reader
etcdMember *etcd.Member
)

if hasEtcdDirectory, err = auditor.checkEtcdDirectory(ctx, cli); err != nil {
if ephemeralMounted, err = auditor.checkEphemeralMount(ctx, cli); err != nil {
return 0, err
}

if ephemeralMounted, err = auditor.checkEphemeralMount(ctx, cli); err != nil {
if !ephemeralMounted {
requeueErr := fmt.Errorf("etcd audit skipped: machine %q from cluster %q doesn't have ephemeral partition mounted", machine, clusterName)

return 0, controller.NewRequeueError(requeueErr, auditor.requeueAfterDuration)
}

if hasEtcdDirectory, err = auditor.checkEtcdDirectory(ctx, cli); err != nil {
return 0, err
}

Expand All @@ -373,12 +391,6 @@ func (auditor *etcdAuditor) auditMember(ctx context.Context, r controller.Reader
return 0, nil
}

if !ephemeralMounted {
requeueErr := fmt.Errorf("etcd audit skipped: machine %q from cluster %q doesn't have ephemeral partition mounted", machine, clusterName)

return 0, controller.NewRequeueError(requeueErr, auditor.requeueAfterDuration)
}

if hasEtcdDirectory && etcdMember == nil {
requeueErr := fmt.Errorf("etcd audit skipped: machine %q from cluster %q still joining the cluster", machine, clusterName)

Expand Down

0 comments on commit 322891c

Please sign in to comment.