Skip to content

Commit

Permalink
Generate events for NMState Handler failures (#1190)
Browse files Browse the repository at this point in the history
On top of reporting status via nodenetworkconfigurationenactments, also
generate events for failures. These events directly show up in the
output of `kubectl describe nodenetworkconfigurationpolicies` and thus
provide a standardized way for administrators to understand why a
reconciliation failed.

Signed-off-by: Andreas Karis <[email protected]>
  • Loading branch information
andreaskaris authored May 31, 2023
1 parent dd4f9f0 commit 95151c7
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 3 deletions.
1 change: 1 addition & 0 deletions cmd/handler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ func setupHandlerControllers(mgr manager.Manager) error {
APIClient: apiClient,
Log: ctrl.Log.WithName("controllers").WithName("NodeNetworkConfigurationPolicy"),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor(fmt.Sprintf("%s.nmstate-handler", environment.NodeName())),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create NodeNetworkConfigurationPolicy controller", "controller", "NMState")
return err
Expand Down
15 changes: 13 additions & 2 deletions controllers/handler/nodenetworkconfigurationpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/go-logr/logr"
"github.com/pkg/errors"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/retry"

ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -60,6 +61,10 @@ import (
"github.com/nmstate/kubernetes-nmstate/pkg/selectors"
)

// ReconcileFailed is the reason string attached to the warning event that the
// reconciler records when applying the desired state fails.
const ReconcileFailed = "ReconcileFailed"

var (
nodeName string
nodeRunningUpdateRetryTime = 5 * time.Second
Expand Down Expand Up @@ -103,6 +108,7 @@ type NodeNetworkConfigurationPolicyReconciler struct {
APIClient client.Client
Log logr.Logger
Scheme *runtime.Scheme
Recorder record.EventRecorder
}

func init() {
Expand All @@ -121,7 +127,8 @@ func init() {
// Note:
// The Controller will requeue the Request to be processed again if the returned error is non-nil or
// Result.Requeue is true, otherwise upon completion it will remove the work from the queue.
//nolint: funlen, gocyclo
//
//nolint:funlen,gocyclo
func (r *NodeNetworkConfigurationPolicyReconciler) Reconcile(_ context.Context, request ctrl.Request) (ctrl.Result, error) {
_ = context.Background()
log := r.Log.WithValues("nodenetworkconfigurationpolicy", request.NamespacedName)
Expand Down Expand Up @@ -218,9 +225,13 @@ func (r *NodeNetworkConfigurationPolicyReconciler) Reconcile(_ context.Context,

nmstateOutput, err := nmstate.ApplyDesiredState(r.APIClient, enactmentInstance.Status.DesiredState)
if err != nil {
errmsg := fmt.Errorf("error reconciling NodeNetworkConfigurationPolicy at desired state apply: %s,\n %v", nmstateOutput, err)
errmsg := fmt.Errorf("error reconciling NodeNetworkConfigurationPolicy on node %s at desired state apply: %q,\n %v",
nodeName, nmstateOutput, err)
enactmentConditions.NotifyFailedToConfigure(errmsg)
log.Error(errmsg, fmt.Sprintf("Rolling back network configuration, manual intervention needed: %s", nmstateOutput))
if r.Recorder != nil {
r.Recorder.Event(instance, corev1.EventTypeWarning, ReconcileFailed, errmsg.Error())
}
return ctrl.Result{}, nil
}
log.Info("nmstate", "output", nmstateOutput)
Expand Down
13 changes: 13 additions & 0 deletions deploy/handler/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,16 @@ rules:
verbs:
- use
{{- end }}
---
# Namespaced Role that lets its subjects create Event objects of the core
# ("") API group in the "default" namespace.
# NOTE(review): presumably needed because events for the cluster-scoped
# NodeNetworkConfigurationPolicy are written to "default" - confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{template "handlerPrefix" .}}nmstate-handler-events
namespace: default
rules:
- apiGroups:
- ""
resources:
- events
verbs:
# Only "create" is granted; no read, update or delete access to events.
- 'create'
15 changes: 15 additions & 0 deletions deploy/handler/role_binding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,18 @@ roleRef:
kind: ClusterRole
name: {{template "handlerPrefix" .}}nmstate-handler
apiGroup: rbac.authorization.k8s.io
---
# Binds the nmstate-handler ServiceAccount (living in the handler namespace)
# to the nmstate-handler-events Role in the "default" namespace.
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{template "handlerPrefix" .}}nmstate-handler-events
namespace: default
subjects:
- kind: ServiceAccount
name: {{template "handlerPrefix" .}}nmstate-handler
# The ServiceAccount's namespace differs from the binding's namespace.
namespace: {{ .HandlerNamespace }}
roleRef:
kind: Role
apiGroup: rbac.authorization.k8s.io
name: {{template "handlerPrefix" .}}nmstate-handler-events
# NOTE(review): rbac/v1 RoleRef defines only apiGroup, kind and name; a
# RoleBinding always resolves a Role in its own namespace. Confirm whether
# this key is intended or should be dropped.
namespace: default
25 changes: 24 additions & 1 deletion docs/user-guide/103-troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ the operator protect the user from breaking the cluster networking.
## Invalid configuration

If any of the following cases render the configuration faulty, the setup will be
automatically rolled back and Enactment will report the failure.
automatically rolled back and Enactment will report the failure. In addition,
an event for the NodeNetworkConfigurationPolicy will be created.

* Configuration fails to be applied on the host (due to missing interfaces, inability to obtain IP, invalid attributes, ...)
* Connectivity to the default gateway is broken
Expand Down Expand Up @@ -110,6 +111,28 @@ the `ERROR` log line:
Connection activation failed on connection_id eth666: error=nm-manager-error-quark: No suitable device found for this connection
```

In addition, events are generated in the default namespace:
```shell
$ kubectl get events | grep Warning
6m55s Warning ReconcileFailed nodenetworkconfigurationpolicy/eth666 error reconciling NodeNetworkConfigurationPolicy on node node02 at desired state apply: "",...
```

These events are also listed for the NodeNetworkConfigurationPolicy in the output of `kubectl describe`:
```shell
kubectl describe nodenetworkconfigurationpolicies.nmstate.io eth666
```

```
# output truncated
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning ReconcileFailed 8m25s node02.nmstate-handler error reconciling NodeNetworkConfigurationPolicy on node node02 at desired state apply: "",
(...)
NmstateError: InvalidArgument: Ethernet interface eth666 does not exists
'
```

The configuration therefore failed due to absence of NIC `eth666` on the node.
Now we can either fix the Policy to edit an available interface or safely remove
it:
Expand Down

0 comments on commit 95151c7

Please sign in to comment.