Skip to content

Commit ce5dd53

Browse files
authored
node pod probe daemon (#1077)
Signed-off-by: liheng.zms <[email protected]>
1 parent 4cef459 commit ce5dd53

21 files changed

+2080
-62
lines changed

.github/workflows/ci.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ jobs:
4848
version: ${{ env.GOLANGCI_VERSION }}
4949
args: --verbose
5050
skip-pkg-cache: true
51+
mod: readonly
5152

5253
markdownlint-misspell-shellcheck:
5354
runs-on: ubuntu-18.04

.github/workflows/e2e-1.16.yaml

+61-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ on:
1010

1111
env:
1212
# Common versions
13-
GO_VERSION: '1.17'
13+
GO_VERSION: '1.18'
1414
KIND_VERSION: 'v0.14.0'
1515
KIND_IMAGE: 'kindest/node:v1.16.15'
1616
KIND_CLUSTER_NAME: 'ci-testing'
@@ -85,6 +85,18 @@ jobs:
8585
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
8686
exit 1
8787
fi
88+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
89+
do
90+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
91+
if [ "${restartCount}" -eq "0" ];then
92+
echo "Kruise-daemon has not restarted"
93+
else
94+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
95+
echo "Kruise-daemon has restarted, abort!!!"
96+
kubectl logs -p -n ${ns} ${name}
97+
exit 1
98+
fi
99+
done
88100
exit $retVal
89101
90102
pullimages-containerrecreate:
@@ -154,6 +166,18 @@ jobs:
154166
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
155167
exit 1
156168
fi
169+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
170+
do
171+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
172+
if [ "${restartCount}" -eq "0" ];then
173+
echo "Kruise-daemon has not restarted"
174+
else
175+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
176+
echo "Kruise-daemon has restarted, abort!!!"
177+
kubectl logs -p -n ${ns} ${name}
178+
exit 1
179+
fi
180+
done
157181
exit $retVal
158182
159183
advanced-daemonset:
@@ -223,6 +247,18 @@ jobs:
223247
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
224248
exit 1
225249
fi
250+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
251+
do
252+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
253+
if [ "${restartCount}" -eq "0" ];then
254+
echo "Kruise-daemon has not restarted"
255+
else
256+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
257+
echo "Kruise-daemon has restarted, abort!!!"
258+
kubectl logs -p -n ${ns} ${name}
259+
exit 1
260+
fi
261+
done
226262
exit $retVal
227263
228264
sidecarset:
@@ -292,6 +328,18 @@ jobs:
292328
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
293329
exit 1
294330
fi
331+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
332+
do
333+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
334+
if [ "${restartCount}" -eq "0" ];then
335+
echo "Kruise-daemon has not restarted"
336+
else
337+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
338+
echo "Kruise-daemon has restarted, abort!!!"
339+
kubectl logs -p -n ${ns} ${name}
340+
exit 1
341+
fi
342+
done
295343
exit $retVal
296344
297345
podUnavailableBudget:
@@ -419,4 +467,16 @@ jobs:
419467
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
420468
exit 1
421469
fi
470+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
471+
do
472+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
473+
if [ "${restartCount}" -eq "0" ];then
474+
echo "Kruise-daemon has not restarted"
475+
else
476+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
477+
echo "Kruise-daemon has restarted, abort!!!"
478+
kubectl logs -p -n ${ns} ${name}
479+
exit 1
480+
fi
481+
done
422482
exit $retVal

.github/workflows/e2e-1.24.yaml

+48
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,18 @@ jobs:
154154
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
155155
exit 1
156156
fi
157+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
158+
do
159+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
160+
if [ "${restartCount}" -eq "0" ];then
161+
echo "Kruise-daemon has not restarted"
162+
else
163+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
164+
echo "Kruise-daemon has restarted, abort!!!"
165+
kubectl logs -p -n ${ns} ${name}
166+
exit 1
167+
fi
168+
done
157169
exit $retVal
158170
159171
advanced-daemonset:
@@ -223,6 +235,18 @@ jobs:
223235
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
224236
exit 1
225237
fi
238+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
239+
do
240+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
241+
if [ "${restartCount}" -eq "0" ];then
242+
echo "Kruise-daemon has not restarted"
243+
else
244+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
245+
echo "Kruise-daemon has restarted, abort!!!"
246+
kubectl logs -p -n ${ns} ${name}
247+
exit 1
248+
fi
249+
done
226250
exit $retVal
227251
228252
sidecarset:
@@ -292,6 +316,18 @@ jobs:
292316
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
293317
exit 1
294318
fi
319+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
320+
do
321+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
322+
if [ "${restartCount}" -eq "0" ];then
323+
echo "Kruise-daemon has not restarted"
324+
else
325+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
326+
echo "Kruise-daemon has restarted, abort!!!"
327+
kubectl logs -p -n ${ns} ${name}
328+
exit 1
329+
fi
330+
done
295331
exit $retVal
296332
297333
ephemeraljob:
@@ -477,4 +513,16 @@ jobs:
477513
kubectl get pod -n kruise-system --no-headers -l control-plane=controller-manager | awk '{print $1}' | xargs kubectl logs -p -n kruise-system
478514
exit 1
479515
fi
516+
kubectl get pods -n kruise-system -l control-plane=daemon -o=jsonpath="{range .items[*]}{.metadata.namespace}{\"\t\"}{.metadata.name}{\"\n\"}{end}" | while read ns name;
517+
do
518+
restartCount=$(kubectl get pod -n ${ns} ${name} --no-headers | awk '{print $4}')
519+
if [ "${restartCount}" -eq "0" ];then
520+
echo "Kruise-daemon has not restarted"
521+
else
522+
kubectl get pods -n ${ns} -l control-plane=daemon --no-headers
523+
echo "Kruise-daemon has restarted, abort!!!"
524+
kubectl logs -p -n ${ns} ${name}
525+
exit 1
526+
fi
527+
done
480528
exit $retVal

CONTRIBUTING.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ We encourage contributors to follow the [PR template](./.github/PULL_REQUEST_TEM
8888
As a contributor, if you want to make any contribution to the Kruise project, we should reach an agreement on the version of tools used in the development environment.
8989
Here are some dependencies with specific versions:
9090

91-
- Golang : v1.17+
91+
- Golang : v1.18+
9292
- Kubernetes: v1.16+
9393

9494
### Developing guide

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Build the manager and daemon binaries
2-
FROM golang:1.17 as builder
2+
FROM golang:1.18 as builder
33

44
WORKDIR /workspace
55
# Copy the Go Modules manifests

Dockerfile_multiarch

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Build the manager and daemon binaries
2-
FROM --platform=$BUILDPLATFORM golang:1.17 as builder
2+
FROM --platform=$BUILDPLATFORM golang:1.18 as builder
33

44
WORKDIR /workspace
55
# Copy the Go Modules manifests

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ all: build
2222
##@ Development
2323

2424
go_check:
25-
@scripts/check_go_version "1.17.0"
25+
@scripts/check_go_version "1.18.0"
2626

2727
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
2828
@scripts/generate_client.sh

config/rbac/daemon_role.yaml

+16
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,19 @@ rules:
7272
- get
7373
- list
7474
- watch
75+
- apiGroups:
76+
- apps.kruise.io
77+
resources:
78+
- nodepodprobes
79+
verbs:
80+
- get
81+
- list
82+
- watch
83+
- apiGroups:
84+
- apps.kruise.io
85+
resources:
86+
- nodepodprobes/status
87+
verbs:
88+
- get
89+
- patch
90+
- update

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/openkruise/kruise
22

3-
go 1.17
3+
go 1.18
44

55
require (
66
github.com/alibaba/pouch v0.0.0-20190328125340-37051654f368

pkg/controller/nodepodprobe/node_pod_probe_controller.go

+13-16
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,19 @@ import (
2323
"strings"
2424
"time"
2525

26-
"k8s.io/apimachinery/pkg/util/sets"
27-
2826
appsv1alpha1 "github.com/openkruise/kruise/apis/apps/v1alpha1"
27+
"github.com/openkruise/kruise/pkg/features"
2928
"github.com/openkruise/kruise/pkg/util"
3029
utilclient "github.com/openkruise/kruise/pkg/util/client"
3130
"github.com/openkruise/kruise/pkg/util/controllerfinder"
3231
utildiscovery "github.com/openkruise/kruise/pkg/util/discovery"
32+
utilfeature "github.com/openkruise/kruise/pkg/util/feature"
3333
"github.com/openkruise/kruise/pkg/util/ratelimiter"
3434
corev1 "k8s.io/api/core/v1"
3535
"k8s.io/apimachinery/pkg/api/errors"
3636
"k8s.io/apimachinery/pkg/runtime"
3737
"k8s.io/apimachinery/pkg/types"
38+
"k8s.io/apimachinery/pkg/util/sets"
3839
"k8s.io/client-go/util/retry"
3940
"k8s.io/klog/v2"
4041
kubecontroller "k8s.io/kubernetes/pkg/controller"
@@ -67,7 +68,7 @@ var (
6768
// Add creates a new NodePodProbe Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller
6869
// and Start it when the Manager is Started.
6970
func Add(mgr manager.Manager) error {
70-
if !utildiscovery.DiscoverGVK(controllerKind) {
71+
if !utildiscovery.DiscoverGVK(controllerKind) || !utilfeature.DefaultFeatureGate.Enabled(features.PodProbeMarkerGate) {
7172
return nil
7273
}
7374
return add(mgr, newReconciler(mgr))
@@ -220,9 +221,10 @@ func (r *ReconcileNodePodProbe) syncPodFromNodePodProbe(npp *appsv1alpha1.NodePo
220221

221222
func (r *ReconcileNodePodProbe) updatePodProbeStatus(pod *corev1.Pod, status appsv1alpha1.PodProbeStatus) error {
222223
// map[probe.name]->probeState
223-
currentConditions := make(map[string]appsv1alpha1.ProbeState)
224-
for _, condition := range pod.Status.Conditions {
225-
currentConditions[string(condition.Type)] = appsv1alpha1.ProbeState(condition.Status)
224+
currentConditions := make(map[string]*corev1.PodCondition)
225+
for i := range pod.Status.Conditions {
226+
condition := &pod.Status.Conditions[i]
227+
currentConditions[string(condition.Type)] = condition
226228
}
227229
type metadata struct {
228230
Labels map[string]interface{} `json:"labels,omitempty"`
@@ -239,11 +241,9 @@ func (r *ReconcileNodePodProbe) updatePodProbeStatus(pod *corev1.Pod, status app
239241
validConditionTypes := sets.NewString()
240242
for i := range status.ProbeStates {
241243
probeState := status.ProbeStates[i]
242-
// ignore the probe state
243-
if probeState.State == "" || probeState.State == currentConditions[probeState.Name] {
244+
if probeState.State == "" {
244245
continue
245246
}
246-
247247
// fetch podProbeMarker
248248
ppmName, probeName := strings.Split(probeState.Name, "#")[0], strings.Split(probeState.Name, "#")[1]
249249
ppm := &appsv1alpha1.PodProbeMarker{}
@@ -267,7 +267,6 @@ func (r *ReconcileNodePodProbe) updatePodProbeStatus(pod *corev1.Pod, status app
267267
break
268268
}
269269
}
270-
271270
if conditionType != "" && validConditionTypes.Has(conditionType) {
272271
klog.Warningf("NodePodProbe(%s) pod(%s/%s) condition(%s) is conflict", ppmName, pod.Namespace, pod.Name, conditionType)
273272
// patch pod condition
@@ -287,11 +286,9 @@ func (r *ReconcileNodePodProbe) updatePodProbeStatus(pod *corev1.Pod, status app
287286
Message: probeState.Message,
288287
})
289288
}
290-
291289
if len(policy) == 0 {
292290
continue
293291
}
294-
295292
// matchedPolicy is when policy.state is equal to probeState.State, otherwise oppositePolicy
296293
// 1. If policy[0].state = Succeeded, policy[1].state = Failed. probeState.State = Succeeded.
297294
// So policy[0] is matchedPolicy, policy[1] is oppositePolicy
@@ -328,15 +325,14 @@ func (r *ReconcileNodePodProbe) updatePodProbeStatus(pod *corev1.Pod, status app
328325
if len(probeConditions) == 0 && len(probeMetadata.Labels) == 0 && len(probeMetadata.Annotations) == 0 {
329326
return nil
330327
}
331-
332328
//update pod metadata and status condition
333329
podClone := pod.DeepCopy()
334330
if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
335331
if err = r.Client.Get(context.TODO(), types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}, podClone); err != nil {
336332
klog.Errorf("error getting updated pod(%s/%s) from client", pod.Namespace, pod.Name)
337333
return err
338334
}
339-
oldStatus := podClone.DeepCopy()
335+
oldStatus := podClone.Status.DeepCopy()
340336
for i := range probeConditions {
341337
condition := probeConditions[i]
342338
util.SetPodCondition(podClone, condition)
@@ -363,7 +359,7 @@ func (r *ReconcileNodePodProbe) updatePodProbeStatus(pod *corev1.Pod, status app
363359
podClone.Annotations[k] = v.(string)
364360
}
365361
}
366-
if reflect.DeepEqual(oldStatus, podClone.Status) && reflect.DeepEqual(oldMetadata.Labels, podClone.Labels) &&
362+
if reflect.DeepEqual(oldStatus.Conditions, podClone.Status.Conditions) && reflect.DeepEqual(oldMetadata.Labels, podClone.Labels) &&
367363
reflect.DeepEqual(oldMetadata.Annotations, podClone.Annotations) {
368364
return nil
369365
}
@@ -372,6 +368,7 @@ func (r *ReconcileNodePodProbe) updatePodProbeStatus(pod *corev1.Pod, status app
372368
klog.Errorf("NodePodProbe patch pod(%s/%s) status failed: %s", podClone.Namespace, podClone.Name, err.Error())
373369
return err
374370
}
375-
klog.V(3).Infof("NodePodProbe update pod(%s/%s) status conditions(%s) success", podClone.Namespace, podClone.Name, util.DumpJSON(probeConditions))
371+
klog.V(3).Infof("NodePodProbe update pod(%s/%s) metadata(%s) conditions(%s) success", podClone.Namespace, podClone.Name,
372+
util.DumpJSON(probeMetadata), util.DumpJSON(probeConditions))
376373
return nil
377374
}

0 commit comments

Comments
 (0)