neonvm: add readiness probe for sysfs scaling
Call the runner's /ready endpoint, which, in the sysfs scaling mode, proxies
to the daemon's /cpu endpoint to check whether the runner pod and the VM are
ready. The runner's /ready endpoint does nothing in the QMP scaling mode.

Move the neonvm-daemon line in the inittab so that it starts right before
vmstart.

Modify the logic in the migration controller to not wait for pod readiness:
neonvm-daemon doesn't start until the migration is finished. De facto, this
doesn't change migration behavior at all, since before this PR we had no
readiness probe.

Signed-off-by: Mikhail Sakhnov <[email protected]>
mikhail-sakhnov committed Feb 3, 2025
1 parent e850ab6 commit 17b03f6
Showing 7 changed files with 112 additions and 4 deletions.
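
For a sense of what the new probe sees, here is a minimal Go sketch — not part
of this commit — that queries the runner's /ready endpoint the way the
kubelet's HTTP probe does. The handler added in this commit writes 200 when
the ready callback succeeds (in sysfs mode, when neonvm-daemon answers its
/cpu endpoint) and 500 otherwise; runnerPodIP and runnerPort below are
hypothetical placeholders for the pod IP and vm.Spec.RunnerPort.

package main

import (
	"fmt"
	"net/http"
	"time"
)

// probeRunnerReady mirrors the kubelet's HTTP readiness check:
// GET /ready against the runner, treating a 2xx status as ready.
func probeRunnerReady(runnerPodIP string, runnerPort int) (bool, error) {
	client := &http.Client{Timeout: time.Second}
	url := fmt.Sprintf("http://%s:%d/ready", runnerPodIP, runnerPort)
	resp, err := client.Get(url)
	if err != nil {
		return false, fmt.Errorf("could not reach runner: %w", err)
	}
	defer resp.Body.Close()
	return resp.StatusCode >= 200 && resp.StatusCode < 300, nil
}

func main() {
	// hypothetical pod IP and runner port
	ready, err := probeRunnerReady("10.100.0.5", 8080)
	if err != nil {
		fmt.Println("probe error:", err)
		return
	}
	fmt.Println("runner ready:", ready)
}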
12 changes: 10 additions & 2 deletions neonvm-runner/cmd/httpserver.go
@@ -19,8 +19,9 @@ import (
 )
 
 type cpuServerCallbacks struct {
-	get func(*zap.Logger) (*vmv1.MilliCPU, error)
-	set func(*zap.Logger, vmv1.MilliCPU) error
+	get   func(*zap.Logger) (*vmv1.MilliCPU, error)
+	set   func(*zap.Logger, vmv1.MilliCPU) error
+	ready func(*zap.Logger) bool
 }
 
 func listenForHTTPRequests(
@@ -42,6 +43,13 @@ func listenForHTTPRequests(
 	mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
 		handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
 	})
+	mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
+		if callbacks.ready(logger) {
+			w.WriteHeader(200)
+		} else {
+			w.WriteHeader(500)
+		}
+	})
 	if networkMonitoring {
 		reg := prometheus.NewRegistry()
 		metrics := NewMonitoringMetrics(reg)
37 changes: 37 additions & 0 deletions neonvm-runner/cmd/main.go
@@ -497,6 +497,19 @@ func runQEMU(
 			lastValue.Store(uint32(cpu))
 			return nil
 		},
+		ready: func(logger *zap.Logger) bool {
+			// if we are in sysfs mode, we need to check if the NeonVM Daemon is ready
+			if cfg.cpuScalingMode == vmv1.CpuScalingModeSysfs {
+				err := checkNeonvmDaemonCPU()
+				if err != nil {
+					logger.Warn("neonvm-daemon ready probe failed", zap.Error(err))
+					return false
+				}
+				return true
+			}
+			// immediately return ready status for QMP mode
+			return true
+		},
 	}
 
 	wg.Add(1)
@@ -753,3 +766,27 @@ func setNeonvmDaemonCPU(cpu vmv1.MilliCPU) error {
 
 	return nil
 }
+
+// checkNeonvmDaemonCPU sends a GET request to the NeonVM Daemon to get the current CPU limit for the sake of readiness probe.
+func checkNeonvmDaemonCPU() error {
+	_, vmIP, _, err := calcIPs(defaultNetworkCIDR)
+	if err != nil {
+		return fmt.Errorf("could not calculate VM IP address: %w", err)
+	}
+	ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
+	defer cancel()
+	url := fmt.Sprintf("http://%s:25183/cpu", vmIP)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return fmt.Errorf("could not build request: %w", err)
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("could not send request: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != 200 {
+		return fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
+	}
+	return nil
+}
1 change: 1 addition & 0 deletions neonvm/samples/vm-example.yaml
@@ -29,6 +29,7 @@ kind: VirtualMachine
 metadata:
   name: example
 spec:
+  cpuScalingMode: SysfsScaling
   guest:
     cpus:
       min: 1
53 changes: 52 additions & 1 deletion pkg/neonvm/controllers/vm_controller.go
@@ -47,6 +47,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/apiserver/pkg/storage/names"
 	"k8s.io/client-go/tools/record"
@@ -864,12 +865,45 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
 	case corev1.PodFailed:
 		return runnerFailed
 	case corev1.PodRunning:
-		return runnerRunning
+		return isRunnerPodReady(pod)
 	default:
 		panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
 	}
 }
 
+// isRunnerPodReady returns whether the runner pod is ready, respecting the readiness probe of its containers.
+func isRunnerPodReady(pod *corev1.Pod) runnerStatusKind {
+	// if the pod has no container statuses, we consider it pending
+	if len(pod.Status.ContainerStatuses) == 0 {
+		return runnerPending
+	}
+
+	// first we check whether the pod is a target pod for a migration;
+	// in that case we don't wait for the readiness probe from
+	// neonvm-daemon, because qemu is started in incoming migration mode
+	for _, containerSpec := range pod.Spec.Containers {
+		if containerSpec.Name == "neonvm-runner" {
+			for _, env := range containerSpec.Env {
+				if env.Name == "RECEIVE_MIGRATION" && env.Value == "true" {
+					return runnerRunning
+				}
+			}
+		}
+	}
+
+	// if the pod is not a target pod for a migration,
+	// we check the readiness probe of the neonvm-runner container
+	for _, c := range pod.Status.ContainerStatuses {
+		// we only care about the neonvm-runner container
+		if c.Name == "neonvm-runner" && !c.Ready {
+			return runnerPending
+		}
+	}
+	return runnerRunning
+}
+
 // deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
 // then emits an event and log line about what it did, whether it actually deleted the runner pod.
 func (r *VMReconciler) deleteRunnerPodIfEnabled(
@@ -1178,6 +1212,10 @@ func imageForVmRunner() (string, error) {
 	return image, nil
 }
 
+// podSpec returns a VirtualMachine Pod object
+// withReadinessProbe - if true, adds a readiness probe to the container
+// we don't need readiness probe for the VM runner pod if it is a target pod
+// for migration because VM is not started until migration is complete
 func podSpec(
 	vm *vmv1.VirtualMachine,
 	sshSecret *corev1.Secret,
@@ -1378,6 +1416,19 @@ func podSpec(
 		},
 	}
 
+	pod.Spec.Containers[0].ReadinessProbe = &corev1.Probe{
+		ProbeHandler: corev1.ProbeHandler{
+			HTTPGet: &corev1.HTTPGetAction{
+				Path:   "/ready",
+				Port:   intstr.FromInt32(vm.Spec.RunnerPort),
+				Scheme: corev1.URISchemeHTTP,
+			},
+		},
+		InitialDelaySeconds: 5,
+		PeriodSeconds:       5,
+		FailureThreshold:    3,
+	}
+
 	if sshSecret != nil {
 		pod.Spec.Containers[0].VolumeMounts = append(pod.Spec.Containers[0].VolumeMounts,
 			corev1.VolumeMount{
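The migration special case in isRunnerPodReady above can be made concrete with
a short test-style sketch — hypothetical, not part of this commit — which
would have to live in the controllers package, since it uses the unexported
runnerStatus and runnerRunning identifiers from the diff:

package controllers

import (
	"testing"

	"github.com/stretchr/testify/assert"
	corev1 "k8s.io/api/core/v1"
)

// Hypothetical sketch: a migration target pod counts as running even
// though its container has not passed the readiness probe yet.
func TestMigrationTargetPodCountsAsRunning(t *testing.T) {
	pod := corev1.Pod{
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name: "neonvm-runner",
				// the migration controller sets this env var on target pods
				Env: []corev1.EnvVar{{Name: "RECEIVE_MIGRATION", Value: "true"}},
			}},
		},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			// not ready: neonvm-daemon only starts after the migration finishes
			ContainerStatuses: []corev1.ContainerStatus{{
				Name:  "neonvm-runner",
				Ready: false,
			}},
		},
	}
	// RECEIVE_MIGRATION=true short-circuits the readiness check
	assert.Equal(t, runnerRunning, runnerStatus(&pod))
}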
10 changes: 10 additions & 0 deletions pkg/neonvm/controllers/vm_controller_unit_test.go
@@ -254,8 +254,18 @@ func TestRunningPod(t *testing.T) {
 	prettyPrint(t, pod)
 
 	pod.Status.Phase = corev1.PodRunning
+	pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
+		Name:  "neonvm-runner",
+		Ready: true,
+	})
 	err = params.client.Status().Update(params.ctx, &pod)
 	require.NoError(t, err)
 	prettyPrint(t, pod)
+	// assert pod is ready
+	assert.True(t, lo.ContainsBy(pod.Status.ContainerStatuses, func(c corev1.ContainerStatus) bool {
+		return c.Name == "neonvm-runner" && c.Ready
+	}))
 
 	assert.Equal(t, runnerRunning, runnerStatus(&pod))
 
 	// Round 2
1 change: 1 addition & 0 deletions pkg/neonvm/controllers/vmmigration_controller.go
@@ -713,6 +713,7 @@ func (r *VirtualMachineMigrationReconciler) targetPodForVirtualMachine(
 	pod.Name = migration.Status.TargetPodName
 
 	// add env variable to turn on migration receiver
+	// TODO: make it false or empty after the migration is done to enable correct readiness probe
 	pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{Name: "RECEIVE_MIGRATION", Value: "true"})
 
 	// add podAntiAffinity to schedule target pod to another k8s node
2 changes: 1 addition & 1 deletion vm-builder/files/inittab
@@ -1,13 +1,13 @@
 ::sysinit:/neonvm/bin/vminit
 ::once:/neonvm/bin/touch /neonvm/vmstart.allowed
-::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
 ::respawn:/neonvm/bin/udhcpc -t 1 -T 1 -A 1 -f -i eth0 -O 121 -O 119 -s /neonvm/bin/udhcpc.script
 ::respawn:/neonvm/bin/udevd
 ::wait:/neonvm/bin/udev-init.sh
 ::respawn:/neonvm/bin/acpid -f -c /neonvm/acpi
 ::respawn:/neonvm/bin/vector -c /neonvm/config/vector.yaml --config-dir /etc/vector --color never
 ::respawn:/neonvm/bin/chronyd -n -f /neonvm/config/chrony.conf -l /var/log/chrony/chrony.log
 ::respawn:/neonvm/bin/sshd -E /var/log/ssh.log -f /neonvm/config/sshd_config
+::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
 ::respawn:/neonvm/bin/vmstart
 {{ range .InittabCommands }}
 ::{{.SysvInitAction}}:su -p {{.CommandUser}} -c {{.ShellEscapedCommand}}
