neonvm: add readiness probe for runner container #1190

Merged
merged 5 commits on Feb 12, 2025
12 changes: 10 additions & 2 deletions neonvm-runner/cmd/httpserver.go
@@ -19,8 +19,9 @@ import (
 )

 type cpuServerCallbacks struct {
-    get func(*zap.Logger) (*vmv1.MilliCPU, error)
-    set func(*zap.Logger, vmv1.MilliCPU) error
+    get   func(*zap.Logger) (*vmv1.MilliCPU, error)
+    set   func(*zap.Logger, vmv1.MilliCPU) error
+    ready func(*zap.Logger) bool
 }

 func listenForHTTPRequests(
@@ -42,6 +43,13 @@ func listenForHTTPRequests(
     mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
         handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
     })
+    mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) {
+        if callbacks.ready(logger) {
+            w.WriteHeader(200)
+        } else {
+            w.WriteHeader(500)
+        }
+    })
     if networkMonitoring {
         reg := prometheus.NewRegistry()
         metrics := NewMonitoringMetrics(reg)
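For context on how the new endpoint behaves end-to-end, here is a minimal, self-contained sketch (not part of the PR) that wires an equivalent /ready handler to a stand-in readiness callback using net/http/httptest; the readyHandler name and the callback are illustrative, not the actual neonvm-runner wiring:

package main

import (
    "fmt"
    "net/http"
    "net/http/httptest"
)

// readyHandler mirrors the handler added above: 200 when the readiness
// callback reports true, 500 otherwise.
func readyHandler(ready func() bool) http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        if ready() {
            w.WriteHeader(200)
        } else {
            w.WriteHeader(500)
        }
    }
}

func main() {
    isReady := false
    srv := httptest.NewServer(readyHandler(func() bool { return isReady }))
    defer srv.Close()

    resp, _ := http.Get(srv.URL + "/ready")
    fmt.Println(resp.StatusCode) // 500: the kubelet would keep the pod not-ready
    resp.Body.Close()

    isReady = true
    resp, _ = http.Get(srv.URL + "/ready")
    fmt.Println(resp.StatusCode) // 200: the readiness probe now passes
    resp.Body.Close()
}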
43 changes: 43 additions & 0 deletions neonvm-runner/cmd/main.go
@@ -497,6 +497,25 @@ func runQEMU(
             lastValue.Store(uint32(cpu))
             return nil
         },
+        ready: func(logger *zap.Logger) bool {
+            switch cfg.cpuScalingMode {
+            case vmv1.CpuScalingModeSysfs:
+                // check if the NeonVM Daemon is ready to accept requests
+                err := checkNeonvmDaemonCPU()
+                if err != nil {
+                    logger.Warn("neonvm-daemon ready probe failed", zap.Error(err))
+                    return false
+                }
+                return true
+            case vmv1.CpuScalingModeQMP:
+                // no readiness check for QMP mode
+                return true
+            default:
+                // explicitly panic on an unknown CPU scaling mode,
+                // in case we add a new mode and forget to update this function
+                panic(fmt.Errorf("unknown CPU scaling mode %s", cfg.cpuScalingMode))
+            }
+        },
     }

     wg.Add(1)
@@ -753,3 +772,27 @@ func setNeonvmDaemonCPU(cpu vmv1.MilliCPU) error {

     return nil
 }
+
+// checkNeonvmDaemonCPU sends a GET request to the NeonVM Daemon for the current CPU limit,
+// for the sake of the readiness probe.
+func checkNeonvmDaemonCPU() error {
+    _, vmIP, _, err := calcIPs(defaultNetworkCIDR)
+    if err != nil {
+        return fmt.Errorf("could not calculate VM IP address: %w", err)
+    }
+    ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
+    defer cancel()
+    url := fmt.Sprintf("http://%s:25183/cpu", vmIP)
+    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+    if err != nil {
+        return fmt.Errorf("could not build request: %w", err)
+    }
+    resp, err := http.DefaultClient.Do(req)
+    if err != nil {
+        return fmt.Errorf("could not send request: %w", err)
+    }
+    defer resp.Body.Close()
+    if resp.StatusCode != 200 {
+        return fmt.Errorf("neonvm-daemon responded with status %d", resp.StatusCode)
+    }
+    return nil
+}
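As a sketch of the protocol this check relies on (GET the daemon's /cpu endpoint and treat anything other than a 200 as not ready), the snippet below stubs neonvm-daemon with httptest. checkDaemonCPU is a hypothetical, URL-parameterized variant of checkNeonvmDaemonCPU above; the real function derives the VM IP via calcIPs and hard-codes port 25183, and the stub's response body is arbitrary because the probe only looks at the status code:

package main

import (
    "context"
    "fmt"
    "net/http"
    "net/http/httptest"
    "time"
)

// checkDaemonCPU is a URL-parameterized variant of checkNeonvmDaemonCPU:
// any response other than 200 from GET <base>/cpu counts as "not ready".
func checkDaemonCPU(baseURL string) error {
    ctx, cancel := context.WithTimeout(context.Background(), time.Second)
    defer cancel()
    req, err := http.NewRequestWithContext(ctx, http.MethodGet, baseURL+"/cpu", nil)
    if err != nil {
        return fmt.Errorf("could not build request: %w", err)
    }
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return fmt.Errorf("could not send request: %w", err)
    }
    defer resp.Body.Close()
    if resp.StatusCode != 200 {
        return fmt.Errorf("daemon responded with status %d", resp.StatusCode)
    }
    return nil
}

func main() {
    // stand-in for neonvm-daemon: serves something on /cpu;
    // only the 200 status matters for the readiness check
    daemon := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprint(w, "1000")
    }))
    defer daemon.Close()

    fmt.Println(checkDaemonCPU(daemon.URL)) // <nil>: the runner would report ready
}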
1 change: 1 addition & 0 deletions neonvm/samples/vm-example.yaml
@@ -29,6 +29,7 @@ kind: VirtualMachine
 metadata:
   name: example
 spec:
+  cpuScalingMode: SysfsScaling
   guest:
     cpus:
       min: 1
48 changes: 46 additions & 2 deletions pkg/neonvm/controllers/vm_controller.go
@@ -46,6 +46,7 @@ import (
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/apimachinery/pkg/types"
+    "k8s.io/apimachinery/pkg/util/intstr"
     "k8s.io/apiserver/pkg/storage/names"
     "k8s.io/client-go/tools/record"

@@ -641,7 +642,6 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine)
                 "Memory in spec", memorySizeFromSpec)
             vm.Status.Phase = vmv1.VmScaling
         }
-
     case runnerSucceeded:
         vm.Status.Phase = vmv1.VmSucceeded
         meta.SetStatusCondition(&vm.Status.Conditions,
@@ -863,6 +863,7 @@ const (
 //
 // This is *similar* to the value of pod.Status.Phase, but we'd like to retain our own abstraction
 // to have more control over the semantics.
+// We handle the PodRunning phase differently while a VM migration is in progress.
 func runnerStatus(pod *corev1.Pod) runnerStatusKind {
     // Add 5 seconds to account for clock skew and k8s lagging behind.
     deadline := metav1.NewTime(metav1.Now().Add(-5 * time.Second))
@@ -880,12 +881,43 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
     case corev1.PodFailed:
         return runnerFailed
     case corev1.PodRunning:
-        return runnerRunning
+        return runnerContainerStatus(pod)
     default:
         panic(fmt.Errorf("unknown pod phase: %q", pod.Status.Phase))
     }
 }

+const (
+    runnerContainerName = "neonvm-runner"
+)
+
+// runnerContainerStatus returns the status of the runner container.
+func runnerContainerStatus(pod *corev1.Pod) runnerStatusKind {
+    // if the pod has no container statuses, we consider it pending
+    if len(pod.Status.ContainerStatuses) == 0 {
+        return runnerPending
+    }
+    _, role, ownedByMigration := vmv1.MigrationOwnerForPod(pod)
+
+    // if this is the target pod of a migration, we consider it running,
+    // because qemu is started in incoming-migration mode and neonvm-daemon,
+    // which backs the readiness probe, is not available yet
+    if ownedByMigration && role == vmv1.MigrationRoleTarget {
+        return runnerRunning
+    }
+
+    // normal case: the pod is not owned by a migration,
+    // so we check the readiness of the neonvm-runner container
+    for _, c := range pod.Status.ContainerStatuses {
+        // we only care about the neonvm-runner container
+        if c.Name == runnerContainerName && !c.Ready {
+            return runnerPending
+        }
+    }
+
+    return runnerRunning
+}
+
 // deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
 // then emits an event and log line about what it did, whether it actually deleted the runner pod.
 func (r *VMReconciler) deleteRunnerPodIfEnabled(
@@ -1368,6 +1400,18 @@ func podSpec(
                 }
             }(),
             Resources: vm.Spec.PodResources,
+            ReadinessProbe: &corev1.Probe{
+                ProbeHandler: corev1.ProbeHandler{
+                    HTTPGet: &corev1.HTTPGetAction{
+                        Path:   "/ready",
+                        Port:   intstr.FromInt32(vm.Spec.RunnerPort),
+                        Scheme: corev1.URISchemeHTTP,
+                    },
+                },
+                InitialDelaySeconds: 5,
+                PeriodSeconds:       5,
+                FailureThreshold:    3,
+            },
         }

         return []corev1.Container{runner}
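To make the controller-side behavior concrete, here is a hedged, self-contained sketch of the readiness check that runnerContainerStatus performs for pods not owned by a migration; runnerPodReady is a hypothetical helper mirroring that branch (the migration-target special case via vmv1.MigrationOwnerForPod is omitted):

package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
)

const runnerContainerName = "neonvm-runner"

// runnerPodReady mirrors the non-migration branch of runnerContainerStatus:
// a running pod only counts as "running" once the neonvm-runner container
// passes its readiness probe.
func runnerPodReady(pod *corev1.Pod) bool {
    if len(pod.Status.ContainerStatuses) == 0 {
        return false // no container statuses yet: still pending
    }
    for _, c := range pod.Status.ContainerStatuses {
        if c.Name == runnerContainerName && !c.Ready {
            return false // runner container exists but is not ready
        }
    }
    return true
}

func main() {
    pod := &corev1.Pod{}
    fmt.Println(runnerPodReady(pod)) // false: no container statuses

    pod.Status.ContainerStatuses = []corev1.ContainerStatus{{Name: runnerContainerName, Ready: false}}
    fmt.Println(runnerPodReady(pod)) // false: runner container not ready

    pod.Status.ContainerStatuses[0].Ready = true
    fmt.Println(runnerPodReady(pod)) // true: readiness probe has passed
}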
10 changes: 10 additions & 0 deletions pkg/neonvm/controllers/vm_controller_unit_test.go
@@ -255,8 +255,18 @@ func TestRunningPod(t *testing.T) {
     prettyPrint(t, pod)

     pod.Status.Phase = corev1.PodRunning
+    pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
+        Name:  "neonvm-runner",
+        Ready: true,
+    })
     err = params.client.Status().Update(params.ctx, &pod)
     require.NoError(t, err)
     prettyPrint(t, pod)
+    // assert pod is ready
+    assert.True(t, lo.ContainsBy(pod.Status.ContainerStatuses, func(c corev1.ContainerStatus) bool {
+        return c.Name == "neonvm-runner" && c.Ready
+    }))

     assert.Equal(t, runnerRunning, runnerStatus(&pod))

     // Round 2
1 change: 1 addition & 0 deletions pkg/neonvm/controllers/vmmigration_controller.go
@@ -713,6 +713,7 @@ func (r *VirtualMachineMigrationReconciler) targetPodForVirtualMachine(
     pod.Name = migration.Status.TargetPodName

     // add env variable to turn on migration receiver
+    // TODO: set it to false or empty once the migration is done, so the readiness probe behaves correctly
     pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{Name: "RECEIVE_MIGRATION", Value: "true"})

     // add podAntiAffinity to schedule target pod to another k8s node
2 changes: 1 addition & 1 deletion vm-builder/files/inittab
@@ -1,13 +1,13 @@
 ::sysinit:/neonvm/bin/vminit
 ::once:/neonvm/bin/touch /neonvm/vmstart.allowed
-::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
 ::respawn:/neonvm/bin/udhcpc -t 1 -T 1 -A 1 -f -i eth0 -O 121 -O 119 -s /neonvm/bin/udhcpc.script
 ::respawn:/neonvm/bin/udevd
 ::wait:/neonvm/bin/udev-init.sh
 ::respawn:/neonvm/bin/acpid -f -c /neonvm/acpi
 ::respawn:/neonvm/bin/vector -c /neonvm/config/vector.yaml --config-dir /etc/vector --color never
 ::respawn:/neonvm/bin/chronyd -n -f /neonvm/config/chrony.conf -l /var/log/chrony/chrony.log
 ::respawn:/neonvm/bin/sshd -E /var/log/ssh.log -f /neonvm/config/sshd_config
+::respawn:/neonvm/bin/neonvmd --addr=0.0.0.0:25183
 ::respawn:/neonvm/bin/vmstart
 {{ range .InittabCommands }}
 ::{{.SysvInitAction}}:su -p {{.CommandUser}} -c {{.ShellEscapedCommand}}