vm-runner: external traffic Prometheus metrics endpoint #3836

Workflow file for this run

.github/workflows/e2e-test.yaml at 1004975

	# End-to-end test workflow.
	#
	# Triggers:
	#   - every pull request
	#   - pushes to `main`
	#   - manual dispatch (optionally choosing a prebuilt kernel image and the cluster type)
	#   - calls from other workflows (optionally reusing already-built images via `tag`)
	name: e2e-test
	on:
	  pull_request:
	  push:
	    branches:
	      - main
	  workflow_dispatch:
	    inputs:
	      kernel-image:
	        type: string
	        description: 'The kernel image to use for the VMs. If not specified, a kernel will be built from source'
	        required: false
	      cluster:
	        type: choice
	        description: 'The cluster to run the tests on'
	        options:
	          - k3d
	          - kind
	        default: k3d
	  workflow_call:
	    inputs:
	      # When set, the build jobs are skipped and images with this tag are used instead.
	      tag:
	        type: string
	        description: 'Tag to use for images, skipping building'
	        required: false
	      push-yamls:
	        type: boolean
	        description: 'If true, pushes a tarball containing the rendered yaml manifests as an artifact'
	        required: false

	# Workflow-wide environment.
	#
	# IMG_E2E_TEST is the fixed local tag that the "load e2e test vm image" step re-tags the freshly
	# built test VM image to, before loading it into the cluster.
	# NOTE(review): the image re-tagged to this name is read from the `vm-postgres-16-bullseye`
	# output of build-test-vm; confirm that the `15` in this tag is intentional.
	env:
	  IMG_E2E_TEST: vm-postgres:15-bullseye

	# Every `run:` step uses bash with errexit (-e), nounset (-u) and pipefail enabled, so that a
	# failing command, an unset variable, or a failure inside a pipeline fails the step.
	defaults:
	  run:
	    shell: bash -euo pipefail {0}

	jobs:
	  # Resolves the image tag used by all later jobs: `inputs.tag` when provided, otherwise
	  # "<first 7 characters of the commit SHA>.<run id>".
	  get-tag:
	    outputs:
	      tag: ${{ inputs.tag || steps.get-tag.outputs.tag }}
	    runs-on: ubuntu-latest
	    steps:
	      - name: get tag
	        # Only compute a tag when the caller did not supply one.
	        if: ${{ inputs.tag == '' }}
	        id: get-tag
	        env:
	          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
	        run: |
	          test -n "$SHA"
	          sha="${SHA::7}"
	          echo "tag=$sha.$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT

	  # Builds the component images via the reusable build-images workflow (skipped when a tag was
	  # supplied by the caller).
	  build-images:
	    needs: get-tag
	    uses: ./.github/workflows/build-images.yaml
	    with:
	      skip: ${{ inputs.tag != '' }}
	      tag: ${{ inputs.tag || needs.get-tag.outputs.tag }}
	      kernel-image: ${{ inputs.kernel-image }}
	      # note: setting to preserve runner pods will mean that if !skip, they'll be built with those
	      # settings and used properly in the tests. But if skip (because inputs.tag != ''), then this
	      # setting will have no effect and the release images will be normal.
	      controller-preserve-runner-pods: true
	    secrets: inherit

	  # Builds the test VM image via the reusable build-test-vm workflow (skipped when a tag was
	  # supplied by the caller).
	  build-test-vm:
	    needs: get-tag
	    uses: ./.github/workflows/build-test-vm.yaml
	    with:
	      skip: ${{ inputs.tag != '' }}
	      tag: ${{ inputs.tag || needs.get-tag.outputs.tag }}
	    secrets: inherit

	  # Creates a local cluster, deploys the rendered manifests, runs the e2e suite, then collects
	  # logs/events and tears the cluster down.
	  e2e-tests:
	    needs: [ build-images, build-test-vm ]
	    strategy:
	      fail-fast: false
	      matrix:
	        # Single-entry matrix: the cluster chosen on manual dispatch, defaulting to k3d.
	        cluster:
	          - ${{ inputs.cluster || 'k3d' }}
	    runs-on: [ self-hosted, gen3, large ]
	    steps:
	      - uses: actions/checkout@v4
	        with:
	          fetch-depth: 0 # fetch all, so that we also include tags

	      - uses: actions/setup-go@v5
	        with:
	          go-version-file: 'go.mod'
	          # Disable cache on self-hosted runners to avoid /usr/bin/tar errors, see https://github.com/actions/setup-go/issues/403
	          cache: false
	        # Sometimes setup-go gets stuck. Without this, it'll keep going until the job gets killed
	        timeout-minutes: 10

	      - name: Install dependencies
	        run: |
	          sudo apt install -y python3-venv
	          make e2e-tools
	          # Make the tools under ./bin available to all later steps.
	          echo $(pwd)/bin >> $GITHUB_PATH

	      # Prints the version of each required tool; fails the job early if one is missing.
	      - name: Check dependencies
	        run: |
	          kubectl version --client --output=yaml
	          k3d version
	          kind version
	          kuttl version
	          docker version

	      # Renders the release manifests, pointing them at the images produced by build-images.
	      - run: make render-release
	        env:
	          IMG_CONTROLLER: ${{ needs.build-images.outputs.controller }}
	          IMG_VXLAN_CONTROLLER: ${{ needs.build-images.outputs.vxlan-controller }}
	          IMG_RUNNER: ${{ needs.build-images.outputs.runner }}
	          IMG_SCHEDULER: ${{ needs.build-images.outputs.scheduler }}
	          IMG_AUTOSCALER_AGENT: ${{ needs.build-images.outputs.autoscaler-agent }}

	      - name: upload manifests
	        # nb: use format(..) to catch both inputs.push-yamls = true AND inputs.push-yamls = 'true'.
	        if: ${{ format('{0}', inputs.push-yamls) == 'true' }}
	        uses: actions/upload-artifact@v4
	        with:
	          name: rendered_manifests
	          # nb: prefix before wildcard is removed from the uploaded files, so the artifact should
	          # contain e.g.
	          #  - autoscale-scheduler.yaml
	          #  - autoscaler-agent.yaml
	          #  ...
	          # ref https://github.com/actions/upload-artifact#upload-using-multiple-paths-and-exclusions
	          path: rendered_manifests/*
	          if-no-files-found: error
	          retention-days: 2 # minimum is 1 day; 0 is default. These are only used temporarily.

	      - name: set custom docker config directory
	        uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193

	      - uses: docker/login-action@v3
	        with:
	          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
	          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

	      # https://docs.k3s.io/installation/private-registry#registries-configuration-file
	      # https://github.com/neondatabase/autoscaling/issues/975
	      - name: set k3d registries.yaml
	        # TODO: Implement an equivalent for kind?
	        # Relevant docs seem to be here: https://kind.sigs.k8s.io/docs/user/private-registries
	        if: ${{ matrix.cluster == 'k3d' }}
	        env:
	          DOCKERHUB_USERNAME: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
	          DOCKERHUB_PASSWORD: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
	        run: |
	          # Appends Docker Hub credentials to the registries file. The leading spaces inside the
	          # strings are significant: they produce the nested
	          # configs -> <registry> -> auth -> username/password structure in the emitted YAML.
	          {
	            echo "configs:"
	            echo "  registry-1.docker.io:"
	            echo "    auth:"
	            echo "      username: $DOCKERHUB_USERNAME"
	            echo "      password: $DOCKERHUB_PASSWORD"
	          } >> $(pwd)/k3d/registries.yaml

	      # Creates the cluster for the selected matrix entry (e.g. `make k3d-setup`).
	      - run: make ${{ matrix.cluster }}-setup
	        env:
	          USE_REGISTRIES_FILE: true

	      - name: deploy components
	        timeout-minutes: 3
	        run: |
	          # Maps a manifest file name to its path in the rendered output directory.
	          rendered () { echo "rendered_manifests/$1"; }

	          # Apply each manifest in order, waiting for its rollout before moving to the next.
	          kubectl apply -f $(rendered multus.yaml)
	          kubectl -n kube-system rollout status daemonset kube-multus-ds
	          kubectl apply -f $(rendered whereabouts.yaml)
	          kubectl -n kube-system rollout status daemonset whereabouts
	          kubectl apply -f $(rendered neonvm-runner-image-loader.yaml)
	          kubectl -n neonvm-system rollout status daemonset neonvm-runner-image-loader
	          kubectl apply -f $(rendered neonvm.yaml)
	          kubectl -n neonvm-system rollout status daemonset neonvm-device-plugin
	          kubectl apply -f $(rendered neonvm-controller.yaml)
	          kubectl -n neonvm-system rollout status deployment neonvm-controller
	          kubectl apply -f $(rendered neonvm-vxlan-controller.yaml)
	          kubectl -n neonvm-system rollout status daemonset neonvm-vxlan-controller
	          kubectl apply -f $(rendered autoscale-scheduler.yaml)
	          kubectl -n kube-system rollout status deployment autoscale-scheduler
	          kubectl apply -f $(rendered autoscaler-agent.yaml)
	          kubectl -n kube-system rollout status daemonset autoscaler-agent

	      - name: load e2e test vm image
	        env:
	          TEST_IMAGE: ${{ needs.build-test-vm.outputs.vm-postgres-16-bullseye }}
	        timeout-minutes: 2
	        run: |
	          # Pull the docker image so we can re-tag it, because using a consistent tag inside the
	          # cluster means we can avoid dynamically editing the image used in the kuttl files.
	          docker pull "$TEST_IMAGE"
	          docker image tag "$TEST_IMAGE" "$IMG_E2E_TEST"
	          make load-example-vms

	      - run: make e2e
	        timeout-minutes: 15

	      # Dumps logs and events for every pod in every namespace. Output always goes to the step
	      # log; for the `neonvm-system` and `kuttl-test-*` namespaces it is additionally appended
	      # to the job summary as collapsible HTML sections.
	      - name: Get k8s logs and events
	        if: always()
	        run: |
	          if ! kubectl config current-context; then
	            echo "skipping cluster logs because no cluster found in kubectl context"
	            exit 0
	          fi

	          namespaces=$(kubectl get namespaces -o jsonpath='{.items[*].metadata.name}')
	          for namespace in $namespaces; do
	            # Only the namespaces of interest are written to the job summary.
	            if [[ "$namespace" == "neonvm-system" ]] || [[ "$namespace" == kuttl-test-* ]]; then
	              tee_if_needed=$GITHUB_STEP_SUMMARY
	            else
	              tee_if_needed=/dev/null
	            fi

	            {
	              echo "<details>"
	              echo "<summary>Namespace=$namespace</summary>"
	            } | tee -a $tee_if_needed

	            pods=$(kubectl get pods -n $namespace -o jsonpath='{.items[*].metadata.name}')
	            for pod in $pods; do
	              {
	                echo "<details>"
	                echo "<summary>- Namespace=$namespace Pod=$pod Logs</summary>"
	                echo "<pre>"
	              } | tee -a $tee_if_needed

	              restarts=$(
	                kubectl get pod -n $namespace $pod -o jsonpath='{.status.containerStatuses[0].restartCount}' || echo '0'
	              )
	              # The jsonpath query can succeed with empty output (no container statuses yet);
	              # treat that as zero restarts so the numeric comparison below stays valid.
	              restarts="${restarts:-0}"
	              {
	                if [ "$restarts" -ne 0 ]; then
	                  echo "CONTAINER RESTARTED $restarts TIME(S)"
	                  echo "Previous logs:"
	                  kubectl logs -n $namespace -p $pod || echo 'Error getting logs'
	                  echo "Current logs:"
	                  kubectl logs -n $namespace $pod || echo 'Error getting logs'
	                else
	                  echo "Logs:"
	                  kubectl logs -n $namespace $pod || echo 'Error getting logs'
	                fi
	              } | tee -a $tee_if_needed
	              {
	                echo "</pre>"
	                echo "</details>"
	              } | tee -a $tee_if_needed

	              {
	                echo "<details>"
	                echo "<summary>- Namespace=$namespace Pod=$pod Events</summary>"
	                echo "<pre>"
	              } | tee -a $tee_if_needed

	              (kubectl get events --namespace $namespace --field-selector involvedObject.name=$pod || echo 'Error getting events') | tee -a $tee_if_needed

	              # Close exactly the one <pre> and one <details> opened for the events section.
	              {
	                echo "</pre>"
	                echo "</details>"
	              } | tee -a $tee_if_needed
	            done

	            echo "</details>" | tee -a $tee_if_needed
	          done

	      # Always tear the cluster down, even when earlier steps failed.
	      - name: Cleanup
	        if: always()
	        run: make ${{ matrix.cluster }}-destroy

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

vm-runner: external traffic Prometheus metrics endpoint #3836

Workflow file

vm-runner: external traffic Prometheus metrics endpoint #3836

Jobs

Run details

Workflow file for this run