From 02f31e0dee82b5e67944a44b7be084c7a1d470d9 Mon Sep 17 00:00:00 2001
From: Mark Laing
Date: Wed, 21 Aug 2024 17:18:41 +0100
Subject: [PATCH] WIP

---
 .github/workflows/tests.yml | 119 ++++++------------------------
 tests/devlxd-container      |   3 +-
 tests/vm-migration          | 107 ++++++++++++++++++++------------
 3 files changed, 88 insertions(+), 141 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 904efbc8a..1dc842c81 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -83,103 +83,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: ${{ fromJSON(inputs.ubuntu-releases || '["20.04", "22.04", "24.04"]') }}
-        track: ${{ fromJSON(inputs.snap-tracks || '["latest/edge", "5.21/edge", "5.0/edge", "4.0/edge"]') }}
+        os: ${{ fromJSON(inputs.ubuntu-releases || '["24.04"]') }}
+        track: ${{ fromJSON(inputs.snap-tracks || '["latest/edge"]') }}
         test:
-          - cgroup
-          - cluster
-          - container
-          - container-copy
-          - conversion
-          - cpu-vm
-          - devlxd-vm
-          - devlxd-container
-          - docker
-          - efi-vars-editor-vm
-          - interception
-          - pylxd
-          - network-bridge-firewall
-          - network-ovn
-          - network-routed
-          - snapd
-          - storage-buckets
-          - storage-disks-vm
-          - "storage-vm dir"
-          - "storage-vm btrfs"
-          - "storage-vm ceph"
-          - "storage-vm lvm"
-          - "storage-vm lvm-thin"
-          - "storage-vm zfs"
-          - storage-volumes-vm
-          - tpm-vm
-          - vm-nesting
           - vm-migration
-        include:
-          - test: qemu-external-vm
-            track: "latest/edge"
-            os: "24.04"
-        exclude:
-          # not compatible with 4.0/*
-          - test: container-copy
-            track: "4.0/edge"
-          - test: conversion
-            track: "4.0/edge"
-          - test: cpu-vm
-            track: "4.0/edge"
-          - test: devlxd-vm
-            track: "4.0/edge"
-          - test: efi-vars-editor-vm
-            track: "4.0/edge"
-          - test: network-bridge-firewall
-            os: 20.04
-            track: "4.0/edge"
-          - test: network-ovn
-            track: "4.0/edge"
-          # https://github.com/canonical/pylxd/issues/590
-          - test: pylxd
-            track: "4.0/edge"
-          - test: storage-buckets
-            track: "4.0/edge"
-          - test: storage-disks-vm
-            track: "4.0/edge"
-          - test: "storage-vm dir"
-            track: "4.0/edge"
-          - test: "storage-vm btrfs"
-            track: "4.0/edge"
-          - test: "storage-vm ceph"
-            track: "4.0/edge"
-          - test: "storage-vm lvm"
-            track: "4.0/edge"
-          - test: "storage-vm lvm-thin"
-            track: "4.0/edge"
-          - test: "storage-vm zfs"
-            track: "4.0/edge"
-          - test: storage-volumes-vm
-            track: "4.0/edge"
-          - test: tpm-vm
-            track: "4.0/edge"
-          # not compatible with 5.0/*
-          - test: efi-vars-editor-vm # not compatible with 5.0/*
-            track: "5.0/edge"
-          # waiting for integration with microceph
-          - test: "storage-vm ceph"
-          # skip track/os combinaisons that are too far appart
-          - track: "4.0/edge"
-            os: "24.04"
-          - track: "5.0/edge"
-            os: "24.04"
-          - track: "5.0/edge"
-            os: "20.04"
-          - track: "5.21/edge"
-            os: "20.04"
-          - track: "latest/edge"
-            os: "20.04"
-          - track: "latest/edge"
-            os: "22.04"
-          - test: "vm-migration"
-            track: "4.0/edge"
-          - test: "vm-migration"
-            track: "5.0/edge"
+          - devlxd-container
 
     steps:
       - name: Performance tuning
@@ -195,7 +103,7 @@ jobs:
           echo "force-unsafe-io" | sudo tee /etc/dpkg/dpkg.cfg.d/force-unsafe-io
 
       - name: Reclaim some space (storage tests only)
-        if: ${{ startsWith(matrix.test, 'storage') || matrix.test == 'vm-nesting' || matrix.test == 'conversion' }}
+        if: ${{ startsWith(matrix.test, 'storage') || matrix.test == 'vm-nesting' || matrix.test == 'conversion' || matrix.test == 'vm-migration' }}
         run: |
           set -eux
           df -h
@@ -225,6 +133,16 @@ jobs:
           sudo rm -rf /opt/ghc
           df -h
 
+      - name: Reclaim some memory (VM migration tests only)
+        if: ${{ matrix.test == 'vm-migration' }}
+        run: |
+          set -eux
+
+          free -mt
+          sudo systemctl stop dpkg-db-backup.timer e2scrub_all.timer fstrim.timer logrotate.timer man-db.timer motd-news.timer phpsessionclean.timer update-notifier-download.timer update-notifier-motd.timer
+          sudo systemctl stop iscsid.socket multipathd.socket
+          free -mt
+
       - name: Remove docker
         run: |
           set -eux
@@ -255,10 +173,6 @@ jobs:
         run: |
           set -eux
 
-          # XXX: prevent accidental usage of `images:` in CI test jobs.
-          # All tests should be done using officially supported images.
-          echo '127.0.0.1 images.lxd.canonical.com' | sudo tee /etc/hosts
-
           TEST_SCRIPT="$(echo ${{ matrix.test }} | cut -d " " -f 1)"
           EXTRA_ARGS="$(echo ${{ matrix.test }} | cut -d " " -f 2- --only-delimited)"
           if [ "${TEST_SCRIPT}" = "cluster" ]; then
@@ -266,7 +180,10 @@ jobs:
            src_track="$(echo "${dst_track}" | cut -d/ -f1)/stable"
            EXTRA_ARGS="${EXTRA_ARGS:-3} ${src_track} ${{ matrix.track }}"
           fi
-          sudo --preserve-env=PURGE_LXD,TEST_IMG ./bin/local-run "tests/${TEST_SCRIPT}" ${{ matrix.track }} ${EXTRA_ARGS:-}
+          sudo --preserve-env=PURGE_LXD,TEST_IMG,GITHUB_ACTIONS ./bin/local-run "tests/${TEST_SCRIPT}" ${{ matrix.track }} ${EXTRA_ARGS:-}
+
+      - name: Setup tmate session
+        uses: mxschmitt/action-tmate@v3
 
       # always update cache as we have our own logic of
       # cache invalidation and updates in addition to a date check
diff --git a/tests/devlxd-container b/tests/devlxd-container
index bb0d57afd..5aab8a24f 100755
--- a/tests/devlxd-container
+++ b/tests/devlxd-container
@@ -83,7 +83,8 @@ if hasNeededAPIExtension instance_ready_state; then
   [ "$(lxc config get c1 volatile.last_state.ready)" = "false" ]
 
   lxc start c1
-  waitInstanceBooted c1
+  waitInstanceReady c1
+  lxc exec c1 -- systemctl start snapd # This will wait for snapd to start if it hasn't already.
 else
   echo "Skipping instance Ready state tests, not supported"
 fi
diff --git a/tests/vm-migration b/tests/vm-migration
index 7bdd40aec..c3655f7c2 100644
--- a/tests/vm-migration
+++ b/tests/vm-migration
@@ -12,7 +12,7 @@ lxc network create lxdbr0
 lxc profile device add default eth0 nic network=lxdbr0
 
 poolName="ctpool$$"
-poolDriver=dir
+poolDriver=zfs
 
 echo "==> Create storage pool using driver ${poolDriver}"
 lxc storage create "${poolName}" "${poolDriver}"
@@ -20,8 +20,32 @@ lxc profile device add default root disk path="/" pool="${poolName}"
 
 # Create ceph node
 lxc init "${TEST_IMG:-ubuntu-minimal-daily:24.04}" ceph --vm -c limits.cpu=2 -c limits.memory=4GiB
-lxc storage volume create "${poolName}" ceph-disk size=20GiB --type=block
-lxc config device add ceph ceph-disk disk pool="${poolName}" source=ceph-disk
+if [ -n "${GITHUB_ACTIONS:-}" ]; then
+  # If the rootfs and the ephemeral partition are on the same physical disk, giving the whole
+  # disk to microceph would wipe our rootfs. Since it is pretty rare for GitHub Actions
+  # runners to have a single disk, we immediately bail rather than trying to gracefully
+  # handle it. Once snapd releases with https://github.com/snapcore/snapd/pull/13150,
+  # we will be able to stop worrying about that special case.
+  if [ "$(stat -c '%d' /)" = "$(stat -c '%d' /mnt)" ]; then
+    echo "FAIL: rootfs and ephemeral part on the same disk, aborting"
+    exit 1
+  fi
+
+  # Free up the ephemeral disk to use it as ceph OSD.
+  # https://github.com/canonical/microceph/issues/288 and https://github.com/canonical/microceph/issues/289
+  swapoff /mnt/swapfile
+  ephemeral_disk="$(findmnt --noheadings --output SOURCE --target /mnt | sed 's/[0-9]\+$//')"
+  umount /mnt
+
+  lxc config device add ceph ceph-disk disk source="${ephemeral_disk}" path=/dev/sdb
+else
+  lxc storage volume create "${poolName}" ceph-disk size=20GiB --type=block
+  lxc config device add ceph ceph-disk disk pool="${poolName}" source=ceph-disk
+fi
+
+
+# Disable vGPU to save RAM
+lxc config set ceph raw.qemu.conf='[device "qemu_gpu"]'
 lxc start ceph
 
 # Wait for snap in ceph instance.
@@ -40,7 +64,7 @@ lxc exec ceph -- microceph.ceph osd crush rule create-replicated replicated defa
 for flag in nosnaptrim noscrub nobackfill norebalance norecover noscrub nodeep-scrub; do
   lxc exec ceph -- microceph.ceph osd set "${flag}"
 done
-lxc exec ceph -- microceph disk add /dev/sdb
+lxc exec ceph -- microceph disk add /dev/sdb --wipe
 lxc exec ceph -- microceph.ceph osd pool create cephfs_meta 32
 lxc exec ceph -- microceph.ceph osd pool create cephfs_data 32
 lxc exec ceph -- microceph.ceph fs new cephfs cephfs_meta cephfs_data
@@ -53,22 +77,26 @@ for _ in $(seq 60); do
   fi
 done
 
-# Launch two instances for our LXD cluster and wait for them to be ready. If the host supports `devlxd_images_vm` then
-# set `security.devlxd.images=true` so that we don't have to download the image again.
+# Initialise two instances for our LXD cluster. If the host supports `devlxd_images_vm` then set
+# `security.devlxd.images=true` so that we don't have to download the image again.
 if hasNeededAPIExtension devlxd_images_vm; then
-  lxc launch "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member1 --vm -c limits.memory=2GiB -c security.devlxd.images=true
-  lxc launch "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member2 --vm -c limits.memory=2GiB -c security.devlxd.images=true
+  lxc init "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member1 --vm -c limits.cpu=2 -c limits.memory=4GiB -c security.devlxd.images=true
+  lxc init "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member2 --vm -c limits.cpu=2 -c limits.memory=4GiB -c security.devlxd.images=true
 else
-  lxc launch "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member1 --vm -c limits.memory=2GiB
-  lxc launch "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member2 --vm -c limits.memory=2GiB
+  lxc init "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member1 --vm -c limits.cpu=2 -c limits.memory=4GiB
+  lxc init "${TEST_IMG:-ubuntu-minimal-daily:24.04}" member2 --vm -c limits.cpu=2 -c limits.memory=4GiB
 fi
 
+# Disable vGPU to save RAM
+lxc config set member1 raw.qemu.conf='[device "qemu_gpu"]'
+lxc config set member2 raw.qemu.conf='[device "qemu_gpu"]'
+
+# Start the instances and wait for member1 to be ready.
+lxc start member1
+lxc start member2
 waitInstanceReady member1
-waitInstanceReady member2
 # shellcheck disable=SC3044 # Ignore "declare is undefined" shellcheck error.
 lxc exec member1 -- sh -c "$(declare -f waitSnapdSeed); waitSnapdSeed"
-# shellcheck disable=SC3044 # Ignore "declare is undefined" shellcheck error.
-lxc exec member2 -- sh -c "$(declare -f waitSnapdSeed); waitSnapdSeed"
 
 # Install LXD in the first member.
 lxc exec member1 -- snap remove --purge lxd || true
@@ -91,6 +119,11 @@ lxc exec member1 -- lxc config set core.https_address="${member1Address}:8443"
 lxc exec member1 -- lxc cluster enable member1
 joinToken="$(lxc exec member1 -- lxc cluster add member2 --quiet)"
 
+# Ensure member2 is ready.
+waitInstanceReady member2
+# shellcheck disable=SC3044 # Ignore "declare is undefined" shellcheck error.
+lxc exec member2 -- sh -c "$(declare -f waitSnapdSeed); waitSnapdSeed"
+
 # Install LXD on the second member.
 lxc exec member2 -- snap remove --purge lxd || true
 lxc exec member2 -- snap install lxd --channel="${LXD_SNAP_CHANNEL}"
@@ -136,35 +169,31 @@ lxc exec member1 -- lxc storage create ceph ceph
 lxc exec member1 -- lxc storage volume create ceph vol1 --type=block size=500MiB
 
 # Create a VM in the cluster, on member1.
-lxc exec member1 -- lxc init "${TEST_IMG:-ubuntu-minimal-daily:24.04}" v1 --vm --storage ceph --target member1 -c migration.stateful=true -c limits.memory=1GiB
+lxc exec member1 -- lxc init images:alpine/3.20 v1 --vm --storage ceph --target member1 -c migration.stateful=true -c limits.cpu=1 -c limits.memory=1GiB -c security.secureboot=false
 
 # Add vol1 as a disk device to the VM.
 lxc exec member1 -- lxc config device add v1 vol1-disk disk pool=ceph source=vol1
 
-# Start the VM.
-lxc exec member1 -- lxc start v1
-
-# Wait for a long time for it to boot (doubly nested VM takes a while).
-while [ "$(lxc exec member1 -- lxc info v1 | awk '{if ($1 == "Processes:") print $2}')" -le 1 ]; do
-  sleep 30
-done
-
-# vol1 should be available as /dev/sdb. Format it as ext4. Then mount it and create a file.
-lxc exec member1 -- lxc exec v1 -- mkfs -t ext4 /dev/sdb
-lxc exec member1 -- lxc exec v1 -- mkdir /mnt/vol1
-lxc exec member1 -- lxc exec v1 -- mount -t ext4 /dev/sdb /mnt/vol1
-lxc exec member1 -- lxc exec v1 -- cp /etc/hostname /mnt/vol1/bar
-
-# Move the instance
-lxc exec member1 -- lxc move v1 --target member2
-
-# The VM is slow. So the agent isn't immediately available after the live migration.
-while [ "$(lxc exec member1 -- lxc info v1 | awk '{if ($1 == "Processes:") print $2}')" -le 1 ]; do
-  sleep 5
-done
-
-# The volume should be functional, still mounted, and the file we created should still be there with the same contents.
-[ "$(lxc exec member2 -- lxc exec v1 -- cat /mnt/vol1/bar)" = "v1" ]
-
+## Start the VM.
+#lxc exec member1 -- lxc start v1
+#
+## Wait for a long time for it to boot (doubly nested VM takes a while).
+#lxc exec member1 -- sh -c 'while [ "$(lxc info v1 | awk '"'"'{if ($1 == "Processes:") print $2}'"'"')" -le 1 ]; do echo "Instance v1 still not booted, waiting 60s..." && sleep 60; done'
+#
+## vol1 should be available as /dev/sdb. Format it as ext4. Then mount it and create a file.
+#lxc exec member1 -- lxc exec v1 -- mkfs -t ext4 /dev/sdb
+#lxc exec member1 -- lxc exec v1 -- mkdir /mnt/vol1
+#lxc exec member1 -- lxc exec v1 -- mount -t ext4 /dev/sdb /mnt/vol1
+#lxc exec member1 -- lxc exec v1 -- cp /etc/hostname /mnt/vol1/bar
+#
+## Move the instance
+#lxc exec member1 -- lxc move v1 --target member2
+#
+## The VM is slow. So the agent isn't immediately available after the live migration.
+#lxc exec member1 -- sh -c 'while [ "$(lxc info v1 | awk '"'"'{if ($1 == "Processes:") print $2}'"'"')" -le 1 ]; do echo "Instance v1 still not booted, waiting 60s..." && sleep 60; done'
+#
+## The volume should be functional, still mounted, and the file we created should still be there with the same contents.
+#[ "$(lxc exec member2 -- lxc exec v1 -- cat /mnt/vol1/bar)" = "v1" ]
+#
 # shellcheck disable=SC2034
 FAIL=0
\ No newline at end of file
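
Not part of the patch: the GITHUB_ACTIONS branch added to tests/vm-migration can be exercised on its own with the minimal sketch below. It assumes an Ubuntu GitHub Actions runner whose ephemeral disk is mounted at /mnt with a swapfile at /mnt/swapfile, exactly as the hunk above does, and it only uses the commands that appear in that hunk; the shebang, the error message wording and the trailing echo are added here for illustration.

#!/bin/sh
set -eux

# Refuse to continue if /mnt is not a separate filesystem from /: in that case
# there is no dedicated ephemeral disk to hand over, and wiping it would wipe
# the rootfs.
if [ "$(stat -c '%d' /)" = "$(stat -c '%d' /mnt)" ]; then
    echo "rootfs and ephemeral partition share a filesystem, aborting" >&2
    exit 1
fi

# Drop the swapfile kept on the ephemeral partition, work out the parent disk
# of the partition mounted at /mnt, then unmount it so the whole disk can be
# attached to the ceph VM as a raw block device.
swapoff /mnt/swapfile
ephemeral_disk="$(findmnt --noheadings --output SOURCE --target /mnt | sed 's/[0-9]\+$//')"
umount /mnt

echo "ephemeral disk free to attach: ${ephemeral_disk}"

The sed expression only strips a trailing partition number, so this assumes /dev/sdXN-style device names as exposed by the current runner images; NVMe-style names such as /dev/nvme0n1p1 would need a different transformation.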